1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
55 #include "langhooks.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
84 A simple base register plus immediate offset.
87 A base register indexed by immediate offset with writeback.
90 A base register indexed by (optionally scaled) register.
93 A base register indexed by (optionally scaled) zero-extended register.
96 A base register indexed by (optionally scaled) sign-extended register.
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
102 A constant symbolic address, in pc-relative literal pool. */
/* NOTE(review): the enumerator lines of aarch64_address_type appear to be
   missing from this extraction; the comment above lists one description per
   (absent) enumerator.  Confirm against the full source.  */
104 enum aarch64_address_type
{
/* NOTE(review): other members of this struct (e.g. the base/offset rtxes)
   and the closing brace appear to be missing from this extraction.  */
114 struct aarch64_address_info
{
/* The address classification (one of the kinds described above).  */
115 enum aarch64_address_type type
;
/* Presumably the constant offset component of the address -- confirm.  */
118 poly_int64 const_offset
;
/* Symbol classification for symbolic addresses.  */
120 enum aarch64_symbol_type symbol_type
;
123 /* Information about a legitimate vector immediate operand. */
/* NOTE(review): this struct is missing interior lines in the extraction --
   the opening brace, the trailing parameter(s) of the scalar_int_mode
   constructor, the member declarations that the comments below describe
   (value, step, insn, shift) and the closing brace.  Code kept
   byte-identical.  */
124 struct simd_immediate_info
/* The instruction used to move the immediate into a vector register.  */
126 enum insn_type
{ MOV
, MVN
};
/* The kind of shift modifier that accompanies the immediate.  */
127 enum modifier_type
{ LSL
, MSL
};
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode
, rtx
);
131 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
132 insn_type
= MOV
, modifier_type
= LSL
,
134 simd_immediate_info (scalar_mode
, rtx
, rtx
);
136 /* The mode of the elements. */
137 scalar_mode elt_mode
;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
143 /* The value of the step if the constant is a series, null otherwise. */
146 /* The instruction to use to move the immediate into a vector. */
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier
;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
/* STEP, INSN, MODIFIER and SHIFT are given their neutral defaults
   (NULL_RTX, MOV, LSL, 0).
   NOTE(review): the empty "{}" body line is missing from this extraction;
   code kept byte-identical.  */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
159 : elt_mode (elt_mode_in
), value (value_in
), step (NULL_RTX
), insn (MOV
),
160 modifier (LSL
), shift (0)
/* NOTE(review): the closing line of the comment below and the empty "{}"
   body are missing from this extraction; code kept byte-identical.  The
   integer VALUE_IN is widened to an rtx via gen_int_mode in ELT_MODE_IN.  */
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
168 unsigned HOST_WIDE_INT value_in
,
169 insn_type insn_in
, modifier_type modifier_in
,
170 unsigned int shift_in
)
171 : elt_mode (elt_mode_in
), value (gen_int_mode (value_in
, elt_mode_in
)),
172 step (NULL_RTX
), insn (insn_in
), modifier (modifier_in
), shift (shift_in
)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
/* A series constant: STEP is non-null here, while INSN, MODIFIER and SHIFT
   take their neutral defaults (MOV, LSL, 0).
   NOTE(review): the empty "{}" body line is missing from this extraction;
   code kept byte-identical.  */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx value_in
, rtx step_in
)
179 : elt_mode (elt_mode_in
), value (value_in
), step (step_in
), insn (MOV
),
180 modifier (LSL
), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel
;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg
;
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
194 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
197 machine_mode
*, int *,
199 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
200 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode
);
203 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
208 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode
, rtx
);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version
;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune
= cortexa53
;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags
= 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads
;
223 /* Support for command line parsing of boolean flags in the tuning
225 struct aarch64_flag_desc
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
235 { "none", AARCH64_FUSE_NOTHING
},
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL
},
238 { NULL
, AARCH64_FUSE_NOTHING
}
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE
},
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL
},
248 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table
=
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
285 static const struct cpu_addrcost_table xgene1_addrcost_table
=
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
317 static const struct cpu_regmove_cost generic_regmove_cost
=
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actual, 4 and 9). */
357 static const struct cpu_regmove_cost thunderx_regmove_cost
=
365 static const struct cpu_regmove_cost xgene1_regmove_cost
=
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
378 /* Avoid the use of int<->fp moves for spilling. */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
387 /* Avoid the use of int<->fp moves for spilling. */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost
=
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost
=
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost
=
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost
=
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost
=
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost
=
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes
=
522 AARCH64_APPROX_NONE
, /* division */
523 AARCH64_APPROX_NONE
, /* sqrt */
524 AARCH64_APPROX_NONE
/* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes
=
530 AARCH64_APPROX_NONE
, /* division */
531 AARCH64_APPROX_ALL
, /* sqrt */
532 AARCH64_APPROX_ALL
/* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes
=
538 AARCH64_APPROX_NONE
, /* division */
539 AARCH64_APPROX_NONE
, /* sqrt */
540 AARCH64_APPROX_ALL
/* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune
=
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 512, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune
=
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings
=
600 &cortexa57_extra_costs
,
601 &generic_addrcost_table
,
602 &generic_regmove_cost
,
603 &generic_vector_cost
,
604 &generic_branch_cost
,
605 &generic_approx_modes
,
608 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
609 8, /* function_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings
=
625 &cortexa53_extra_costs
,
626 &generic_addrcost_table
,
627 &cortexa53_regmove_cost
,
628 &generic_vector_cost
,
629 &generic_branch_cost
,
630 &generic_approx_modes
,
633 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
635 16, /* function_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings
=
651 &cortexa53_extra_costs
,
652 &generic_addrcost_table
,
653 &cortexa53_regmove_cost
,
654 &generic_vector_cost
,
655 &generic_branch_cost
,
656 &generic_approx_modes
,
659 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
661 16, /* function_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings
=
677 &cortexa57_extra_costs
,
678 &generic_addrcost_table
,
679 &cortexa57_regmove_cost
,
680 &cortexa57_vector_cost
,
681 &generic_branch_cost
,
682 &generic_approx_modes
,
685 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
687 16, /* function_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings
=
703 &cortexa57_extra_costs
,
704 &generic_addrcost_table
,
705 &cortexa57_regmove_cost
,
706 &cortexa57_vector_cost
,
707 &generic_branch_cost
,
708 &generic_approx_modes
,
711 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
713 16, /* function_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings
=
729 &cortexa57_extra_costs
,
730 &generic_addrcost_table
,
731 &cortexa57_regmove_cost
,
732 &cortexa57_vector_cost
,
733 &generic_branch_cost
,
734 &generic_approx_modes
,
735 4, /* memmov_cost. */
737 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
739 16, /* function_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings
=
757 &exynosm1_extra_costs
,
758 &exynosm1_addrcost_table
,
759 &exynosm1_regmove_cost
,
760 &exynosm1_vector_cost
,
761 &generic_branch_cost
,
762 &exynosm1_approx_modes
,
765 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
766 4, /* function_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings
=
782 &thunderx_extra_costs
,
783 &generic_addrcost_table
,
784 &thunderx_regmove_cost
,
785 &thunderx_vector_cost
,
786 &generic_branch_cost
,
787 &generic_approx_modes
,
790 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
791 8, /* function_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings
=
807 &thunderx_extra_costs
,
808 &generic_addrcost_table
,
809 &thunderx_regmove_cost
,
810 &thunderx_vector_cost
,
811 &generic_branch_cost
,
812 &generic_approx_modes
,
815 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
816 8, /* function_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings
=
834 &xgene1_addrcost_table
,
835 &xgene1_regmove_cost
,
837 &generic_branch_cost
,
838 &xgene1_approx_modes
,
841 AARCH64_FUSE_NOTHING
, /* fusible_ops */
842 16, /* function_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings
=
858 &qdf24xx_extra_costs
,
859 &generic_addrcost_table
,
860 &qdf24xx_regmove_cost
,
861 &generic_vector_cost
,
862 &generic_branch_cost
,
863 &generic_approx_modes
,
866 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
868 16, /* function_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
884 static const struct tune_params saphira_tunings
=
886 &generic_extra_costs
,
887 &generic_addrcost_table
,
888 &generic_regmove_cost
,
889 &generic_vector_cost
,
890 &generic_branch_cost
,
891 &generic_approx_modes
,
894 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
896 16, /* function_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings
=
912 &thunderx2t99_extra_costs
,
913 &thunderx2t99_addrcost_table
,
914 &thunderx2t99_regmove_cost
,
915 &thunderx2t99_vector_cost
,
916 &generic_branch_cost
,
917 &generic_approx_modes
,
918 4, /* memmov_cost. */
920 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
922 16, /* function_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
940 void (*parse_override
)(const char*, struct tune_params
*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions
[] =
949 { "fuse", aarch64_parse_fuse_string
},
950 { "tune", aarch64_parse_tune_string
},
954 /* A processor implementing AArch64. */
957 const char *const name
;
958 enum aarch64_processor ident
;
959 enum aarch64_processor sched_core
;
960 enum aarch64_arch arch
;
961 unsigned architecture_version
;
962 const unsigned long flags
;
963 const struct tune_params
*const tune
;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures
[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores
[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
984 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
985 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor
*selected_arch
;
992 static const struct processor
*selected_cpu
;
993 static const struct processor
*selected_tune
;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params
= generic_tunings
;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name
;
1004 const unsigned long flags_on
;
1005 const unsigned long flags_off
;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1011 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1012 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes
[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
/* Emits BRANCH_FORMAT targeting a fresh internal label, then an
   unconditional "b" to the original destination followed by the internal
   label's definition.  OPERANDS[POS_LABEL] is temporarily redirected to
   the new label and restored before the second emission.
   NOTE(review): the return type, braces and the declaration of BUFFER are
   missing from this extraction; code kept byte-identical.  */
1027 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1028 const char * branch_format
)
1030 rtx_code_label
* tmp_label
= gen_label_rtx ();
1031 char label_buf
[256];
/* Build the assembler name of the new internal label from DEST and the
   label's CODE_LABEL number.  */
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1034 CODE_LABEL_NUMBER (tmp_label
));
1035 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
/* Save the real destination and point the operand at the local label.  */
1036 rtx dest_label
= operands
[pos_label
];
1037 operands
[pos_label
] = tmp_label
;
1039 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1040 output_asm_insn (buffer
, operands
);
/* Unconditional branch to the original target, then define the label.  */
1042 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1043 operands
[pos_label
] = dest_label
;
1044 output_asm_insn (buffer
, operands
);
/* Report that MSG requires FP/Advanced-SIMD support the current target
   lacks; MC selects "floating-point" vs "vector" wording based on MODE.
   The diagnostic blames -mgeneral-regs-only when that option is active,
   otherwise the +nofp feature modifier.
   NOTE(review): the return type, braces and any else-keyword line are
   missing from this extraction; code kept byte-identical.  */
1049 aarch64_err_no_fpadvsimd (machine_mode mode
, const char *msg
)
1051 const char *mc
= FLOAT_MODE_P (mode
) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY
)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc
, msg
);
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc
, msg
);
/* NOTE(review): interior lines are missing from this extraction -- the
   closing marker of the comment below, the function's return type and
   braces, the local declaration of MODE, and what appears to be the
   "return best_class;" arm of the second test.  Code kept byte-identical.  */
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespectively of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1074 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1075 reg_class_t best_class
)
1079 if (allocno_class
!= ALL_REGS
)
1080 return allocno_class
;
1082 if (best_class
!= ALL_REGS
)
1085 mode
= PSEUDO_REGNO_MODE (regno
);
1086 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
/* Presumably implements the TARGET_MIN_DIVISIONS_FOR_RECIP_MUL hook --
   confirm against the full source.  Picks the tuning threshold by element
   size: 4-byte units use the single-float parameter, everything else the
   double-float one.
   NOTE(review): return type and braces are missing from this extraction.  */
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1092 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1093 return aarch64_tune_params
.min_div_recip_mul_sf
;
1094 return aarch64_tune_params
.min_div_recip_mul_df
;
1097 /* Return the reassociation width of treeop OPC with mode MODE. */
/* Width is looked up in the active tuning structure per mode class.
   NOTE(review): the return type, braces and the final fall-through return
   line are missing from this extraction; code kept byte-identical.  */
1099 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1101 if (VECTOR_MODE_P (mode
))
1102 return aarch64_tune_params
.vec_reassoc_width
;
1103 if (INTEGRAL_MODE_P (mode
))
1104 return aarch64_tune_params
.int_reassoc_width
;
1105 /* Avoid reassociating floating point addition so we emit more FMAs. */
1106 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1107 return aarch64_tune_params
.fp_reassoc_width
;
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
/* Each register class maps onto its own contiguous DWARF number range:
   general registers from AARCH64_DWARF_R0, vector registers from
   AARCH64_DWARF_V0, SVE predicates from AARCH64_DWARF_P0, with SP and the
   SVE vector-granule register VG as singletons.
   NOTE(review): return type and braces are missing from this extraction.  */
1113 aarch64_dbx_register_number (unsigned regno
)
1115 if (GP_REGNUM_P (regno
))
1116 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1117 else if (regno
== SP_REGNUM
)
1118 return AARCH64_DWARF_SP
;
1119 else if (FP_REGNUM_P (regno
))
1120 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1121 else if (PR_REGNUM_P (regno
))
1122 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1123 else if (regno
== VG_REGNUM
)
1124 return AARCH64_DWARF_VG
;
1126 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127 equivalent DWARF register. */
1128 return DWARF_FRAME_REGISTERS
;
1131 /* Return true if MODE is any of the Advanced SIMD structure modes. */
/* OImode/CImode/XImode are the 2/3/4-vector structure modes.
   NOTE(review): the return type, braces and the leading operand of the
   "&&" (likely a TARGET_SIMD-style guard) are missing from this
   extraction; code kept byte-identical.  */
1133 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1136 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1139 /* Return true if MODE is an SVE predicate mode. */
/* One VNxNBI mode per supported predicate element width (16/8/4/2 lanes).
   NOTE(review): the return type, braces and the leading operand of the
   "&&" (likely a TARGET_SVE guard) are missing from this extraction;
   code kept byte-identical.  */
1141 aarch64_sve_pred_mode_p (machine_mode mode
)
1144 && (mode
== VNx16BImode
1145 || mode
== VNx8BImode
1146 || mode
== VNx4BImode
1147 || mode
== VNx2BImode
));
/* The flags below are distinct powers of two so that they compose as a
   bitmask (see the VEC_ANY_* combinations at the end).  */
1150 /* Three mutually-exclusive flags describing a vector or predicate type. */
1151 const unsigned int VEC_ADVSIMD
= 1;
1152 const unsigned int VEC_SVE_DATA
= 2;
1153 const unsigned int VEC_SVE_PRED
= 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155 a structure of 2, 3 or 4 vectors. */
1156 const unsigned int VEC_STRUCT
= 8;
1157 /* Useful combinations of the above. */
1158 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1159 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162 Ignore modes that are not supported by the current target. */
/* Advanced SIMD structure modes and SVE predicate modes are recognized
   first; remaining vector modes are distinguished by total bit size --
   exactly one SVE vector, an SVE structure of 2-4 vectors, or (per the
   trailing fragment) 64/128-bit vectors.
   NOTE(review): several interior lines are missing from this extraction
   (return type, braces, most of the VECTOR_MODE_P condition listing the
   inner element modes, and what appear to be the final return
   statements); code kept byte-identical.  */
1164 aarch64_classify_vector_mode (machine_mode mode
)
1166 if (aarch64_advsimd_struct_mode_p (mode
))
1167 return VEC_ADVSIMD
| VEC_STRUCT
;
1169 if (aarch64_sve_pred_mode_p (mode
))
1170 return VEC_SVE_PRED
;
1172 scalar_mode inner
= GET_MODE_INNER (mode
);
1173 if (VECTOR_MODE_P (mode
)
1180 || inner
== DFmode
))
1184 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
))
1185 return VEC_SVE_DATA
;
1186 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 2)
1187 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 3)
1188 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 4))
1189 return VEC_SVE_DATA
| VEC_STRUCT
;
1192 /* This includes V1DF but not V1DI (which doesn't exist). */
1194 && (known_eq (GET_MODE_BITSIZE (mode
), 64)
1195 || known_eq (GET_MODE_BITSIZE (mode
), 128)))
/* NOTE(review): the continuation/closing of the comment below, the return
   type and the braces are missing from this extraction; code kept
   byte-identical.  Tests the classification bits against VEC_ANY_DATA
   (Advanced SIMD or SVE data).  */
1202 /* Return true if MODE is any of the data vector modes, including
1205 aarch64_vector_data_mode_p (machine_mode mode
)
1207 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211 or a structure of vectors. */
/* Tests the VEC_SVE_DATA bit of the classification.
   NOTE(review): return type and braces are missing from this extraction.  */
1213 aarch64_sve_data_mode_p (machine_mode mode
)
1215 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1218 /* Implement target hook TARGET_ARRAY_MODE. */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1222 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1223 && IN_RANGE (nelems
, 2, 4))
1224 return mode_for_vector (GET_MODE_INNER (mode
),
1225 GET_MODE_NUNITS (mode
) * nelems
);
1227 return opt_machine_mode ();
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1232 aarch64_array_mode_supported_p (machine_mode mode
,
1233 unsigned HOST_WIDE_INT nelems
)
1236 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1237 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1238 && (nelems
>= 2 && nelems
<= 4))
1244 /* Return the SVE predicate mode to use for elements that have
1245 ELEM_NBYTES bytes, if such a mode exists. */
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1252 if (elem_nbytes
== 1)
1254 if (elem_nbytes
== 2)
1256 if (elem_nbytes
== 4)
1258 if (elem_nbytes
== 8)
1261 return opt_machine_mode ();
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1269 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1271 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1272 machine_mode pred_mode
;
1273 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1277 return default_get_mask_mode (nunits
, nbytes
);
1280 /* Implement TARGET_HARD_REGNO_NREGS. */
1283 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1285 /* ??? Logically we should only need to provide a value when
1286 HARD_REGNO_MODE_OK says that the combination is valid,
1287 but at the moment we need to handle all modes. Just ignore
1288 any runtime parts for registers that can't store them. */
1289 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1290 switch (aarch64_regno_regclass (regno
))
1294 if (aarch64_sve_data_mode_p (mode
))
1295 return exact_div (GET_MODE_SIZE (mode
),
1296 BYTES_PER_SVE_VECTOR
).to_constant ();
1297 return CEIL (lowest_size
, UNITS_PER_VREG
);
1303 return CEIL (lowest_size
, UNITS_PER_WORD
);
1308 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1311 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1313 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1314 return regno
== CC_REGNUM
;
1316 if (regno
== VG_REGNUM
)
1317 /* This must have the same size as _Unwind_Word. */
1318 return mode
== DImode
;
1320 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1321 if (vec_flags
& VEC_SVE_PRED
)
1322 return PR_REGNUM_P (regno
);
1324 if (PR_REGNUM_P (regno
))
1327 if (regno
== SP_REGNUM
)
1328 /* The purpose of comparing with ptr_mode is to support the
1329 global register variable associated with the stack pointer
1330 register via the syntax of asm ("wsp") in ILP32. */
1331 return mode
== Pmode
|| mode
== ptr_mode
;
1333 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1334 return mode
== Pmode
;
1336 if (GP_REGNUM_P (regno
) && known_le (GET_MODE_SIZE (mode
), 16))
1339 if (FP_REGNUM_P (regno
))
1341 if (vec_flags
& VEC_STRUCT
)
1342 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1344 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1351 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1352 clobbers the top 64 bits when restoring the bottom 64 bits. */
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno
, machine_mode mode
)
1357 return FP_REGNUM_P (regno
) && maybe_gt (GET_MODE_SIZE (mode
), 8);
1360 /* Implement REGMODE_NATURAL_SIZE. */
1362 aarch64_regmode_natural_size (machine_mode mode
)
1364 /* The natural size for SVE data modes is one SVE data vector,
1365 and similarly for predicates. We can't independently modify
1366 anything smaller than that. */
1367 /* ??? For now, only do this for variable-width SVE registers.
1368 Doing it for constant-sized registers breaks lower-subreg.c. */
1369 /* ??? And once that's fixed, we should probably have similar
1370 code for Advanced SIMD. */
1371 if (!aarch64_sve_vg
.is_constant ())
1373 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1374 if (vec_flags
& VEC_SVE_PRED
)
1375 return BYTES_PER_SVE_PRED
;
1376 if (vec_flags
& VEC_SVE_DATA
)
1377 return BYTES_PER_SVE_VECTOR
;
1379 return UNITS_PER_WORD
;
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1384 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1387 /* The predicate mode determines which bits are significant and
1388 which are "don't care". Decreasing the number of lanes would
1389 lose data while increasing the number of lanes would make bits
1390 unnecessarily significant. */
1391 if (PR_REGNUM_P (regno
))
1393 if (known_ge (GET_MODE_SIZE (mode
), 4))
1399 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1400 that strcpy from constants will be faster. */
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1405 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1406 return MAX (align
, BITS_PER_WORD
);
1410 /* Return true if calls to DECL should be treated as
1411 long-calls (ie called via a register). */
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419 long-calls (ie called via a register). */
1421 aarch64_is_long_call_p (rtx sym
)
1423 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1426 /* Return true if calls to symbol-ref SYM should not go through
1430 aarch64_is_noplt_call_p (rtx sym
)
1432 const_tree decl
= SYMBOL_REF_DECL (sym
);
1437 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1438 && !targetm
.binds_local_p (decl
))
1444 /* Return true if the offsets to a zero/sign-extract operation
1445 represent an expression that matches an extend operation. The
1446 operands represent the paramters from
1448 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1450 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1453 HOST_WIDE_INT mult_val
, extract_val
;
1455 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1458 mult_val
= INTVAL (mult_imm
);
1459 extract_val
= INTVAL (extract_imm
);
1462 && extract_val
< GET_MODE_BITSIZE (mode
)
1463 && exact_log2 (extract_val
& ~7) > 0
1464 && (extract_val
& 7) <= 4
1465 && mult_val
== (1 << (extract_val
& 7)))
1471 /* Emit an insn that's a simple single-set. Both the operands must be
1472 known to be valid. */
1473 inline static rtx_insn
*
1474 emit_set_insn (rtx x
, rtx y
)
1476 return emit_insn (gen_rtx_SET (x
, y
));
1479 /* X and Y are two things to compare using CODE. Emit the compare insn and
1480 return the rtx for register 0 in the proper mode. */
1482 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1484 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1485 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1487 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1491 /* Build the SYMBOL_REF for __tls_get_addr. */
1493 static GTY(()) rtx tls_get_addr_libfunc
;
1496 aarch64_tls_get_addr (void)
1498 if (!tls_get_addr_libfunc
)
1499 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1500 return tls_get_addr_libfunc
;
1503 /* Return the TLS model to use for ADDR. */
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr
)
1508 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1509 if (GET_CODE (addr
) == CONST
)
1512 rtx sym
= strip_offset (addr
, &addend
);
1513 if (GET_CODE (sym
) == SYMBOL_REF
)
1514 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1516 else if (GET_CODE (addr
) == SYMBOL_REF
)
1517 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1522 /* We'll allow lo_sum's in addresses in our legitimate addresses
1523 so that combine would take care of combining addresses where
1524 necessary, but for generation purposes, we'll generate the address
1527 tmp = hi (symbol_ref); adrp x1, foo
1528 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1532 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1533 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1537 Load TLS symbol, depending on TLS mechanism and TLS access model.
1539 Global Dynamic - Traditional TLS:
1540 adrp tmp, :tlsgd:imm
1541 add dest, tmp, #:tlsgd_lo12:imm
1544 Global Dynamic - TLS Descriptors:
1545 adrp dest, :tlsdesc:imm
1546 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1547 add dest, dest, #:tlsdesc_lo12:imm
1554 adrp tmp, :gottprel:imm
1555 ldr dest, [tmp, #:gottprel_lo12:imm]
1560 add t0, tp, #:tprel_hi12:imm, lsl #12
1561 add t0, t0, #:tprel_lo12_nc:imm
1565 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1566 enum aarch64_symbol_type type
)
1570 case SYMBOL_SMALL_ABSOLUTE
:
1572 /* In ILP32, the mode of dest can be either SImode or DImode. */
1574 machine_mode mode
= GET_MODE (dest
);
1576 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1578 if (can_create_pseudo_p ())
1579 tmp_reg
= gen_reg_rtx (mode
);
1581 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1582 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1586 case SYMBOL_TINY_ABSOLUTE
:
1587 emit_insn (gen_rtx_SET (dest
, imm
));
1590 case SYMBOL_SMALL_GOT_28K
:
1592 machine_mode mode
= GET_MODE (dest
);
1593 rtx gp_rtx
= pic_offset_table_rtx
;
1597 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1598 here before rtl expand. Tree IVOPT will generate rtl pattern to
1599 decide rtx costs, in which case pic_offset_table_rtx is not
1600 initialized. For that case no need to generate the first adrp
1601 instruction as the final cost for global variable access is
1605 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1606 using the page base as GOT base, the first page may be wasted,
1607 in the worst scenario, there is only 28K space for GOT).
1609 The generate instruction sequence for accessing global variable
1612 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1614 Only one instruction needed. But we must initialize
1615 pic_offset_table_rtx properly. We generate initialize insn for
1616 every global access, and allow CSE to remove all redundant.
1618 The final instruction sequences will look like the following
1619 for multiply global variables access.
1621 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1628 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1629 crtl
->uses_pic_offset_table
= 1;
1630 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1632 if (mode
!= GET_MODE (gp_rtx
))
1633 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1637 if (mode
== ptr_mode
)
1640 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1642 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1644 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1648 gcc_assert (mode
== Pmode
);
1650 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1651 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1654 /* The operand is expected to be MEM. Whenever the related insn
1655 pattern changed, above code which calculate mem should be
1657 gcc_assert (GET_CODE (mem
) == MEM
);
1658 MEM_READONLY_P (mem
) = 1;
1659 MEM_NOTRAP_P (mem
) = 1;
1664 case SYMBOL_SMALL_GOT_4G
:
1666 /* In ILP32, the mode of dest can be either SImode or DImode,
1667 while the got entry is always of SImode size. The mode of
1668 dest depends on how dest is used: if dest is assigned to a
1669 pointer (e.g. in the memory), it has SImode; it may have
1670 DImode if dest is dereferenced to access the memeory.
1671 This is why we have to handle three different ldr_got_small
1672 patterns here (two patterns for ILP32). */
1677 machine_mode mode
= GET_MODE (dest
);
1679 if (can_create_pseudo_p ())
1680 tmp_reg
= gen_reg_rtx (mode
);
1682 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1683 if (mode
== ptr_mode
)
1686 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1688 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1690 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1694 gcc_assert (mode
== Pmode
);
1696 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1697 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1700 gcc_assert (GET_CODE (mem
) == MEM
);
1701 MEM_READONLY_P (mem
) = 1;
1702 MEM_NOTRAP_P (mem
) = 1;
1707 case SYMBOL_SMALL_TLSGD
:
1710 machine_mode mode
= GET_MODE (dest
);
1711 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1715 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1717 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1718 insns
= get_insns ();
1721 RTL_CONST_CALL_P (insns
) = 1;
1722 emit_libcall_block (insns
, dest
, result
, imm
);
1726 case SYMBOL_SMALL_TLSDESC
:
1728 machine_mode mode
= GET_MODE (dest
);
1729 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1732 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1734 /* In ILP32, the got entry is always of SImode size. Unlike
1735 small GOT, the dest is fixed at reg 0. */
1737 emit_insn (gen_tlsdesc_small_si (imm
));
1739 emit_insn (gen_tlsdesc_small_di (imm
));
1740 tp
= aarch64_load_tp (NULL
);
1743 tp
= gen_lowpart (mode
, tp
);
1745 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1747 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1751 case SYMBOL_SMALL_TLSIE
:
1753 /* In ILP32, the mode of dest can be either SImode or DImode,
1754 while the got entry is always of SImode size. The mode of
1755 dest depends on how dest is used: if dest is assigned to a
1756 pointer (e.g. in the memory), it has SImode; it may have
1757 DImode if dest is dereferenced to access the memeory.
1758 This is why we have to handle three different tlsie_small
1759 patterns here (two patterns for ILP32). */
1760 machine_mode mode
= GET_MODE (dest
);
1761 rtx tmp_reg
= gen_reg_rtx (mode
);
1762 rtx tp
= aarch64_load_tp (NULL
);
1764 if (mode
== ptr_mode
)
1767 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1770 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1771 tp
= gen_lowpart (mode
, tp
);
1776 gcc_assert (mode
== Pmode
);
1777 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1780 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1782 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1786 case SYMBOL_TLSLE12
:
1787 case SYMBOL_TLSLE24
:
1788 case SYMBOL_TLSLE32
:
1789 case SYMBOL_TLSLE48
:
1791 machine_mode mode
= GET_MODE (dest
);
1792 rtx tp
= aarch64_load_tp (NULL
);
1795 tp
= gen_lowpart (mode
, tp
);
1799 case SYMBOL_TLSLE12
:
1800 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1803 case SYMBOL_TLSLE24
:
1804 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1807 case SYMBOL_TLSLE32
:
1808 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1810 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1813 case SYMBOL_TLSLE48
:
1814 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1816 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1824 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1828 case SYMBOL_TINY_GOT
:
1829 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1832 case SYMBOL_TINY_TLSIE
:
1834 machine_mode mode
= GET_MODE (dest
);
1835 rtx tp
= aarch64_load_tp (NULL
);
1837 if (mode
== ptr_mode
)
1840 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1843 tp
= gen_lowpart (mode
, tp
);
1844 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1849 gcc_assert (mode
== Pmode
);
1850 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1854 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1863 /* Emit a move from SRC to DEST. Assume that the move expanders can
1864 handle all moves if !can_create_pseudo_p (). The distinction is
1865 important because, unlike emit_move_insn, the move expanders know
1866 how to force Pmode objects into the constant pool even when the
1867 constant pool address is not itself legitimate. */
1869 aarch64_emit_move (rtx dest
, rtx src
)
1871 return (can_create_pseudo_p ()
1872 ? emit_move_insn (dest
, src
)
1873 : emit_move_insn_1 (dest
, src
));
1876 /* Apply UNOPTAB to OP and store the result in DEST. */
1879 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
1881 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
1883 emit_move_insn (dest
, tmp
);
1886 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1889 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
1891 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
1894 emit_move_insn (dest
, tmp
);
1897 /* Split a 128-bit move operation into two 64-bit move operations,
1898 taking care to handle partial overlap of register to register
1899 copies. Special cases are needed when moving between GP regs and
1900 FP regs. SRC can be a register, constant or memory; DST a register
1901 or memory. If either operand is memory it must not have any side
1904 aarch64_split_128bit_move (rtx dst
, rtx src
)
1909 machine_mode mode
= GET_MODE (dst
);
1911 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1912 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1913 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1915 if (REG_P (dst
) && REG_P (src
))
1917 int src_regno
= REGNO (src
);
1918 int dst_regno
= REGNO (dst
);
1920 /* Handle FP <-> GP regs. */
1921 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1923 src_lo
= gen_lowpart (word_mode
, src
);
1924 src_hi
= gen_highpart (word_mode
, src
);
1928 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1929 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1933 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1934 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1938 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1940 dst_lo
= gen_lowpart (word_mode
, dst
);
1941 dst_hi
= gen_highpart (word_mode
, dst
);
1945 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1946 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1950 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1951 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1957 dst_lo
= gen_lowpart (word_mode
, dst
);
1958 dst_hi
= gen_highpart (word_mode
, dst
);
1959 src_lo
= gen_lowpart (word_mode
, src
);
1960 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1962 /* At most one pairing may overlap. */
1963 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1965 aarch64_emit_move (dst_hi
, src_hi
);
1966 aarch64_emit_move (dst_lo
, src_lo
);
1970 aarch64_emit_move (dst_lo
, src_lo
);
1971 aarch64_emit_move (dst_hi
, src_hi
);
1976 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1978 return (! REG_P (src
)
1979 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1982 /* Split a complex SIMD combine. */
1985 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1987 machine_mode src_mode
= GET_MODE (src1
);
1988 machine_mode dst_mode
= GET_MODE (dst
);
1990 gcc_assert (VECTOR_MODE_P (dst_mode
));
1991 gcc_assert (register_operand (dst
, dst_mode
)
1992 && register_operand (src1
, src_mode
)
1993 && register_operand (src2
, src_mode
));
1995 rtx (*gen
) (rtx
, rtx
, rtx
);
2000 gen
= gen_aarch64_simd_combinev8qi
;
2003 gen
= gen_aarch64_simd_combinev4hi
;
2006 gen
= gen_aarch64_simd_combinev2si
;
2009 gen
= gen_aarch64_simd_combinev4hf
;
2012 gen
= gen_aarch64_simd_combinev2sf
;
2015 gen
= gen_aarch64_simd_combinedi
;
2018 gen
= gen_aarch64_simd_combinedf
;
2024 emit_insn (gen (dst
, src1
, src2
));
2028 /* Split a complex SIMD move. */
2031 aarch64_split_simd_move (rtx dst
, rtx src
)
2033 machine_mode src_mode
= GET_MODE (src
);
2034 machine_mode dst_mode
= GET_MODE (dst
);
2036 gcc_assert (VECTOR_MODE_P (dst_mode
));
2038 if (REG_P (dst
) && REG_P (src
))
2040 rtx (*gen
) (rtx
, rtx
);
2042 gcc_assert (VECTOR_MODE_P (src_mode
));
2047 gen
= gen_aarch64_split_simd_movv16qi
;
2050 gen
= gen_aarch64_split_simd_movv8hi
;
2053 gen
= gen_aarch64_split_simd_movv4si
;
2056 gen
= gen_aarch64_split_simd_movv2di
;
2059 gen
= gen_aarch64_split_simd_movv8hf
;
2062 gen
= gen_aarch64_split_simd_movv4sf
;
2065 gen
= gen_aarch64_split_simd_movv2df
;
2071 emit_insn (gen (dst
, src
));
2077 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2078 machine_mode ymode
, rtx y
)
2080 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2081 gcc_assert (r
!= NULL
);
2082 return rtx_equal_p (x
, r
);
2087 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2089 if (can_create_pseudo_p ())
2090 return force_reg (mode
, value
);
2094 aarch64_emit_move (x
, value
);
2099 /* Return true if we can move VALUE into a register using a single
2100 CNT[BHWD] instruction. */
2103 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2105 HOST_WIDE_INT factor
= value
.coeffs
[0];
2106 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2107 return (value
.coeffs
[1] == factor
2108 && IN_RANGE (factor
, 2, 16 * 16)
2109 && (factor
& 1) == 0
2110 && factor
<= 16 * (factor
& -factor
));
2113 /* Likewise for rtx X. */
2116 aarch64_sve_cnt_immediate_p (rtx x
)
2119 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2122 /* Return the asm string for an instruction with a CNT-like vector size
2123 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2124 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2125 first part of the operands template (the part that comes before the
2126 vector size itself). FACTOR is the number of quadwords.
2127 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2128 If it is zero, we can use any element size. */
2131 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2132 unsigned int factor
,
2133 unsigned int nelts_per_vq
)
2135 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2137 if (nelts_per_vq
== 0)
2138 /* There is some overlap in the ranges of the four CNT instructions.
2139 Here we always use the smallest possible element size, so that the
2140 multiplier is 1 whereever possible. */
2141 nelts_per_vq
= factor
& -factor
;
2142 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2143 gcc_assert (IN_RANGE (shift
, 1, 4));
2144 char suffix
= "dwhb"[shift
- 1];
2147 unsigned int written
;
2149 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2150 prefix
, suffix
, operands
);
2152 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2153 prefix
, suffix
, operands
, factor
);
2154 gcc_assert (written
< sizeof (buffer
));
2158 /* Return the asm string for an instruction with a CNT-like vector size
2159 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2160 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2161 first part of the operands template (the part that comes before the
2162 vector size itself). X is the value of the vector size operand,
2163 as a polynomial integer rtx. */
2166 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2169 poly_int64 value
= rtx_to_poly_int64 (x
);
2170 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2171 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2172 value
.coeffs
[1], 0);
2175 /* Return true if we can add VALUE to a register using a single ADDVL
2176 or ADDPL instruction. */
2179 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
2181 HOST_WIDE_INT factor
= value
.coeffs
[0];
2182 if (factor
== 0 || value
.coeffs
[1] != factor
)
2184 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2185 and a value of 16 is one vector width. */
2186 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
2187 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
2190 /* Likewise for rtx X. */
2193 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
2196 return (poly_int_rtx_p (x
, &value
)
2197 && aarch64_sve_addvl_addpl_immediate_p (value
));
2200 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2201 and storing the result in operand 0. */
2204 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2206 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2207 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2208 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2210 /* Use INC or DEC if possible. */
2211 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2213 if (aarch64_sve_cnt_immediate_p (offset_value
))
2214 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2215 offset_value
.coeffs
[1], 0);
2216 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2217 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2218 -offset_value
.coeffs
[1], 0);
2221 int factor
= offset_value
.coeffs
[1];
2222 if ((factor
& 15) == 0)
2223 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2225 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
2229 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2230 instruction. If it is, store the number of elements in each vector
2231 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2232 factor in *FACTOR_OUT (if nonnull). */
2235 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2236 unsigned int *nelts_per_vq_out
)
2241 if (!const_vec_duplicate_p (x
, &elt
)
2242 || !poly_int_rtx_p (elt
, &value
))
2245 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2246 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2247 /* There's no vector INCB. */
2250 HOST_WIDE_INT factor
= value
.coeffs
[0];
2251 if (value
.coeffs
[1] != factor
)
2254 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2255 if ((factor
% nelts_per_vq
) != 0
2256 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
2260 *factor_out
= factor
;
2261 if (nelts_per_vq_out
)
2262 *nelts_per_vq_out
= nelts_per_vq
;
2266 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2270 aarch64_sve_inc_dec_immediate_p (rtx x
)
2272 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
2275 /* Return the asm template for an SVE vector INC or DEC instruction.
2276 OPERANDS gives the operands before the vector count and X is the
2277 value of the vector count operand itself. */
2280 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2283 unsigned int nelts_per_vq
;
2284 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2287 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2290 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2295 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2296 scalar_int_mode mode
)
2299 unsigned HOST_WIDE_INT val
, val2
, mask
;
2300 int one_match
, zero_match
;
2305 if (aarch64_move_imm (val
, mode
))
2308 emit_insn (gen_rtx_SET (dest
, imm
));
2312 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2313 (with XXXX non-zero). In that case check to see if the move can be done in
2315 val2
= val
& 0xffffffff;
2317 && aarch64_move_imm (val2
, SImode
)
2318 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2321 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2323 /* Check if we have to emit a second instruction by checking to see
2324 if any of the upper 32 bits of the original DI mode value is set. */
2328 i
= (val
>> 48) ? 48 : 32;
2331 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2332 GEN_INT ((val
>> i
) & 0xffff)));
2337 if ((val
>> 32) == 0 || mode
== SImode
)
2341 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2343 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2344 GEN_INT ((val
>> 16) & 0xffff)));
2346 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2347 GEN_INT ((val
>> 16) & 0xffff)));
2352 /* Remaining cases are all for DImode. */
2355 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2356 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2357 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2358 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2360 if (zero_match
!= 2 && one_match
!= 2)
2362 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2363 For a 64-bit bitmask try whether changing 16 bits to all ones or
2364 zeroes creates a valid bitmask. To check any repeated bitmask,
2365 try using 16 bits from the other 32-bit half of val. */
2367 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2370 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2373 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2375 val2
= val2
& ~mask
;
2376 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2377 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2384 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2385 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2386 GEN_INT ((val
>> i
) & 0xffff)));
2392 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2393 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2394 otherwise skip zero bits. */
2398 val2
= one_match
> zero_match
? ~val
: val
;
2399 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2402 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2403 ? (val
| ~(mask
<< i
))
2404 : (val
& (mask
<< i
)))));
2405 for (i
+= 16; i
< 64; i
+= 16)
2407 if ((val2
& (mask
<< i
)) == 0)
2410 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2411 GEN_INT ((val
>> i
) & 0xffff)));
2418 /* Return whether imm is a 128-bit immediate which is simple enough to
2421 aarch64_mov128_immediate (rtx imm
)
2423 if (GET_CODE (imm
) == CONST_INT
)
2426 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
2428 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
2429 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
2431 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
2432 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
2436 /* Return the number of temporary registers that aarch64_add_offset_1
2437 would need to add OFFSET to a register. */
2440 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
2442 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
2445 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2446 a non-polynomial OFFSET. MODE is the mode of the addition.
2447 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2448 be set and CFA adjustments added to the generated instructions.
2450 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2451 temporary if register allocation is already complete. This temporary
2452 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2453 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2454 the immediate again.
2456 Since this function may be used to adjust the stack pointer, we must
2457 ensure that it cannot cause transient stack deallocation (for example
2458 by first incrementing SP and then decrementing when adjusting by a
2459 large immediate). */
2462 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2463 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2464 bool frame_related_p
, bool emit_move_imm
)
2466 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2467 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2469 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2474 if (!rtx_equal_p (dest
, src
))
2476 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2477 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2482 /* Single instruction adjustment. */
2483 if (aarch64_uimm12_shift (moffset
))
2485 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2486 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2490 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2493 a) the offset cannot be loaded by a 16-bit move or
2494 b) there is no spare register into which we can move it. */
2495 if (moffset
< 0x1000000
2496 && ((!temp1
&& !can_create_pseudo_p ())
2497 || !aarch64_move_imm (moffset
, mode
)))
2499 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2501 low_off
= offset
< 0 ? -low_off
: low_off
;
2502 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2503 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2504 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2505 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2509 /* Emit a move immediate if required and an addition/subtraction. */
2512 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2513 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2515 insn
= emit_insn (offset
< 0
2516 ? gen_sub3_insn (dest
, src
, temp1
)
2517 : gen_add3_insn (dest
, src
, temp1
));
2518 if (frame_related_p
)
2520 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2521 rtx adj
= plus_constant (mode
, src
, offset
);
2522 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
2526 /* Return the number of temporary registers that aarch64_add_offset
2527 would need to move OFFSET into a register or add OFFSET to a register;
2528 ADD_P is true if we want the latter rather than the former. */
2531 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2533 /* This follows the same structure as aarch64_add_offset. */
2534 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2537 unsigned int count
= 0;
2538 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2539 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2540 poly_int64
poly_offset (factor
, factor
);
2541 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2542 /* Need one register for the ADDVL/ADDPL result. */
2544 else if (factor
!= 0)
2546 factor
= abs (factor
);
2547 if (factor
> 16 * (factor
& -factor
))
2548 /* Need one register for the CNT result and one for the multiplication
2549 factor. If necessary, the second temporary can be reused for the
2550 constant part of the offset. */
2552 /* Need one register for the CNT result (which might then
2556 return count
+ aarch64_add_offset_1_temporaries (constant
);
2559 /* If X can be represented as a poly_int64, return the number
2560 of temporaries that are required to add it to a register.
2561 Return -1 otherwise. */
2564 aarch64_add_offset_temporaries (rtx x
)
2567 if (!poly_int_rtx_p (x
, &offset
))
2569 return aarch64_offset_temporaries (true, offset
);
2572 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2573 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2574 be set and CFA adjustments added to the generated instructions.
2576 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2577 temporary if register allocation is already complete. This temporary
2578 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2579 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2580 false to avoid emitting the immediate again.
2582 TEMP2, if nonnull, is a second temporary register that doesn't
2583 overlap either DEST or REG.
2585 Since this function may be used to adjust the stack pointer, we must
2586 ensure that it cannot cause transient stack deallocation (for example
2587 by first incrementing SP and then decrementing when adjusting by a
2588 large immediate). */
2591 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2592 poly_int64 offset
, rtx temp1
, rtx temp2
,
2593 bool frame_related_p
, bool emit_move_imm
= true)
2595 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2596 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2597 gcc_assert (temp1
== NULL_RTX
2599 || !reg_overlap_mentioned_p (temp1
, dest
));
2600 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2602 /* Try using ADDVL or ADDPL to add the whole value. */
2603 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2605 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2606 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2607 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2611 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2612 SVE vector register, over and above the minimum size of 128 bits.
2613 This is equivalent to half the value returned by CNTD with a
2614 vector shape of ALL. */
2615 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2616 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2618 /* Try using ADDVL or ADDPL to add the VG-based part. */
2619 poly_int64
poly_offset (factor
, factor
);
2620 if (src
!= const0_rtx
2621 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2623 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2624 if (frame_related_p
)
2626 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2627 RTX_FRAME_RELATED_P (insn
) = true;
2632 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2633 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2638 /* Otherwise use a CNT-based sequence. */
2639 else if (factor
!= 0)
2641 /* Use a subtraction if we have a negative factor. */
2642 rtx_code code
= PLUS
;
2649 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2650 into the multiplication. */
2654 /* Use a right shift by 1. */
2658 HOST_WIDE_INT low_bit
= factor
& -factor
;
2659 if (factor
<= 16 * low_bit
)
2661 if (factor
> 16 * 8)
2663 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2664 the value with the minimum multiplier and shift it into
2666 int extra_shift
= exact_log2 (low_bit
);
2667 shift
+= extra_shift
;
2668 factor
>>= extra_shift
;
2670 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2674 /* Use CNTD, then multiply it by FACTOR. */
2675 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2676 val
= aarch64_force_temporary (mode
, temp1
, val
);
2678 /* Go back to using a negative multiplication factor if we have
2679 no register from which to subtract. */
2680 if (code
== MINUS
&& src
== const0_rtx
)
2685 rtx coeff1
= gen_int_mode (factor
, mode
);
2686 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2687 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2692 /* Multiply by 1 << SHIFT. */
2693 val
= aarch64_force_temporary (mode
, temp1
, val
);
2694 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
2696 else if (shift
== -1)
2699 val
= aarch64_force_temporary (mode
, temp1
, val
);
2700 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
2703 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2704 if (src
!= const0_rtx
)
2706 val
= aarch64_force_temporary (mode
, temp1
, val
);
2707 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
2709 else if (code
== MINUS
)
2711 val
= aarch64_force_temporary (mode
, temp1
, val
);
2712 val
= gen_rtx_NEG (mode
, val
);
2715 if (constant
== 0 || frame_related_p
)
2717 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
2718 if (frame_related_p
)
2720 RTX_FRAME_RELATED_P (insn
) = true;
2721 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2722 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
2731 src
= aarch64_force_temporary (mode
, temp1
, val
);
2736 emit_move_imm
= true;
2739 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
2740 frame_related_p
, emit_move_imm
);
2743 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2744 than a poly_int64. */
2747 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2748 rtx offset_rtx
, rtx temp1
, rtx temp2
)
2750 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
2751 temp1
, temp2
, false);
2754 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2755 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2756 if TEMP1 already contains abs (DELTA). */
2759 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
2761 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
2762 temp1
, temp2
, true, emit_move_imm
);
2765 /* Subtract DELTA from the stack pointer, marking the instructions
2766 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2770 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
)
2772 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
2773 temp1
, temp2
, frame_related_p
);
2776 /* Set DEST to (vec_series BASE STEP). */
2779 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
2781 machine_mode mode
= GET_MODE (dest
);
2782 scalar_mode inner
= GET_MODE_INNER (mode
);
2784 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2785 if (!aarch64_sve_index_immediate_p (base
))
2786 base
= force_reg (inner
, base
);
2787 if (!aarch64_sve_index_immediate_p (step
))
2788 step
= force_reg (inner
, step
);
2790 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
2793 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2794 integer of mode INT_MODE. Return true on success. */
2797 aarch64_expand_sve_widened_duplicate (rtx dest
, scalar_int_mode src_mode
,
2800 /* If the constant is smaller than 128 bits, we can do the move
2801 using a vector of SRC_MODEs. */
2802 if (src_mode
!= TImode
)
2804 poly_uint64 count
= exact_div (GET_MODE_SIZE (GET_MODE (dest
)),
2805 GET_MODE_SIZE (src_mode
));
2806 machine_mode dup_mode
= mode_for_vector (src_mode
, count
).require ();
2807 emit_move_insn (gen_lowpart (dup_mode
, dest
),
2808 gen_const_vec_duplicate (dup_mode
, src
));
2812 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2813 src
= force_const_mem (src_mode
, src
);
2817 /* Make sure that the address is legitimate. */
2818 if (!aarch64_sve_ld1r_operand_p (src
))
2820 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
2821 src
= replace_equiv_address (src
, addr
);
2824 machine_mode mode
= GET_MODE (dest
);
2825 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
2826 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
2827 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
2828 src
= gen_rtx_UNSPEC (mode
, gen_rtvec (2, ptrue
, src
), UNSPEC_LD1RQ
);
2829 emit_insn (gen_rtx_SET (dest
, src
));
2833 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2834 isn't a simple duplicate or series. */
2837 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
2839 machine_mode mode
= GET_MODE (src
);
2840 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
2841 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
2842 gcc_assert (npatterns
> 1);
2844 if (nelts_per_pattern
== 1)
2846 /* The constant is a repeating seqeuence of at least two elements,
2847 where the repeating elements occupy no more than 128 bits.
2848 Get an integer representation of the replicated value. */
2849 scalar_int_mode int_mode
;
2850 if (BYTES_BIG_ENDIAN
)
2851 /* For now, always use LD1RQ to load the value on big-endian
2852 targets, since the handling of smaller integers includes a
2853 subreg that is semantically an element reverse. */
2857 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
2858 gcc_assert (int_bits
<= 128);
2859 int_mode
= int_mode_for_size (int_bits
, 0).require ();
2861 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
2863 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
2867 /* Expand each pattern individually. */
2868 rtx_vector_builder builder
;
2869 auto_vec
<rtx
, 16> vectors (npatterns
);
2870 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2872 builder
.new_vector (mode
, 1, nelts_per_pattern
);
2873 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
2874 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
2875 vectors
.quick_push (force_reg (mode
, builder
.build ()));
2878 /* Use permutes to interleave the separate vectors. */
2879 while (npatterns
> 1)
2882 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2884 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
2885 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
2886 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
2890 gcc_assert (vectors
[0] == dest
);
2893 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2894 is a pattern that can be used to set DEST to a replicated scalar
2898 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
2899 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
2901 machine_mode mode
= GET_MODE (dest
);
2903 /* Check on what type of symbol it is. */
2904 scalar_int_mode int_mode
;
2905 if ((GET_CODE (imm
) == SYMBOL_REF
2906 || GET_CODE (imm
) == LABEL_REF
2907 || GET_CODE (imm
) == CONST
2908 || GET_CODE (imm
) == CONST_POLY_INT
)
2909 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
2913 HOST_WIDE_INT const_offset
;
2914 enum aarch64_symbol_type sty
;
2916 /* If we have (const (plus symbol offset)), separate out the offset
2917 before we start classifying the symbol. */
2918 rtx base
= strip_offset (imm
, &offset
);
2920 /* We must always add an offset involving VL separately, rather than
2921 folding it into the relocation. */
2922 if (!offset
.is_constant (&const_offset
))
2924 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
2925 emit_insn (gen_rtx_SET (dest
, imm
));
2928 /* Do arithmetic on 32-bit values if the result is smaller
2930 if (partial_subreg_p (int_mode
, SImode
))
2932 /* It is invalid to do symbol calculations in modes
2933 narrower than SImode. */
2934 gcc_assert (base
== const0_rtx
);
2935 dest
= gen_lowpart (SImode
, dest
);
2938 if (base
!= const0_rtx
)
2940 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2941 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2942 NULL_RTX
, NULL_RTX
, false);
2945 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2946 dest
, NULL_RTX
, false);
2951 sty
= aarch64_classify_symbol (base
, const_offset
);
2954 case SYMBOL_FORCE_TO_MEM
:
2955 if (const_offset
!= 0
2956 && targetm
.cannot_force_const_mem (int_mode
, imm
))
2958 gcc_assert (can_create_pseudo_p ());
2959 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2960 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
2961 NULL_RTX
, NULL_RTX
, false);
2965 mem
= force_const_mem (ptr_mode
, imm
);
2968 /* If we aren't generating PC relative literals, then
2969 we need to expand the literal pool access carefully.
2970 This is something that needs to be done in a number
2971 of places, so could well live as a separate function. */
2972 if (!aarch64_pcrelative_literal_loads
)
2974 gcc_assert (can_create_pseudo_p ());
2975 base
= gen_reg_rtx (ptr_mode
);
2976 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
2977 if (ptr_mode
!= Pmode
)
2978 base
= convert_memory_address (Pmode
, base
);
2979 mem
= gen_rtx_MEM (ptr_mode
, base
);
2982 if (int_mode
!= ptr_mode
)
2983 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
2985 emit_insn (gen_rtx_SET (dest
, mem
));
2989 case SYMBOL_SMALL_TLSGD
:
2990 case SYMBOL_SMALL_TLSDESC
:
2991 case SYMBOL_SMALL_TLSIE
:
2992 case SYMBOL_SMALL_GOT_28K
:
2993 case SYMBOL_SMALL_GOT_4G
:
2994 case SYMBOL_TINY_GOT
:
2995 case SYMBOL_TINY_TLSIE
:
2996 if (const_offset
!= 0)
2998 gcc_assert(can_create_pseudo_p ());
2999 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3000 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3001 NULL_RTX
, NULL_RTX
, false);
3006 case SYMBOL_SMALL_ABSOLUTE
:
3007 case SYMBOL_TINY_ABSOLUTE
:
3008 case SYMBOL_TLSLE12
:
3009 case SYMBOL_TLSLE24
:
3010 case SYMBOL_TLSLE32
:
3011 case SYMBOL_TLSLE48
:
3012 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3020 if (!CONST_INT_P (imm
))
3022 rtx base
, step
, value
;
3023 if (GET_CODE (imm
) == HIGH
3024 || aarch64_simd_valid_immediate (imm
, NULL
))
3025 emit_insn (gen_rtx_SET (dest
, imm
));
3026 else if (const_vec_series_p (imm
, &base
, &step
))
3027 aarch64_expand_vec_series (dest
, base
, step
);
3028 else if (const_vec_duplicate_p (imm
, &value
))
3030 /* If the constant is out of range of an SVE vector move,
3031 load it from memory if we can, otherwise move it into
3032 a register and use a DUP. */
3033 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3034 rtx op
= force_const_mem (inner_mode
, value
);
3036 op
= force_reg (inner_mode
, value
);
3037 else if (!aarch64_sve_ld1r_operand_p (op
))
3039 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3040 op
= replace_equiv_address (op
, addr
);
3042 emit_insn (gen_vec_duplicate (dest
, op
));
3044 else if (GET_CODE (imm
) == CONST_VECTOR
3045 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3046 aarch64_expand_sve_const_vector (dest
, imm
);
3049 rtx mem
= force_const_mem (mode
, imm
);
3051 emit_move_insn (dest
, mem
);
3057 aarch64_internal_mov_immediate (dest
, imm
, true,
3058 as_a
<scalar_int_mode
> (mode
));
3061 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3062 that is known to contain PTRUE. */
3065 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3067 emit_insn (gen_rtx_SET (dest
, gen_rtx_UNSPEC (GET_MODE (dest
),
3068 gen_rtvec (2, pred
, src
),
3069 UNSPEC_MERGE_PTRUE
)));
3072 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3073 operand is in memory. In this case we need to use the predicated LD1
3074 and ST1 instead of LDR and STR, both for correctness on big-endian
3075 targets and because LD1 and ST1 support a wider range of addressing modes.
3076 PRED_MODE is the mode of the predicate.
3078 See the comment at the head of aarch64-sve.md for details about the
3079 big-endian handling. */
3082 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3084 machine_mode mode
= GET_MODE (dest
);
3085 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3086 if (!register_operand (src
, mode
)
3087 && !register_operand (dest
, mode
))
3089 rtx tmp
= gen_reg_rtx (mode
);
3091 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3093 emit_move_insn (tmp
, src
);
3096 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3099 /* Called only on big-endian targets. See whether an SVE vector move
3100 from SRC to DEST is effectively a REV[BHW] instruction, because at
3101 least one operand is a subreg of an SVE vector that has wider or
3102 narrower elements. Return true and emit the instruction if so.
3106 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3108 represents a VIEW_CONVERT between the following vectors, viewed
3111 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3112 R1: { [0], [1], [2], [3], ... }
3114 The high part of lane X in R2 should therefore correspond to lane X*2
3115 of R1, but the register representations are:
3118 R2: ...... [1].high [1].low [0].high [0].low
3119 R1: ...... [3] [2] [1] [0]
3121 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3122 We therefore need a reverse operation to swap the high and low values
3125 This is purely an optimization. Without it we would spill the
3126 subreg operand to the stack in one mode and reload it in the
3127 other mode, which has the same effect as the REV. */
3130 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3132 gcc_assert (BYTES_BIG_ENDIAN
);
3133 if (GET_CODE (dest
) == SUBREG
)
3134 dest
= SUBREG_REG (dest
);
3135 if (GET_CODE (src
) == SUBREG
)
3136 src
= SUBREG_REG (src
);
3138 /* The optimization handles two single SVE REGs with different element
3142 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3143 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3144 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3145 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3148 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3149 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3150 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3152 emit_insn (gen_rtx_SET (dest
, unspec
));
3156 /* Return a copy of X with mode MODE, without changing its other
3157 attributes. Unlike gen_lowpart, this doesn't care whether the
3158 mode change is valid. */
3161 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3163 if (GET_MODE (x
) == mode
)
3166 x
= shallow_copy_rtx (x
);
3167 set_mode_and_regno (x
, mode
, REGNO (x
));
3171 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3175 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3177 /* Decide which REV operation we need. The mode with narrower elements
3178 determines the mode of the operands and the mode with the wider
3179 elements determines the reverse width. */
3180 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3181 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3182 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3183 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3184 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3186 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3187 unsigned int unspec
;
3188 if (wider_bytes
== 8)
3189 unspec
= UNSPEC_REV64
;
3190 else if (wider_bytes
== 4)
3191 unspec
= UNSPEC_REV32
;
3192 else if (wider_bytes
== 2)
3193 unspec
= UNSPEC_REV16
;
3196 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3200 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3201 UNSPEC_MERGE_PTRUE))
3203 with the appropriate modes. */
3204 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3205 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3206 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3207 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3208 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3209 UNSPEC_MERGE_PTRUE
);
3210 emit_insn (gen_rtx_SET (dest
, src
));
3214 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3215 tree exp ATTRIBUTE_UNUSED
)
3217 /* Currently, always true. */
3221 /* Implement TARGET_PASS_BY_REFERENCE. */
3224 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3227 bool named ATTRIBUTE_UNUSED
)
3230 machine_mode dummymode
;
3233 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3234 if (mode
== BLKmode
&& type
)
3235 size
= int_size_in_bytes (type
);
3237 /* No frontends can create types with variable-sized modes, so we
3238 shouldn't be asked to pass or return them. */
3239 size
= GET_MODE_SIZE (mode
).to_constant ();
3241 /* Aggregates are passed by reference based on their size. */
3242 if (type
&& AGGREGATE_TYPE_P (type
))
3244 size
= int_size_in_bytes (type
);
3247 /* Variable sized arguments are always returned by reference. */
3251 /* Can this be a candidate to be passed in fp/simd register(s)? */
3252 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3257 /* Arguments which are variable sized or larger than 2 registers are
3258 passed by reference unless they are a homogenous floating point
3260 return size
> 2 * UNITS_PER_WORD
;
3263 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3265 aarch64_return_in_msb (const_tree valtype
)
3267 machine_mode dummy_mode
;
3270 /* Never happens in little-endian mode. */
3271 if (!BYTES_BIG_ENDIAN
)
3274 /* Only composite types smaller than or equal to 16 bytes can
3275 be potentially returned in registers. */
3276 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3277 || int_size_in_bytes (valtype
) <= 0
3278 || int_size_in_bytes (valtype
) > 16)
3281 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3282 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3283 is always passed/returned in the least significant bits of fp/simd
3285 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3286 &dummy_mode
, &dummy_int
, NULL
))
3292 /* Implement TARGET_FUNCTION_VALUE.
3293 Define how to find the value returned by a function. */
3296 aarch64_function_value (const_tree type
, const_tree func
,
3297 bool outgoing ATTRIBUTE_UNUSED
)
3302 machine_mode ag_mode
;
3304 mode
= TYPE_MODE (type
);
3305 if (INTEGRAL_TYPE_P (type
))
3306 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3308 if (aarch64_return_in_msb (type
))
3310 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3312 if (size
% UNITS_PER_WORD
!= 0)
3314 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3315 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3319 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3320 &ag_mode
, &count
, NULL
))
3322 if (!aarch64_composite_type_p (type
, mode
))
3324 gcc_assert (count
== 1 && mode
== ag_mode
);
3325 return gen_rtx_REG (mode
, V0_REGNUM
);
3332 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3333 for (i
= 0; i
< count
; i
++)
3335 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3336 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3337 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3338 XVECEXP (par
, 0, i
) = tmp
;
3344 return gen_rtx_REG (mode
, R0_REGNUM
);
3347 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3348 Return true if REGNO is the number of a hard register in which the values
3349 of called function may come back. */
3352 aarch64_function_value_regno_p (const unsigned int regno
)
3354 /* Maximum of 16 bytes can be returned in the general registers. Examples
3355 of 16-byte return values are: 128-bit integers and 16-byte small
3356 structures (excluding homogeneous floating-point aggregates). */
3357 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3360 /* Up to four fp/simd registers can return a function value, e.g. a
3361 homogeneous floating-point aggregate having four members. */
3362 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3363 return TARGET_FLOAT
;
3368 /* Implement TARGET_RETURN_IN_MEMORY.
3370 If the type T of the result of a function is such that
3372 would require that arg be passed as a value in a register (or set of
3373 registers) according to the parameter passing rules, then the result
3374 is returned in the same registers as would be used for such an
3378 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3381 machine_mode ag_mode
;
3384 if (!AGGREGATE_TYPE_P (type
)
3385 && TREE_CODE (type
) != COMPLEX_TYPE
3386 && TREE_CODE (type
) != VECTOR_TYPE
)
3387 /* Simple scalar types always returned in registers. */
3390 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3397 /* Types larger than 2 registers returned in memory. */
3398 size
= int_size_in_bytes (type
);
3399 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3403 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3404 const_tree type
, int *nregs
)
3406 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3407 return aarch64_vfp_is_call_or_return_candidate (mode
,
3409 &pcum
->aapcs_vfp_rmode
,
3414 /* Given MODE and TYPE of a function argument, return the alignment in
3415 bits. The idea is to suppress any stronger alignment requested by
3416 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3417 This is a helper function for local use only. */
3420 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
3423 return GET_MODE_ALIGNMENT (mode
);
3425 if (integer_zerop (TYPE_SIZE (type
)))
3428 gcc_assert (TYPE_MODE (type
) == mode
);
3430 if (!AGGREGATE_TYPE_P (type
))
3431 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3433 if (TREE_CODE (type
) == ARRAY_TYPE
)
3434 return TYPE_ALIGN (TREE_TYPE (type
));
3436 unsigned int alignment
= 0;
3437 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3438 if (TREE_CODE (field
) == FIELD_DECL
)
3439 alignment
= std::max (alignment
, DECL_ALIGN (field
));
3444 /* Layout a function argument according to the AAPCS64 rules. The rule
3445 numbers refer to the rule numbers in the AAPCS64. */
3448 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3450 bool named ATTRIBUTE_UNUSED
)
3452 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3453 int ncrn
, nvrn
, nregs
;
3454 bool allocate_ncrn
, allocate_nvrn
;
3457 /* We need to do this once per argument. */
3458 if (pcum
->aapcs_arg_processed
)
3461 pcum
->aapcs_arg_processed
= true;
3463 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3465 size
= int_size_in_bytes (type
);
3467 /* No frontends can create types with variable-sized modes, so we
3468 shouldn't be asked to pass or return them. */
3469 size
= GET_MODE_SIZE (mode
).to_constant ();
3470 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3472 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3473 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3478 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3479 The following code thus handles passing by SIMD/FP registers first. */
3481 nvrn
= pcum
->aapcs_nvrn
;
3483 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3484 and homogenous short-vector aggregates (HVA). */
3488 aarch64_err_no_fpadvsimd (mode
, "argument");
3490 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3492 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3493 if (!aarch64_composite_type_p (type
, mode
))
3495 gcc_assert (nregs
== 1);
3496 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3502 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3503 for (i
= 0; i
< nregs
; i
++)
3505 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3506 V0_REGNUM
+ nvrn
+ i
);
3507 rtx offset
= gen_int_mode
3508 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3509 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3510 XVECEXP (par
, 0, i
) = tmp
;
3512 pcum
->aapcs_reg
= par
;
3518 /* C.3 NSRN is set to 8. */
3519 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3524 ncrn
= pcum
->aapcs_ncrn
;
3525 nregs
= size
/ UNITS_PER_WORD
;
3527 /* C6 - C9. though the sign and zero extension semantics are
3528 handled elsewhere. This is the case where the argument fits
3529 entirely general registers. */
3530 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3533 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3535 /* C.8 if the argument has an alignment of 16 then the NGRN is
3536 rounded up to the next even number. */
3539 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3540 comparison is there because for > 16 * BITS_PER_UNIT
3541 alignment nregs should be > 2 and therefore it should be
3542 passed by reference rather than value. */
3543 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3546 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3549 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3550 A reg is still generated for it, but the caller should be smart
3551 enough not to use it. */
3552 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3553 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3559 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3560 for (i
= 0; i
< nregs
; i
++)
3562 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3563 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3564 GEN_INT (i
* UNITS_PER_WORD
));
3565 XVECEXP (par
, 0, i
) = tmp
;
3567 pcum
->aapcs_reg
= par
;
3570 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3575 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3577 /* The argument is passed on stack; record the needed number of words for
3578 this argument and align the total size if necessary. */
3580 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3582 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3583 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3584 16 / UNITS_PER_WORD
);
3588 /* Implement TARGET_FUNCTION_ARG. */
3591 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3592 const_tree type
, bool named
)
3594 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3595 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3597 if (mode
== VOIDmode
)
3600 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3601 return pcum
->aapcs_reg
;
3605 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3606 const_tree fntype ATTRIBUTE_UNUSED
,
3607 rtx libname ATTRIBUTE_UNUSED
,
3608 const_tree fndecl ATTRIBUTE_UNUSED
,
3609 unsigned n_named ATTRIBUTE_UNUSED
)
3611 pcum
->aapcs_ncrn
= 0;
3612 pcum
->aapcs_nvrn
= 0;
3613 pcum
->aapcs_nextncrn
= 0;
3614 pcum
->aapcs_nextnvrn
= 0;
3615 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3616 pcum
->aapcs_reg
= NULL_RTX
;
3617 pcum
->aapcs_arg_processed
= false;
3618 pcum
->aapcs_stack_words
= 0;
3619 pcum
->aapcs_stack_size
= 0;
3622 && fndecl
&& TREE_PUBLIC (fndecl
)
3623 && fntype
&& fntype
!= error_mark_node
)
3625 const_tree type
= TREE_TYPE (fntype
);
3626 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3627 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3628 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3629 &mode
, &nregs
, NULL
))
3630 aarch64_err_no_fpadvsimd (TYPE_MODE (type
), "return type");
3636 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3641 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3642 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3644 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3645 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3646 != (pcum
->aapcs_stack_words
!= 0));
3647 pcum
->aapcs_arg_processed
= false;
3648 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3649 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3650 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3651 pcum
->aapcs_stack_words
= 0;
3652 pcum
->aapcs_reg
= NULL_RTX
;
3657 aarch64_function_arg_regno_p (unsigned regno
)
3659 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
3660 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
3663 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3664 PARM_BOUNDARY bits of alignment, but will be given anything up
3665 to STACK_BOUNDARY bits if the type requires it. This makes sure
3666 that both before and after the layout of each argument, the Next
3667 Stacked Argument Address (NSAA) will have a minimum alignment of
3671 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
3673 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
3674 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
3677 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3679 static fixed_size_mode
3680 aarch64_get_reg_raw_mode (int regno
)
3682 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
3683 /* Don't use the SVE part of the register for __builtin_apply and
3684 __builtin_return. The SVE registers aren't used by the normal PCS,
3685 so using them there would be a waste of time. The PCS extensions
3686 for SVE types are fundamentally incompatible with the
3687 __builtin_return/__builtin_apply interface. */
3688 return as_a
<fixed_size_mode
> (V16QImode
);
3689 return default_get_reg_raw_mode (regno
);
3692 /* Implement TARGET_FUNCTION_ARG_PADDING.
3694 Small aggregate types are placed in the lowest memory address.
3696 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3698 static pad_direction
3699 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
3701 /* On little-endian targets, the least significant byte of every stack
3702 argument is passed at the lowest byte address of the stack slot. */
3703 if (!BYTES_BIG_ENDIAN
)
3706 /* Otherwise, integral, floating-point and pointer types are padded downward:
3707 the least significant byte of a stack argument is passed at the highest
3708 byte address of the stack slot. */
3710 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
3711 || POINTER_TYPE_P (type
))
3712 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
3713 return PAD_DOWNWARD
;
3715 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3719 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3721 It specifies padding for the last (may also be the only)
3722 element of a block move between registers and memory. If
3723 assuming the block is in the memory, padding upward means that
3724 the last element is padded after its highest significant byte,
3725 while in downward padding, the last element is padded at the
3726 its least significant byte side.
3728 Small aggregates and small complex types are always padded
3731 We don't need to worry about homogeneous floating-point or
3732 short-vector aggregates; their move is not affected by the
3733 padding direction determined here. Regardless of endianness,
3734 each element of such an aggregate is put in the least
3735 significant bits of a fp/simd register.
3737 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3738 register has useful data, and return the opposite if the most
3739 significant byte does. */
3742 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
3743 bool first ATTRIBUTE_UNUSED
)
3746 /* Small composite types are always padded upward. */
3747 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
3751 size
= int_size_in_bytes (type
);
3753 /* No frontends can create types with variable-sized modes, so we
3754 shouldn't be asked to pass or return them. */
3755 size
= GET_MODE_SIZE (mode
).to_constant ();
3756 if (size
< 2 * UNITS_PER_WORD
)
3760 /* Otherwise, use the default padding. */
3761 return !BYTES_BIG_ENDIAN
;
3764 static scalar_int_mode
3765 aarch64_libgcc_cmp_return_mode (void)
3770 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3772 /* We use the 12-bit shifted immediate arithmetic instructions so values
3773 must be multiple of (1 << 12), i.e. 4096. */
3774 #define ARITH_FACTOR 4096
3776 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3777 #error Cannot use simple address calculation for stack probing
3780 /* The pair of scratch registers used for stack probing. */
3781 #define PROBE_STACK_FIRST_REG 9
3782 #define PROBE_STACK_SECOND_REG 10
3784 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3785 inclusive. These are offsets from the current stack pointer. */
3788 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
3791 if (!poly_size
.is_constant (&size
))
3793 sorry ("stack probes for SVE frames");
3797 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
3799 /* See the same assertion on PROBE_INTERVAL above. */
3800 gcc_assert ((first
% ARITH_FACTOR
) == 0);
3802 /* See if we have a constant small number of probes to generate. If so,
3803 that's the easy case. */
3804 if (size
<= PROBE_INTERVAL
)
3806 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
3808 emit_set_insn (reg1
,
3809 plus_constant (Pmode
,
3810 stack_pointer_rtx
, -(first
+ base
)));
3811 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
3814 /* The run-time loop is made up of 8 insns in the generic case while the
3815 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3816 else if (size
<= 4 * PROBE_INTERVAL
)
3818 HOST_WIDE_INT i
, rem
;
3820 emit_set_insn (reg1
,
3821 plus_constant (Pmode
,
3823 -(first
+ PROBE_INTERVAL
)));
3824 emit_stack_probe (reg1
);
3826 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3827 it exceeds SIZE. If only two probes are needed, this will not
3828 generate any code. Then probe at FIRST + SIZE. */
3829 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
3831 emit_set_insn (reg1
,
3832 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
3833 emit_stack_probe (reg1
);
3836 rem
= size
- (i
- PROBE_INTERVAL
);
3839 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3841 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
3842 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
3845 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
3848 /* Otherwise, do the same as above, but in a loop. Note that we must be
3849 extra careful with variables wrapping around because we might be at
3850 the very top (or the very bottom) of the address space and we have
3851 to be able to handle this case properly; in particular, we use an
3852 equality test for the loop condition. */
3855 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
3857 /* Step 1: round SIZE to the previous multiple of the interval. */
3859 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
3862 /* Step 2: compute initial and final value of the loop counter. */
3864 /* TEST_ADDR = SP + FIRST. */
3865 emit_set_insn (reg1
,
3866 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
3868 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3869 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
3870 if (! aarch64_uimm12_shift (adjustment
))
3872 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
3874 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
3877 emit_set_insn (reg2
,
3878 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
3884 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3887 while (TEST_ADDR != LAST_ADDR)
3889 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3890 until it is equal to ROUNDED_SIZE. */
3892 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
3895 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3896 that SIZE is equal to ROUNDED_SIZE. */
3898 if (size
!= rounded_size
)
3900 HOST_WIDE_INT rem
= size
- rounded_size
;
3904 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3906 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
3907 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
3910 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
3914 /* Make sure nothing is scheduled before we are done. */
3915 emit_insn (gen_blockage ());
3918 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3919 absolute addresses. */
3922 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
3924 static int labelno
= 0;
3928 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
3931 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
3933 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3935 xops
[1] = GEN_INT (PROBE_INTERVAL
);
3936 output_asm_insn ("sub\t%0, %0, %1", xops
);
3938 /* Probe at TEST_ADDR. */
3939 output_asm_insn ("str\txzr, [%0]", xops
);
3941 /* Test if TEST_ADDR == LAST_ADDR. */
3943 output_asm_insn ("cmp\t%0, %1", xops
);
3946 fputs ("\tb.ne\t", asm_out_file
);
3947 assemble_name_raw (asm_out_file
, loop_lab
);
3948 fputc ('\n', asm_out_file
);
3953 /* Mark the registers that need to be saved by the callee and calculate
3954 the size of the callee-saved registers area and frame record (both FP
3955 and LR may be omitted). */
3957 aarch64_layout_frame (void)
3959 HOST_WIDE_INT offset
= 0;
3960 int regno
, last_fp_reg
= INVALID_REGNUM
;
3962 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
3965 /* Force a frame chain for EH returns so the return address is at FP+8. */
3966 cfun
->machine
->frame
.emit_frame_chain
3967 = frame_pointer_needed
|| crtl
->calls_eh_return
;
3969 /* Emit a frame chain if the frame pointer is enabled.
3970 If -momit-leaf-frame-pointer is used, do not use a frame chain
3971 in leaf functions which do not use LR. */
3972 if (flag_omit_frame_pointer
== 2
3973 && !(flag_omit_leaf_frame_pointer
&& crtl
->is_leaf
3974 && !df_regs_ever_live_p (LR_REGNUM
)))
3975 cfun
->machine
->frame
.emit_frame_chain
= true;
3977 #define SLOT_NOT_REQUIRED (-2)
3978 #define SLOT_REQUIRED (-1)
3980 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
3981 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
3983 /* First mark all the registers that really need to be saved... */
3984 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
3985 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
3987 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
3988 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
3990 /* ... that includes the eh data registers (if needed)... */
3991 if (crtl
->calls_eh_return
)
3992 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
3993 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
3996 /* ... and any callee saved register that dataflow says is live. */
3997 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
3998 if (df_regs_ever_live_p (regno
)
3999 && (regno
== R30_REGNUM
4000 || !call_used_regs
[regno
]))
4001 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4003 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4004 if (df_regs_ever_live_p (regno
)
4005 && !call_used_regs
[regno
])
4007 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4008 last_fp_reg
= regno
;
4011 if (cfun
->machine
->frame
.emit_frame_chain
)
4013 /* FP and LR are placed in the linkage record. */
4014 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
4015 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
4016 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
4017 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
4018 offset
= 2 * UNITS_PER_WORD
;
4021 /* Now assign stack slots for them. */
4022 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4023 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4025 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4026 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4027 cfun
->machine
->frame
.wb_candidate1
= regno
;
4028 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
4029 cfun
->machine
->frame
.wb_candidate2
= regno
;
4030 offset
+= UNITS_PER_WORD
;
4033 HOST_WIDE_INT max_int_offset
= offset
;
4034 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4035 bool has_align_gap
= offset
!= max_int_offset
;
4037 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4038 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4040 /* If there is an alignment gap between integer and fp callee-saves,
4041 allocate the last fp register to it if possible. */
4042 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
4044 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
4048 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4049 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4050 cfun
->machine
->frame
.wb_candidate1
= regno
;
4051 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
4052 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
4053 cfun
->machine
->frame
.wb_candidate2
= regno
;
4054 offset
+= UNITS_PER_WORD
;
4057 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4059 cfun
->machine
->frame
.saved_regs_size
= offset
;
4061 HOST_WIDE_INT varargs_and_saved_regs_size
4062 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
4064 cfun
->machine
->frame
.hard_fp_offset
4065 = aligned_upper_bound (varargs_and_saved_regs_size
4066 + get_frame_size (),
4067 STACK_BOUNDARY
/ BITS_PER_UNIT
);
4069 /* Both these values are already aligned. */
4070 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
4071 STACK_BOUNDARY
/ BITS_PER_UNIT
));
4072 cfun
->machine
->frame
.frame_size
4073 = (cfun
->machine
->frame
.hard_fp_offset
4074 + crtl
->outgoing_args_size
);
4076 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
4078 cfun
->machine
->frame
.initial_adjust
= 0;
4079 cfun
->machine
->frame
.final_adjust
= 0;
4080 cfun
->machine
->frame
.callee_adjust
= 0;
4081 cfun
->machine
->frame
.callee_offset
= 0;
4083 HOST_WIDE_INT max_push_offset
= 0;
4084 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
4085 max_push_offset
= 512;
4086 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
4087 max_push_offset
= 256;
4089 HOST_WIDE_INT const_size
, const_fp_offset
;
4090 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
4091 && const_size
< max_push_offset
4092 && known_eq (crtl
->outgoing_args_size
, 0))
4094 /* Simple, small frame with no outgoing arguments:
4095 stp reg1, reg2, [sp, -frame_size]!
4096 stp reg3, reg4, [sp, 16] */
4097 cfun
->machine
->frame
.callee_adjust
= const_size
;
4099 else if (known_lt (crtl
->outgoing_args_size
4100 + cfun
->machine
->frame
.saved_regs_size
, 512)
4101 && !(cfun
->calls_alloca
4102 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
4105 /* Frame with small outgoing arguments:
4106 sub sp, sp, frame_size
4107 stp reg1, reg2, [sp, outgoing_args_size]
4108 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4109 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
4110 cfun
->machine
->frame
.callee_offset
4111 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
4113 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
4114 && const_fp_offset
< max_push_offset
)
4116 /* Frame with large outgoing arguments but a small local area:
4117 stp reg1, reg2, [sp, -hard_fp_offset]!
4118 stp reg3, reg4, [sp, 16]
4119 sub sp, sp, outgoing_args_size */
4120 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
4121 cfun
->machine
->frame
.final_adjust
4122 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
4126 /* Frame with large local area and outgoing arguments using frame pointer:
4127 sub sp, sp, hard_fp_offset
4128 stp x29, x30, [sp, 0]
4130 stp reg3, reg4, [sp, 16]
4131 sub sp, sp, outgoing_args_size */
4132 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
4133 cfun
->machine
->frame
.final_adjust
4134 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
4137 cfun
->machine
->frame
.laid_out
= true;
4140 /* Return true if the register REGNO is saved on entry to
4141 the current function. */
4144 aarch64_register_saved_on_entry (int regno
)
4146 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  Returns LIMIT + 1 if no further saved register exists.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
4160 /* Push the register number REGNO of mode MODE to the stack with write-back
4161 adjusting the stack by ADJUSTMENT. */
4164 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
4165 HOST_WIDE_INT adjustment
)
4167 rtx base_rtx
= stack_pointer_rtx
;
4170 reg
= gen_rtx_REG (mode
, regno
);
4171 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
4172 plus_constant (Pmode
, base_rtx
, -adjustment
));
4173 mem
= gen_frame_mem (mode
, mem
);
4175 insn
= emit_move_insn (mem
, reg
);
4176 RTX_FRAME_RELATED_P (insn
) = 1;
4179 /* Generate and return an instruction to store the pair of registers
4180 REG and REG2 of mode MODE to location BASE with write-back adjusting
4181 the stack location BASE by ADJUSTMENT. */
4184 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4185 HOST_WIDE_INT adjustment
)
4190 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
4191 GEN_INT (-adjustment
),
4192 GEN_INT (UNITS_PER_WORD
- adjustment
));
4194 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
4195 GEN_INT (-adjustment
),
4196 GEN_INT (UNITS_PER_WORD
- adjustment
));
4202 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4203 stack pointer by ADJUSTMENT. */
4206 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
4209 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4211 if (regno2
== INVALID_REGNUM
)
4212 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
4214 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4215 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4217 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
4219 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
4220 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4221 RTX_FRAME_RELATED_P (insn
) = 1;
4224 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4225 adjusting it by ADJUSTMENT afterwards. */
4228 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4229 HOST_WIDE_INT adjustment
)
4234 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4235 GEN_INT (UNITS_PER_WORD
));
4237 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4238 GEN_INT (UNITS_PER_WORD
));
4244 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4245 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4249 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
4252 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4253 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4255 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
4257 if (regno2
== INVALID_REGNUM
)
4259 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
4260 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
4261 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
4265 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4266 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4267 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
4272 /* Generate and return a store pair instruction of mode MODE to store
4273 register REG1 to MEM1 and register REG2 to MEM2. */
4276 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
4282 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
4285 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
4292 /* Generate and regurn a load pair isntruction of mode MODE to load register
4293 REG1 from MEM1 and register REG2 from MEM2. */
4296 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
4302 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
4305 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
4312 /* Return TRUE if return address signing should be enabled for the current
4313 function, otherwise return FALSE. */
4316 aarch64_return_address_signing_enabled (void)
4318 /* This function should only be called after frame laid out. */
4319 gcc_assert (cfun
->machine
->frame
.laid_out
);
4321 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4322 if it's LR is pushed onto stack. */
4323 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
4324 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
4325 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
4328 /* Emit code to save the callee-saved registers from register number START
4329 to LIMIT to the stack at the location starting at offset START_OFFSET,
4330 skipping any write-back candidates if SKIP_WB is true. */
4333 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
4334 unsigned start
, unsigned limit
, bool skip_wb
)
4340 for (regno
= aarch64_next_callee_save (start
, limit
);
4342 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4348 && (regno
== cfun
->machine
->frame
.wb_candidate1
4349 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4352 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4355 reg
= gen_rtx_REG (mode
, regno
);
4356 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4357 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4360 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4363 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4364 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4365 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4368 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4371 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4372 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4374 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
4377 /* The first part of a frame-related parallel insn is
4378 always assumed to be relevant to the frame
4379 calculations; subsequent parts, are only
4380 frame-related if explicitly marked. */
4381 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4385 insn
= emit_move_insn (mem
, reg
);
4387 RTX_FRAME_RELATED_P (insn
) = 1;
4391 /* Emit code to restore the callee registers of mode MODE from register
4392 number START up to and including LIMIT. Restore from the stack offset
4393 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4394 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4397 aarch64_restore_callee_saves (machine_mode mode
,
4398 poly_int64 start_offset
, unsigned start
,
4399 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
4401 rtx base_rtx
= stack_pointer_rtx
;
4406 for (regno
= aarch64_next_callee_save (start
, limit
);
4408 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4410 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4416 && (regno
== cfun
->machine
->frame
.wb_candidate1
4417 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4420 reg
= gen_rtx_REG (mode
, regno
);
4421 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4422 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4424 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4427 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4428 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4429 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4431 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4434 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4435 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4436 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4438 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4442 emit_move_insn (reg
, mem
);
4443 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
4447 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4451 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4453 HOST_WIDE_INT multiple
;
4454 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4455 && IN_RANGE (multiple
, -8, 7));
4458 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4462 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
4464 HOST_WIDE_INT multiple
;
4465 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4466 && IN_RANGE (multiple
, 0, 63));
4469 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4473 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4475 HOST_WIDE_INT multiple
;
4476 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4477 && IN_RANGE (multiple
, -64, 63));
4480 /* Return true if OFFSET is a signed 9-bit value. */
4483 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
4486 HOST_WIDE_INT const_offset
;
4487 return (offset
.is_constant (&const_offset
)
4488 && IN_RANGE (const_offset
, -256, 255));
4491 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4495 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4497 HOST_WIDE_INT multiple
;
4498 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4499 && IN_RANGE (multiple
, -256, 255));
4502 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4506 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
4508 HOST_WIDE_INT multiple
;
4509 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4510 && IN_RANGE (multiple
, 0, 4095));
4513 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4516 aarch64_get_separate_components (void)
4518 aarch64_layout_frame ();
4520 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4521 bitmap_clear (components
);
4523 /* The registers we need saved to the frame. */
4524 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4525 if (aarch64_register_saved_on_entry (regno
))
4527 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4528 if (!frame_pointer_needed
)
4529 offset
+= cfun
->machine
->frame
.frame_size
4530 - cfun
->machine
->frame
.hard_fp_offset
;
4531 /* Check that we can access the stack slot of the register with one
4532 direct load with no adjustments needed. */
4533 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
4534 bitmap_set_bit (components
, regno
);
4537 /* Don't mess with the hard frame pointer. */
4538 if (frame_pointer_needed
)
4539 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
4541 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4542 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4543 /* If aarch64_layout_frame has chosen registers to store/restore with
4544 writeback don't interfere with them to avoid having to output explicit
4545 stack adjustment instructions. */
4546 if (reg2
!= INVALID_REGNUM
)
4547 bitmap_clear_bit (components
, reg2
);
4548 if (reg1
!= INVALID_REGNUM
)
4549 bitmap_clear_bit (components
, reg1
);
4551 bitmap_clear_bit (components
, LR_REGNUM
);
4552 bitmap_clear_bit (components
, SP_REGNUM
);
4557 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4560 aarch64_components_for_bb (basic_block bb
)
4562 bitmap in
= DF_LIVE_IN (bb
);
4563 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
4564 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
4566 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4567 bitmap_clear (components
);
4569 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4570 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4571 if ((!call_used_regs
[regno
])
4572 && (bitmap_bit_p (in
, regno
)
4573 || bitmap_bit_p (gen
, regno
)
4574 || bitmap_bit_p (kill
, regno
)))
4576 unsigned regno2
, offset
, offset2
;
4577 bitmap_set_bit (components
, regno
);
4579 /* If there is a callee-save at an adjacent offset, add it too
4580 to increase the use of LDP/STP. */
4581 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4582 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
4584 if (regno2
<= LAST_SAVED_REGNUM
)
4586 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
4587 if ((offset
& ~8) == (offset2
& ~8))
4588 bitmap_set_bit (components
, regno2
);
4595 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4596 Nothing to do for aarch64. */
4599 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
4603 /* Return the next set bit in BMP from START onwards. Return the total number
4604 of bits in BMP if no set bit is found at or after START. */
4607 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
4609 unsigned int nbits
= SBITMAP_SIZE (bmp
);
4613 gcc_assert (start
< nbits
);
4614 for (unsigned int i
= start
; i
< nbits
; i
++)
4615 if (bitmap_bit_p (bmp
, i
))
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  /* Save slots are addressed off the frame pointer when one is in use,
     otherwise off the stack pointer (with the frame-size correction
     applied to each offset below).  */
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
	 so DFmode for the vector registers is enough.  */
      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (!frame_pointer_needed)
	/* Rebase the FP-relative slot offset onto the stack pointer.  */
	offset += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      /* Store for the prologue, load for the epilogue.  */
      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);

	  regno = regno2;
	  continue;
	}

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
	offset2 += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      /* Use an STP in the prologue, an LDP in the epilogue.  */
      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
	{
	  add_reg_note (insn, REG_CFA_OFFSET, set);
	  add_reg_note (insn, REG_CFA_OFFSET, set2);
	}
      else
	{
	  add_reg_note (insn, REG_CFA_RESTORE, reg);
	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.
   Emit the save sequence (stores + REG_CFA_OFFSET notes) for the
   separately-wrapped registers in COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}
/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.
   Emit the restore sequence (loads + REG_CFA_RESTORE notes) for the
   separately-wrapped registers in COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}
/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.
   Record in the per-function machine state which registers are being
   saved/restored by the separate shrink-wrapping pass, so the normal
   prologue/epilogue code skips them.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  Used for variable-sized frames, where
   save slots must be described relative to the frame pointer with
   DW_CFA_expression rather than DW_CFA_offset.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION,
		gen_rtx_SET (mem, regno_reg_rtx[reg]));
}
4756 /* AArch64 stack frames generated by this compiler look like:
4758 +-------------------------------+
4760 | incoming stack arguments |
4762 +-------------------------------+
4763 | | <-- incoming stack pointer (aligned)
4764 | callee-allocated save area |
4765 | for register varargs |
4767 +-------------------------------+
4768 | local variables | <-- frame_pointer_rtx
4770 +-------------------------------+
4772 +-------------------------------+ |
4773 | callee-saved registers | | frame.saved_regs_size
4774 +-------------------------------+ |
4776 +-------------------------------+ |
4777 | FP' | / <- hard_frame_pointer_rtx (aligned)
4778 +-------------------------------+
4779 | dynamic allocation |
4780 +-------------------------------+
4782 +-------------------------------+
4783 | outgoing stack arguments | <-- arg_pointer
4785 +-------------------------------+
4786 | | <-- stack_pointer_rtx (aligned)
4788 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4789 but leave frame_pointer_rtx and hard_frame_pointer_rtx
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  aarch64_layout_frame ();

  /* The frame layout computed above splits the total adjustment into:
     initial_adjust, an optional callee_adjust (writeback push of the
     wb_candidate pair), and final_adjust (outgoing args area).  */
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      insn = emit_insn (gen_pacisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  /* Leaf functions only need probing if the frame is larger
	     than both the probe interval and the guard area.  */
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);

  aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
	{
	  /* FP and LR were not pushed with writeback; store them
	     explicitly at their frame slots.  */
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  reg_offset = callee_offset;
	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
	}
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, callee_offset,
			  ip1_rtx, ip0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       callee_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  reg_offset -= callee_offset;
	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
				      reg_offset + UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
				      reg_offset);
	}
      /* Keep scheduling from moving frame accesses above the FP setup.  */
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
}
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are need.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  aarch64_layout_frame ();

  /* A zero-size frame needs no prologue/epilogue at all.  */
  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prolog sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */

void
aarch64_expand_epilogue (bool for_sibcall)
{
  aarch64_layout_frame ();

  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* CFI notes are accumulated here and attached in one go, see below.  */
  rtx cfi_ops = NULL;
  rtx_insn *insn;

  /* A stack clash protection prologue may not have left IP0_REGNUM or
     IP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ()
			&& !flag_stack_clash_protection);

  /* We need to add memory barrier to prevent read from deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx, -callee_offset,
			ip1_rtx, ip0_rtx, callee_adjust == 0);
  else
    aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
		    !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
		  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	*/
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      insn = emit_insn (gen_autisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */

rtx
aarch64_eh_return_handler_rtx (void)
{
  /* LR is saved at [FP + UNITS_PER_WORD] in the forced frame record.  */
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.
   VCALL_OFFSET, when nonzero, is an additional adjustment loaded
   through the object's vtable.  */

static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer maybe bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;

  /* Pretend to be a post-reload pass while generating rtl.  */
  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  /* A small DELTA can be folded into the vtable load below as a
	     pre-modify address; otherwise add it explicitly first.  */
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, false);
	}

      /* Load the vtable pointer, zero-extending for ILP32.  */
      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
/* Return true if X contains a SYMBOL_REF with a nonzero TLS model,
   i.e. the expression references thread-local storage.  Always false
   when the target has no TLS support.  */

static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      /* NB: this X intentionally shadows the parameter for the scan.  */
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
5190 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5191 a left shift of 0 or 12 bits. */
5193 aarch64_uimm12_shift (HOST_WIDE_INT val
)
5195 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
5196 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction, i.e. a 16-bit value placed at one
   of the 16-bit-aligned positions permitted for MODE's width.  */

static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      /* 64-bit modes additionally allow the 16-bit field at
	 positions 32 and 48.  */
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.

   NOTE(review): interior lines of this function were lost in extraction;
   the doubling loop below is reconstructed — confirm against the
   original source.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);

  /* Repeatedly mask to the element width and double it until the
     pattern fills all 64 bits.  */
  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.
   Multiplying a pattern of that width by the matching entry replicates
   it across all 64 bits; indexed by clz of the width (see
   aarch64_bitmask_imm).  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
/* Return true if val is a valid bitmask immediate (a repeating pattern
   of a rotated contiguous run of ones, as accepted by AND/ORR/EOR).  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);
  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
5296 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5297 Assumed precondition: VAL_IN Is not zero. */
5299 unsigned HOST_WIDE_INT
5300 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
5302 int lowest_bit_set
= ctz_hwi (val_in
);
5303 int highest_bit_set
= floor_log2 (val_in
);
5304 gcc_assert (val_in
!= 0);
5306 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
5307 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
5310 /* Create constant where bits outside of lowest bit set to highest bit set
5313 unsigned HOST_WIDE_INT
5314 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
5316 return val_in
| ~aarch64_and_split_imm1 (val_in
);
/* Return true if VAL_IN is a valid 'and' bitmask immediate, i.e. it is
   worth splitting an AND with VAL_IN into two instructions: it is not
   already encodable directly (as a bitmask or a move immediate) but the
   widened constant from aarch64_and_split_imm2 is a bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  /* Directly encodable as an AND immediate — no split needed.  */
  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  /* Cheaper to materialize with a single MOV — no split needed.  */
  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction (MOVZ, MOVN or a logical-immediate
   MOV form).  */

bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  /* MOVZ for VAL itself, or MOVN via the bitwise complement.  */
  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}
/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Return true if X cannot
   (or should not) be spilled to the constant pool.  */

static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      /* Pseudos are acceptable when not being strict.  */
      if (!strict_p)
	return true;

      /* Before register allocation there is no renumbering to consult.  */
      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}
/* Return true if register REGNO is a valid base register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      /* Pseudos are acceptable when not being strict.  */
      if (!strict_p)
	return true;

      /* Before register allocation there is no renumbering to consult.  */
      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  /* When not strict, look through a SUBREG of a general register.  */
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset X is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.

   Recognizes a register index with an optional scale, where the scale may
   appear as a MULT or ASHIFT, and the register may be zero- or
   sign-extended from SImode via SIGN_EXTEND/ZERO_EXTEND, SIGN_EXTRACT/
   ZERO_EXTRACT, or an AND with 0xffffffff<<shift.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  /* When not strict, look through a SUBREG of a general register.  */
  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      /* SVE data modes only allow an unscaled register or a register
	 scaled by exactly the element size.  */
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      /* Other modes allow shift 0, or a shift matching the access size.  */
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations: the 32- and 64-bit scalar integer and
   floating-point modes, plus supported 64-bit vector modes.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && known_eq (GET_MODE_SIZE (mode), 8));
}
5644 /* Return true if REGNO is a virtual pointer register, or an eliminable
5645 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5646 include stack_pointer or hard_frame_pointer. */
5648 virt_or_elim_regno_p (unsigned regno
)
5650 return ((regno
>= FIRST_VIRTUAL_REGISTER
5651 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
5652 || regno
== FRAME_POINTER_REGNUM
5653 || regno
== ARG_POINTER_REGNUM
);
5656 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5657 If it is, fill in INFO appropriately. STRICT_P is true if
5658 REG_OK_STRICT is in effect. */
5661 aarch64_classify_address (struct aarch64_address_info
*info
,
5662 rtx x
, machine_mode mode
, bool strict_p
,
5663 aarch64_addr_query_type type
= ADDR_QUERY_M
)
5665 enum rtx_code code
= GET_CODE (x
);
5669 HOST_WIDE_INT const_size
;
5671 /* On BE, we use load/store pair for all large int mode load/stores.
5672 TI/TFmode may also use a load/store pair. */
5673 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5674 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
5675 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
5678 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
5680 bool allow_reg_index_p
= (!load_store_pair_p
5681 && (known_lt (GET_MODE_SIZE (mode
), 16)
5682 || vec_flags
== VEC_ADVSIMD
5683 || vec_flags
== VEC_SVE_DATA
));
5685 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5686 [Rn, #offset, MUL VL]. */
5687 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
5688 && (code
!= REG
&& code
!= PLUS
))
5691 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5693 if (advsimd_struct_p
5694 && !BYTES_BIG_ENDIAN
5695 && (code
!= POST_INC
&& code
!= REG
))
5698 gcc_checking_assert (GET_MODE (x
) == VOIDmode
5699 || SCALAR_INT_MODE_P (GET_MODE (x
)));
5705 info
->type
= ADDRESS_REG_IMM
;
5707 info
->offset
= const0_rtx
;
5708 info
->const_offset
= 0;
5709 return aarch64_base_register_rtx_p (x
, strict_p
);
5717 && virt_or_elim_regno_p (REGNO (op0
))
5718 && poly_int_rtx_p (op1
, &offset
))
5720 info
->type
= ADDRESS_REG_IMM
;
5723 info
->const_offset
= offset
;
5728 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
5729 && aarch64_base_register_rtx_p (op0
, strict_p
)
5730 && poly_int_rtx_p (op1
, &offset
))
5732 info
->type
= ADDRESS_REG_IMM
;
5735 info
->const_offset
= offset
;
5737 /* TImode and TFmode values are allowed in both pairs of X
5738 registers and individual Q registers. The available
5740 X,X: 7-bit signed scaled offset
5741 Q: 9-bit signed offset
5742 We conservatively require an offset representable in either mode.
5743 When performing the check for pairs of X registers i.e. LDP/STP
5744 pass down DImode since that is the natural size of the LDP/STP
5745 instruction memory accesses. */
5746 if (mode
== TImode
|| mode
== TFmode
)
5747 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
5748 && (offset_9bit_signed_unscaled_p (mode
, offset
)
5749 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
5751 /* A 7bit offset check because OImode will emit a ldp/stp
5752 instruction (only big endian will get here).
5753 For ldp/stp instructions, the offset is scaled for the size of a
5754 single element of the pair. */
5756 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
5758 /* Three 9/12 bit offsets checks because CImode will emit three
5759 ldr/str instructions (only big endian will get here). */
5761 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5762 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
5763 || offset_12bit_unsigned_scaled_p (V16QImode
,
5766 /* Two 7bit offsets checks because XImode will emit two ldp/stp
5767 instructions (only big endian will get here). */
5769 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5770 && aarch64_offset_7bit_signed_scaled_p (TImode
,
5773 /* Make "m" use the LD1 offset range for SVE data modes, so
5774 that pre-RTL optimizers like ivopts will work to that
5775 instead of the wider LDR/STR range. */
5776 if (vec_flags
== VEC_SVE_DATA
)
5777 return (type
== ADDR_QUERY_M
5778 ? offset_4bit_signed_scaled_p (mode
, offset
)
5779 : offset_9bit_signed_scaled_p (mode
, offset
));
5781 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
5783 poly_int64 end_offset
= (offset
5784 + GET_MODE_SIZE (mode
)
5785 - BYTES_PER_SVE_VECTOR
);
5786 return (type
== ADDR_QUERY_M
5787 ? offset_4bit_signed_scaled_p (mode
, offset
)
5788 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
5789 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
5793 if (vec_flags
== VEC_SVE_PRED
)
5794 return offset_9bit_signed_scaled_p (mode
, offset
);
5796 if (load_store_pair_p
)
5797 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5798 || known_eq (GET_MODE_SIZE (mode
), 8))
5799 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5801 return (offset_9bit_signed_unscaled_p (mode
, offset
)
5802 || offset_12bit_unsigned_scaled_p (mode
, offset
));
5805 if (allow_reg_index_p
)
5807 /* Look for base + (scaled/extended) index register. */
5808 if (aarch64_base_register_rtx_p (op0
, strict_p
)
5809 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
5814 if (aarch64_base_register_rtx_p (op1
, strict_p
)
5815 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
5828 info
->type
= ADDRESS_REG_WB
;
5829 info
->base
= XEXP (x
, 0);
5830 info
->offset
= NULL_RTX
;
5831 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
5835 info
->type
= ADDRESS_REG_WB
;
5836 info
->base
= XEXP (x
, 0);
5837 if (GET_CODE (XEXP (x
, 1)) == PLUS
5838 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
5839 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
5840 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5842 info
->offset
= XEXP (XEXP (x
, 1), 1);
5843 info
->const_offset
= offset
;
5845 /* TImode and TFmode values are allowed in both pairs of X
5846 registers and individual Q registers. The available
5848 X,X: 7-bit signed scaled offset
5849 Q: 9-bit signed offset
5850 We conservatively require an offset representable in either mode.
5852 if (mode
== TImode
|| mode
== TFmode
)
5853 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
5854 && offset_9bit_signed_unscaled_p (mode
, offset
));
5856 if (load_store_pair_p
)
5857 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5858 || known_eq (GET_MODE_SIZE (mode
), 8))
5859 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5861 return offset_9bit_signed_unscaled_p (mode
, offset
);
5868 /* load literal: pc-relative constant pool entry. Only supported
5869 for SI mode or larger. */
5870 info
->type
= ADDRESS_SYMBOLIC
;
5872 if (!load_store_pair_p
5873 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
5878 split_const (x
, &sym
, &addend
);
5879 return ((GET_CODE (sym
) == LABEL_REF
5880 || (GET_CODE (sym
) == SYMBOL_REF
5881 && CONSTANT_POOL_ADDRESS_P (sym
)
5882 && aarch64_pcrelative_literal_loads
)));
5887 info
->type
= ADDRESS_LO_SUM
;
5888 info
->base
= XEXP (x
, 0);
5889 info
->offset
= XEXP (x
, 1);
5890 if (allow_reg_index_p
5891 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5894 split_const (info
->offset
, &sym
, &offs
);
5895 if (GET_CODE (sym
) == SYMBOL_REF
5896 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
5897 == SYMBOL_SMALL_ABSOLUTE
))
5899 /* The symbol and offset must be aligned to the access size. */
5902 if (CONSTANT_POOL_ADDRESS_P (sym
))
5903 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
5904 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
5906 tree exp
= SYMBOL_REF_DECL (sym
);
5907 align
= TYPE_ALIGN (TREE_TYPE (exp
));
5908 align
= aarch64_constant_alignment (exp
, align
);
5910 else if (SYMBOL_REF_DECL (sym
))
5911 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
5912 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
5913 && SYMBOL_REF_BLOCK (sym
) != NULL
)
5914 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
5916 align
= BITS_PER_UNIT
;
5918 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
5919 if (known_eq (ref_size
, 0))
5920 ref_size
= GET_MODE_SIZE (DImode
);
5922 return (multiple_p (INTVAL (offs
), ref_size
)
5923 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
5933 /* Return true if the address X is valid for a PRFM instruction.
5934 STRICT_P is true if we should do strict checking with
5935 aarch64_classify_address. */
5938 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
5940 struct aarch64_address_info addr
;
5942 /* PRFM accepts the same addresses as DImode... */
5943 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
5947 /* ... except writeback forms. */
5948 return addr
.type
!= ADDRESS_REG_WB
;
5952 aarch64_symbolic_address_p (rtx x
)
5956 split_const (x
, &x
, &offset
);
5957 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
5960 /* Classify the base of symbolic expression X. */
5962 enum aarch64_symbol_type
5963 aarch64_classify_symbolic_expression (rtx x
)
5967 split_const (x
, &x
, &offset
);
5968 return aarch64_classify_symbol (x
, INTVAL (offset
));
5972 /* Return TRUE if X is a legitimate address for accessing memory in
5975 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
5977 struct aarch64_address_info addr
;
5979 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
5982 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5983 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5985 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
5986 aarch64_addr_query_type type
)
5988 struct aarch64_address_info addr
;
5990 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
5993 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5996 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
5997 poly_int64 orig_offset
,
6001 if (GET_MODE_SIZE (mode
).is_constant (&size
))
6003 HOST_WIDE_INT const_offset
, second_offset
;
6005 /* A general SVE offset is A * VQ + B. Remove the A component from
6006 coefficient 0 in order to get the constant B. */
6007 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
6009 /* Split an out-of-range address displacement into a base and
6010 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6011 range otherwise to increase opportunities for sharing the base
6012 address of different sizes. Unaligned accesses use the signed
6013 9-bit range, TImode/TFmode use the intersection of signed
6014 scaled 7-bit and signed 9-bit offset. */
6015 if (mode
== TImode
|| mode
== TFmode
)
6016 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
6017 else if ((const_offset
& (size
- 1)) != 0)
6018 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
6020 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
6022 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
6025 /* Split the offset into second_offset and the rest. */
6026 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6027 *offset2
= gen_int_mode (second_offset
, Pmode
);
6032 /* Get the mode we should use as the basis of the range. For structure
6033 modes this is the mode of one vector. */
6034 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6035 machine_mode step_mode
6036 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
6038 /* Get the "mul vl" multiplier we'd like to use. */
6039 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
6040 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
6041 if (vec_flags
& VEC_SVE_DATA
)
6042 /* LDR supports a 9-bit range, but the move patterns for
6043 structure modes require all vectors to be in range of the
6044 same base. The simplest way of accomodating that while still
6045 promoting reuse of anchor points between different modes is
6046 to use an 8-bit range unconditionally. */
6047 vnum
= ((vnum
+ 128) & 255) - 128;
6049 /* Predicates are only handled singly, so we might as well use
6051 vnum
= ((vnum
+ 256) & 511) - 256;
6055 /* Convert the "mul vl" multiplier into a byte offset. */
6056 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
6057 if (known_eq (second_offset
, orig_offset
))
6060 /* Split the offset into second_offset and the rest. */
6061 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6062 *offset2
= gen_int_mode (second_offset
, Pmode
);
6067 /* Return the binary representation of floating point constant VALUE in INTVAL.
6068 If the value cannot be converted, return false without setting INTVAL.
6069 The conversion is done in the given MODE. */
6071 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
6074 /* We make a general exception for 0. */
6075 if (aarch64_float_const_zero_rtx_p (value
))
6081 scalar_float_mode mode
;
6082 if (GET_CODE (value
) != CONST_DOUBLE
6083 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
6084 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
6085 /* Only support up to DF mode. */
6086 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
6089 unsigned HOST_WIDE_INT ival
= 0;
6092 real_to_target (res
,
6093 CONST_DOUBLE_REAL_VALUE (value
),
6094 REAL_MODE_FORMAT (mode
));
6098 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
6099 ival
= zext_hwi (res
[order
], 32);
6100 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
6103 ival
= zext_hwi (res
[0], 32);
6109 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6110 single MOV(+MOVK) followed by an FMOV. */
6112 aarch64_float_const_rtx_p (rtx x
)
6114 machine_mode mode
= GET_MODE (x
);
6115 if (mode
== VOIDmode
)
6118 /* Determine whether it's cheaper to write float constants as
6119 mov/movk pairs over ldr/adrp pairs. */
6120 unsigned HOST_WIDE_INT ival
;
6122 if (GET_CODE (x
) == CONST_DOUBLE
6123 && SCALAR_FLOAT_MODE_P (mode
)
6124 && aarch64_reinterpret_float_as_int (x
, &ival
))
6126 scalar_int_mode imode
= (mode
== HFmode
6128 : int_mode_for_mode (mode
).require ());
6129 int num_instr
= aarch64_internal_mov_immediate
6130 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
6131 return num_instr
< 3;
6137 /* Return TRUE if rtx X is immediate constant 0.0 */
6139 aarch64_float_const_zero_rtx_p (rtx x
)
6141 if (GET_MODE (x
) == VOIDmode
)
6144 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
6145 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
6146 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
6149 /* Return TRUE if rtx X is immediate constant that fits in a single
6150 MOVI immediate operation. */
6152 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
6158 scalar_int_mode imode
;
6159 unsigned HOST_WIDE_INT ival
;
6161 if (GET_CODE (x
) == CONST_DOUBLE
6162 && SCALAR_FLOAT_MODE_P (mode
))
6164 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
6167 /* We make a general exception for 0. */
6168 if (aarch64_float_const_zero_rtx_p (x
))
6171 imode
= int_mode_for_mode (mode
).require ();
6173 else if (GET_CODE (x
) == CONST_INT
6174 && is_a
<scalar_int_mode
> (mode
, &imode
))
6179 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6180 a 128 bit vector mode. */
6181 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
6183 vmode
= aarch64_simd_container_mode (imode
, width
);
6184 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
6186 return aarch64_simd_valid_immediate (v_op
, NULL
);
6190 /* Return the fixed registers used for condition codes. */
6193 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
6196 *p2
= INVALID_REGNUM
;
6200 /* This function is used by the call expanders of the machine description.
6201 RESULT is the register in which the result is returned. It's NULL for
6202 "call" and "sibcall".
6203 MEM is the location of the function call.
6204 SIBCALL indicates whether this function call is normal call or sibling call.
6205 It will generate different pattern accordingly. */
6208 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
6210 rtx call
, callee
, tmp
;
6214 gcc_assert (MEM_P (mem
));
6215 callee
= XEXP (mem
, 0);
6216 mode
= GET_MODE (callee
);
6217 gcc_assert (mode
== Pmode
);
6219 /* Decide if we should generate indirect calls by loading the
6220 address of the callee into a register before performing
6221 the branch-and-link. */
6222 if (SYMBOL_REF_P (callee
)
6223 ? (aarch64_is_long_call_p (callee
)
6224 || aarch64_is_noplt_call_p (callee
))
6226 XEXP (mem
, 0) = force_reg (mode
, callee
);
6228 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
6230 if (result
!= NULL_RTX
)
6231 call
= gen_rtx_SET (result
, call
);
6236 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
6238 vec
= gen_rtvec (2, call
, tmp
);
6239 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
6241 aarch64_emit_call_insn (call
);
6244 /* Emit call insn with PAT and do aarch64-specific handling. */
6247 aarch64_emit_call_insn (rtx pat
)
6249 rtx insn
= emit_call_insn (pat
);
6251 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
6252 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
6253 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
6257 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
6259 /* All floating point compares return CCFP if it is an equality
6260 comparison, and CCFPE otherwise. */
6261 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
6288 /* Equality comparisons of short modes against zero can be performed
6289 using the TST instruction with the appropriate bitmask. */
6290 if (y
== const0_rtx
&& REG_P (x
)
6291 && (code
== EQ
|| code
== NE
)
6292 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
6295 /* Similarly, comparisons of zero_extends from shorter modes can
6296 be performed using an ANDS with an immediate mask. */
6297 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
6298 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6299 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
6300 && (code
== EQ
|| code
== NE
))
6303 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6305 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
6306 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
6307 || GET_CODE (x
) == NEG
6308 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
6309 && CONST_INT_P (XEXP (x
, 2)))))
6312 /* A compare with a shifted operand. Because of canonicalization,
6313 the comparison will have to be swapped when we emit the assembly
6315 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6316 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
6317 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
6318 || GET_CODE (x
) == LSHIFTRT
6319 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
6322 /* Similarly for a negated operand, but we can only do this for
6324 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6325 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
6326 && (code
== EQ
|| code
== NE
)
6327 && GET_CODE (x
) == NEG
)
6330 /* A test for unsigned overflow. */
6331 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
6333 && GET_CODE (x
) == PLUS
6334 && GET_CODE (y
) == ZERO_EXTEND
)
6337 /* For everything else, return CCmode. */
6342 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
6345 aarch64_get_condition_code (rtx x
)
6347 machine_mode mode
= GET_MODE (XEXP (x
, 0));
6348 enum rtx_code comp_code
= GET_CODE (x
);
6350 if (GET_MODE_CLASS (mode
) != MODE_CC
)
6351 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
6352 return aarch64_get_condition_code_1 (mode
, comp_code
);
6356 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
6364 case GE
: return AARCH64_GE
;
6365 case GT
: return AARCH64_GT
;
6366 case LE
: return AARCH64_LS
;
6367 case LT
: return AARCH64_MI
;
6368 case NE
: return AARCH64_NE
;
6369 case EQ
: return AARCH64_EQ
;
6370 case ORDERED
: return AARCH64_VC
;
6371 case UNORDERED
: return AARCH64_VS
;
6372 case UNLT
: return AARCH64_LT
;
6373 case UNLE
: return AARCH64_LE
;
6374 case UNGT
: return AARCH64_HI
;
6375 case UNGE
: return AARCH64_PL
;
6383 case NE
: return AARCH64_NE
;
6384 case EQ
: return AARCH64_EQ
;
6385 case GE
: return AARCH64_GE
;
6386 case GT
: return AARCH64_GT
;
6387 case LE
: return AARCH64_LE
;
6388 case LT
: return AARCH64_LT
;
6389 case GEU
: return AARCH64_CS
;
6390 case GTU
: return AARCH64_HI
;
6391 case LEU
: return AARCH64_LS
;
6392 case LTU
: return AARCH64_CC
;
6400 case NE
: return AARCH64_NE
;
6401 case EQ
: return AARCH64_EQ
;
6402 case GE
: return AARCH64_LE
;
6403 case GT
: return AARCH64_LT
;
6404 case LE
: return AARCH64_GE
;
6405 case LT
: return AARCH64_GT
;
6406 case GEU
: return AARCH64_LS
;
6407 case GTU
: return AARCH64_CC
;
6408 case LEU
: return AARCH64_CS
;
6409 case LTU
: return AARCH64_HI
;
6417 case NE
: return AARCH64_NE
;
6418 case EQ
: return AARCH64_EQ
;
6419 case GE
: return AARCH64_PL
;
6420 case LT
: return AARCH64_MI
;
6428 case NE
: return AARCH64_NE
;
6429 case EQ
: return AARCH64_EQ
;
6437 case NE
: return AARCH64_CS
;
6438 case EQ
: return AARCH64_CC
;
6451 aarch64_const_vec_all_same_in_range_p (rtx x
,
6452 HOST_WIDE_INT minval
,
6453 HOST_WIDE_INT maxval
)
6456 return (const_vec_duplicate_p (x
, &elt
)
6457 && CONST_INT_P (elt
)
6458 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
6462 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
6464 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
6467 /* Return true if VEC is a constant in which every element is in the range
6468 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6471 aarch64_const_vec_all_in_range_p (rtx vec
,
6472 HOST_WIDE_INT minval
,
6473 HOST_WIDE_INT maxval
)
6475 if (GET_CODE (vec
) != CONST_VECTOR
6476 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
6480 if (!CONST_VECTOR_STEPPED_P (vec
))
6481 nunits
= const_vector_encoded_nelts (vec
);
6482 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
6485 for (int i
= 0; i
< nunits
; i
++)
6487 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
6488 if (!CONST_INT_P (vec_elem
)
6489 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
/* Bit positions of the N, Z, C and V flags as encoded in the NZCV
   immediate of a CCMP instruction.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
6522 /* Print floating-point vector immediate operand X to F, negating it
6523 first if NEGATE is true. Return true on success, false if it isn't
6524 a constant we can handle. */
6527 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
6531 if (!const_vec_duplicate_p (x
, &elt
))
6534 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
6536 r
= real_value_negate (&r
);
6538 /* We only handle the SVE single-bit immediates here. */
6539 if (real_equal (&r
, &dconst0
))
6540 asm_fprintf (f
, "0.0");
6541 else if (real_equal (&r
, &dconst1
))
6542 asm_fprintf (f
, "1.0");
6543 else if (real_equal (&r
, &dconsthalf
))
6544 asm_fprintf (f
, "0.5");
/* Return the equivalent letter for size: the AArch64 register-suffix
   character for an element of SIZE bits (d/s/h/b).  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* NOTE(review): the text below is a line-mangled extraction of
   aarch64_print_operand; most case labels, braces and some statements
   were lost (visible as gaps in the embedded original line numbers).
   Code is left byte-identical; only comments are added.  Restore from
   the pristine upstream file before compiling.  */
6565 /* Print operand X to file F in a target specific manner according to CODE.
6566 The acceptable formatting commands given by CODE are:
6567 'c': An integer or symbol address without a preceding #
6569 'C': Take the duplicated element in a vector constant
6570 and print it in hex.
6571 'D': Take the duplicated element in a vector constant
6572 and print it as an unsigned integer, in decimal.
6573 'e': Print the sign/zero-extend size as a character 8->b,
6575 'p': Prints N such that 2^N == X (X must be power of 2 and
6577 'P': Print the number of non-zero bits in X (a const_int).
6578 'H': Print the higher numbered register of a pair (TImode)
6580 'm': Print a condition (eq, ne, etc).
6581 'M': Same as 'm', but invert condition.
6582 'N': Take the duplicated element in a vector constant
6583 and print the negative of it in decimal.
6584 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6585 'S/T/U/V': Print a FP/SIMD register name for a register list.
6586 The register printed is the FP/SIMD register name
6587 of X + 0/1/2/3 for S/T/U/V.
6588 'R': Print a scalar FP/SIMD register name + 1.
6589 'X': Print bottom 16 bits of integer constant in hex.
6590 'w/x': Print a general register name or the zero register
6592 '0': Print a normal operand, if it's a general register,
6593 then we assume DImode.
6594 'k': Print NZCV for conditional compare instructions.
6595 'A': Output address constant representing the first
6596 argument of X, specifying a relocation offset
6598 'L': Output constant address specified by X
6599 with a relocation offset if appropriate.
6600 'G': Prints address of X, specifying a PC relative
6601 relocation mode if appropriate.
6602 'y': Output address of LDP or STP - this is used for
6603 some LDP/STPs which don't use a PARALLEL in their
6604 pattern (so the mode needs to be adjusted).
6605 'z': Output address of a typical LDP or STP. */
6608 aarch64_print_operand (FILE *f
, rtx x
, int code
)
/* Apparently the 'c' handler: plain integer / symbolic address
   without a leading '#'.  Case labels lost in extraction.  */
6614 switch (GET_CODE (x
))
6617 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
6621 output_addr_const (f
, x
);
6625 if (GET_CODE (XEXP (x
, 0)) == PLUS
6626 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
6628 output_addr_const (f
, x
);
6634 output_operand_lossage ("unsupported operand for code '%c'", code
);
/* Presumably the 'e' (extend-size) handler follows — TODO confirm.  */
6642 if (!CONST_INT_P (x
)
6643 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
6645 output_operand_lossage ("invalid operand for '%%%c'", code
);
6661 output_operand_lossage ("invalid operand for '%%%c'", code
);
/* 'p': power-of-two exponent of X.  */
6671 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
6673 output_operand_lossage ("invalid operand for '%%%c'", code
);
6677 asm_fprintf (f
, "%d", n
);
/* 'P': population count of X.  */
6682 if (!CONST_INT_P (x
))
6684 output_operand_lossage ("invalid operand for '%%%c'", code
);
6688 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
/* 'H': the odd register of a TImode pair.  */
6692 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
6694 output_operand_lossage ("invalid operand for '%%%c'", code
);
6698 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
/* 'm'/'M': print (possibly inverted) condition name.  */
6705 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6706 if (x
== const_true_rtx
)
6713 if (!COMPARISON_P (x
))
6715 output_operand_lossage ("invalid operand for '%%%c'", code
);
6719 cond_code
= aarch64_get_condition_code (x
);
6720 gcc_assert (cond_code
>= 0);
6722 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
6723 fputs (aarch64_condition_codes
[cond_code
], f
);
/* 'N': negated duplicated vector element, in decimal.  */
6728 if (!const_vec_duplicate_p (x
, &elt
))
6730 output_operand_lossage ("invalid vector constant");
6734 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6735 asm_fprintf (f
, "%wd", -INTVAL (elt
));
6736 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6737 && aarch64_print_vector_float_operand (f
, x
, true))
6741 output_operand_lossage ("invalid vector constant");
/* 'b'/'h'/'s'/'d'/'q': scalar FP/SIMD register names.  */
6751 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6753 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6756 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
/* 'S'..'V': register-list names (offset by code - 'S').  */
6763 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6765 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6768 asm_fprintf (f
, "%c%d",
6769 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
6770 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
/* 'R': the following Q register.  */
6774 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6776 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6779 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
/* 'X': low 16 bits of an integer constant in hex.  */
6783 if (!CONST_INT_P (x
))
6785 output_operand_lossage ("invalid operand for '%%%c'", code
);
6788 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
6793 /* Print a replicated constant in hex. */
6794 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6796 output_operand_lossage ("invalid operand for '%%%c'", code
);
6799 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6800 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6806 /* Print a replicated constant in decimal, treating it as
6808 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6810 output_operand_lossage ("invalid operand for '%%%c'", code
);
6813 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6814 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
/* 'w'/'x': general registers, zero register, or sp.  */
6821 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
6823 asm_fprintf (f
, "%czr", code
);
6827 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
6829 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
6833 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
6835 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
/* Default operand printing ('0'): dispatch on the rtx code of X.  */
6844 output_operand_lossage ("missing operand");
6848 switch (GET_CODE (x
))
6851 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
6853 if (REG_NREGS (x
) == 1)
6854 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
6858 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
6859 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
6860 REGNO (x
) - V0_REGNUM
, suffix
,
6861 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
6865 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
6869 output_address (GET_MODE (x
), XEXP (x
, 0));
6874 output_addr_const (asm_out_file
, x
);
6878 asm_fprintf (f
, "%wd", INTVAL (x
));
6882 if (!VECTOR_MODE_P (GET_MODE (x
)))
6884 output_addr_const (asm_out_file
, x
);
6890 if (!const_vec_duplicate_p (x
, &elt
))
6892 output_operand_lossage ("invalid vector constant");
6896 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6897 asm_fprintf (f
, "%wd", INTVAL (elt
));
6898 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6899 && aarch64_print_vector_float_operand (f
, x
, false))
6903 output_operand_lossage ("invalid vector constant");
6909 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6910 be getting CONST_DOUBLEs holding integers. */
6911 gcc_assert (GET_MODE (x
) != VOIDmode
);
6912 if (aarch64_float_const_zero_rtx_p (x
))
6917 else if (aarch64_float_const_representable_p (x
))
6920 char float_buf
[buf_size
] = {'\0'};
6921 real_to_decimal_for_mode (float_buf
,
6922 CONST_DOUBLE_REAL_VALUE (x
),
6925 asm_fprintf (asm_out_file
, "%s", float_buf
);
6929 output_operand_lossage ("invalid constant");
6932 output_operand_lossage ("invalid operand");
/* 'A': relocation prefix for the first argument of X.  */
6938 if (GET_CODE (x
) == HIGH
)
6941 switch (aarch64_classify_symbolic_expression (x
))
6943 case SYMBOL_SMALL_GOT_4G
:
6944 asm_fprintf (asm_out_file
, ":got:");
6947 case SYMBOL_SMALL_TLSGD
:
6948 asm_fprintf (asm_out_file
, ":tlsgd:");
6951 case SYMBOL_SMALL_TLSDESC
:
6952 asm_fprintf (asm_out_file
, ":tlsdesc:");
6955 case SYMBOL_SMALL_TLSIE
:
6956 asm_fprintf (asm_out_file
, ":gottprel:");
6959 case SYMBOL_TLSLE24
:
6960 asm_fprintf (asm_out_file
, ":tprel:");
6963 case SYMBOL_TINY_GOT
:
6970 output_addr_const (asm_out_file
, x
);
/* 'L': lo12-style relocation prefixes.  */
6974 switch (aarch64_classify_symbolic_expression (x
))
6976 case SYMBOL_SMALL_GOT_4G
:
6977 asm_fprintf (asm_out_file
, ":lo12:");
6980 case SYMBOL_SMALL_TLSGD
:
6981 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
6984 case SYMBOL_SMALL_TLSDESC
:
6985 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
6988 case SYMBOL_SMALL_TLSIE
:
6989 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
6992 case SYMBOL_TLSLE12
:
6993 asm_fprintf (asm_out_file
, ":tprel_lo12:");
6996 case SYMBOL_TLSLE24
:
6997 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7000 case SYMBOL_TINY_GOT
:
7001 asm_fprintf (asm_out_file
, ":got:");
7004 case SYMBOL_TINY_TLSIE
:
7005 asm_fprintf (asm_out_file
, ":gottprel:");
7011 output_addr_const (asm_out_file
, x
);
/* 'G': hi12 TLS relocation prefix.  */
7015 switch (aarch64_classify_symbolic_expression (x
))
7017 case SYMBOL_TLSLE24
:
7018 asm_fprintf (asm_out_file
, ":tprel_hi12:");
7023 output_addr_const (asm_out_file
, x
);
/* 'k': NZCV immediate for conditional compares.  */
7028 HOST_WIDE_INT cond_code
;
7030 if (!CONST_INT_P (x
))
7032 output_operand_lossage ("invalid operand for '%%%c'", code
);
7036 cond_code
= INTVAL (x
);
7037 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
7038 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
/* 'y'/'z': LDP/STP address operands.  */
7045 machine_mode mode
= GET_MODE (x
);
7047 if (GET_CODE (x
) != MEM
7048 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
7050 output_operand_lossage ("invalid operand for '%%%c'", code
);
7055 /* LDP/STP which uses a single double-width memory operand.
7056 Adjust the mode to appear like a typical LDP/STP.
7057 Currently this is supported for 16-byte accesses only. */
7060 if (!aarch64_print_ldpstp_address (f
, mode
, XEXP (x
, 0)))
7061 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7066 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7071 /* Print address 'x' of a memory access with mode 'mode'.
7072 'op' is the context required by aarch64_classify_address. It can either be
7073 MEM for a normal memory access or PARALLEL for LDP/STP. */
7075 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
7076 aarch64_addr_query_type type
)
7078 struct aarch64_address_info addr
;
7081 /* Check all addresses are Pmode - including ILP32. */
7082 if (GET_MODE (x
) != Pmode
)
7083 output_operand_lossage ("invalid address mode");
7085 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
7088 case ADDRESS_REG_IMM
:
7089 if (known_eq (addr
.const_offset
, 0))
7090 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
7091 else if (aarch64_sve_data_mode_p (mode
))
7094 = exact_div (addr
.const_offset
,
7095 BYTES_PER_SVE_VECTOR
).to_constant ();
7096 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7097 reg_names
[REGNO (addr
.base
)], vnum
);
7099 else if (aarch64_sve_pred_mode_p (mode
))
7102 = exact_div (addr
.const_offset
,
7103 BYTES_PER_SVE_PRED
).to_constant ();
7104 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7105 reg_names
[REGNO (addr
.base
)], vnum
);
7108 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
7109 INTVAL (addr
.offset
));
7112 case ADDRESS_REG_REG
:
7113 if (addr
.shift
== 0)
7114 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
7115 reg_names
[REGNO (addr
.offset
)]);
7117 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
7118 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
7121 case ADDRESS_REG_UXTW
:
7122 if (addr
.shift
== 0)
7123 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
7124 REGNO (addr
.offset
) - R0_REGNUM
);
7126 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
7127 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7130 case ADDRESS_REG_SXTW
:
7131 if (addr
.shift
== 0)
7132 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
7133 REGNO (addr
.offset
) - R0_REGNUM
);
7135 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
7136 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7139 case ADDRESS_REG_WB
:
7140 /* Writeback is only supported for fixed-width modes. */
7141 size
= GET_MODE_SIZE (mode
).to_constant ();
7142 switch (GET_CODE (x
))
7145 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
7148 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
7151 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
7154 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
7157 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
7158 INTVAL (addr
.offset
));
7161 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
7162 INTVAL (addr
.offset
));
7169 case ADDRESS_LO_SUM
:
7170 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
7171 output_addr_const (f
, addr
.offset
);
7172 asm_fprintf (f
, "]");
7175 case ADDRESS_SYMBOLIC
:
7176 output_addr_const (f
, x
);
7183 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7185 aarch64_print_ldpstp_address (FILE *f
, machine_mode mode
, rtx x
)
7187 return aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_LDP_STP
);
7190 /* Print address 'x' of a memory access with mode 'mode'. */
7192 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
7194 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
7195 output_addr_const (f
, x
);
7199 aarch64_label_mentioned_p (rtx x
)
7204 if (GET_CODE (x
) == LABEL_REF
)
7207 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7208 referencing instruction, but they are constant offsets, not
7210 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
7213 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
7214 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
7220 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
7221 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
7224 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
7231 /* Implement REGNO_REG_CLASS. */
7234 aarch64_regno_regclass (unsigned regno
)
7236 if (GP_REGNUM_P (regno
))
7237 return GENERAL_REGS
;
7239 if (regno
== SP_REGNUM
)
7242 if (regno
== FRAME_POINTER_REGNUM
7243 || regno
== ARG_POINTER_REGNUM
)
7244 return POINTER_REGS
;
7246 if (FP_REGNUM_P (regno
))
7247 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
7249 if (PR_REGNUM_P (regno
))
7250 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
7255 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7256 If OFFSET is out of range, return an offset of an anchor point
7257 that is in range. Return 0 otherwise. */
7259 static HOST_WIDE_INT
7260 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
7263 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7265 return (offset
+ 0x400) & ~0x7f0;
7267 /* For offsets that aren't a multiple of the access size, the limit is
7269 if (offset
& (size
- 1))
7271 /* BLKmode typically uses LDP of X-registers. */
7272 if (mode
== BLKmode
)
7273 return (offset
+ 512) & ~0x3ff;
7274 return (offset
+ 0x100) & ~0x1ff;
7277 /* Small negative offsets are supported. */
7278 if (IN_RANGE (offset
, -256, 0))
7281 if (mode
== TImode
|| mode
== TFmode
)
7282 return (offset
+ 0x100) & ~0x1ff;
7284 /* Use 12-bit offset by access size. */
7285 return offset
& (~0xfff * size
);
7289 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
7291 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7292 where mask is selected by alignment and size of the offset.
7293 We try to pick as large a range for the offset as possible to
7294 maximize the chance of a CSE. However, for aligned addresses
7295 we limit the range to 4k so that structures with different sized
7296 elements are likely to use the same base. We need to be careful
7297 not to split a CONST for some forms of address expression, otherwise
7298 it will generate sub-optimal code. */
7300 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
7302 rtx base
= XEXP (x
, 0);
7303 rtx offset_rtx
= XEXP (x
, 1);
7304 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
7306 if (GET_CODE (base
) == PLUS
)
7308 rtx op0
= XEXP (base
, 0);
7309 rtx op1
= XEXP (base
, 1);
7311 /* Force any scaling into a temp for CSE. */
7312 op0
= force_reg (Pmode
, op0
);
7313 op1
= force_reg (Pmode
, op1
);
7315 /* Let the pointer register be in op0. */
7316 if (REG_POINTER (op1
))
7317 std::swap (op0
, op1
);
7319 /* If the pointer is virtual or frame related, then we know that
7320 virtual register instantiation or register elimination is going
7321 to apply a second constant. We want the two constants folded
7322 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7323 if (virt_or_elim_regno_p (REGNO (op0
)))
7325 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
7326 NULL_RTX
, true, OPTAB_DIRECT
);
7327 return gen_rtx_PLUS (Pmode
, base
, op1
);
7330 /* Otherwise, in order to encourage CSE (and thence loop strength
7331 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7332 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
7333 NULL_RTX
, true, OPTAB_DIRECT
);
7334 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
7338 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7340 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
7342 if (base_offset
!= 0)
7344 base
= plus_constant (Pmode
, base
, base_offset
);
7345 base
= force_operand (base
, NULL_RTX
);
7346 return plus_constant (Pmode
, base
, offset
- base_offset
);
7354 /* Return the reload icode required for a constant pool in mode. */
7355 static enum insn_code
7356 aarch64_constant_pool_reload_icode (machine_mode mode
)
7361 return CODE_FOR_aarch64_reload_movcpsfdi
;
7364 return CODE_FOR_aarch64_reload_movcpdfdi
;
7367 return CODE_FOR_aarch64_reload_movcptfdi
;
7370 return CODE_FOR_aarch64_reload_movcpv8qidi
;
7373 return CODE_FOR_aarch64_reload_movcpv16qidi
;
7376 return CODE_FOR_aarch64_reload_movcpv4hidi
;
7379 return CODE_FOR_aarch64_reload_movcpv8hidi
;
7382 return CODE_FOR_aarch64_reload_movcpv2sidi
;
7385 return CODE_FOR_aarch64_reload_movcpv4sidi
;
7388 return CODE_FOR_aarch64_reload_movcpv2didi
;
7391 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
7400 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
7403 secondary_reload_info
*sri
)
7405 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7406 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7407 comment at the head of aarch64-sve.md for more details about the
7408 big-endian handling. */
7409 if (BYTES_BIG_ENDIAN
7410 && reg_class_subset_p (rclass
, FP_REGS
)
7411 && !((REG_P (x
) && HARD_REGISTER_P (x
))
7412 || aarch64_simd_valid_immediate (x
, NULL
))
7413 && aarch64_sve_data_mode_p (mode
))
7415 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
7419 /* If we have to disable direct literal pool loads and stores because the
7420 function is too big, then we need a scratch register. */
7421 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
7422 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
7423 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
7424 && !aarch64_pcrelative_literal_loads
)
7426 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
7430 /* Without the TARGET_SIMD instructions we cannot move a Q register
7431 to a Q register directly. We need a scratch. */
7432 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
7433 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
7434 && reg_class_subset_p (rclass
, FP_REGS
))
7437 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
7438 else if (mode
== TImode
)
7439 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
7443 /* A TFmode or TImode memory access should be handled via an FP_REGS
7444 because AArch64 has richer addressing modes for LDR/STR instructions
7445 than LDP/STP instructions. */
7446 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
7447 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
7450 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
7451 return GENERAL_REGS
;
7457 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
7459 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
7461 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7462 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7463 if (frame_pointer_needed
)
7464 return to
== HARD_FRAME_POINTER_REGNUM
;
7469 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
7471 aarch64_layout_frame ();
7473 if (to
== HARD_FRAME_POINTER_REGNUM
)
7475 if (from
== ARG_POINTER_REGNUM
)
7476 return cfun
->machine
->frame
.hard_fp_offset
;
7478 if (from
== FRAME_POINTER_REGNUM
)
7479 return cfun
->machine
->frame
.hard_fp_offset
7480 - cfun
->machine
->frame
.locals_offset
;
7483 if (to
== STACK_POINTER_REGNUM
)
7485 if (from
== FRAME_POINTER_REGNUM
)
7486 return cfun
->machine
->frame
.frame_size
7487 - cfun
->machine
->frame
.locals_offset
;
7490 return cfun
->machine
->frame
.frame_size
;
7493 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7497 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
7501 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
7506 aarch64_asm_trampoline_template (FILE *f
)
7510 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
7511 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
7515 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
7516 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
7518 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
7519 assemble_aligned_integer (4, const0_rtx
);
7520 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
7521 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
7525 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
7527 rtx fnaddr
, mem
, a_tramp
;
7528 const int tramp_code_sz
= 16;
7530 /* Don't need to copy the trailing D-words, we fill those in below. */
7531 emit_block_move (m_tramp
, assemble_trampoline_template (),
7532 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
7533 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
7534 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
7535 if (GET_MODE (fnaddr
) != ptr_mode
)
7536 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
7537 emit_move_insn (mem
, fnaddr
);
7539 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
7540 emit_move_insn (mem
, chain_value
);
7542 /* XXX We should really define a "clear_cache" pattern and use
7543 gen_clear_cache(). */
7544 a_tramp
= XEXP (m_tramp
, 0);
7545 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
7546 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
7547 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
7551 static unsigned char
7552 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
7554 /* ??? Logically we should only need to provide a value when
7555 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7556 can hold MODE, but at the moment we need to handle all modes.
7557 Just ignore any runtime parts for registers that can't store them. */
7558 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
7562 case TAILCALL_ADDR_REGS
:
7566 case POINTER_AND_FP_REGS
:
7569 if (aarch64_sve_data_mode_p (mode
)
7570 && constant_multiple_p (GET_MODE_SIZE (mode
),
7571 BYTES_PER_SVE_VECTOR
, &nregs
))
7573 return (aarch64_vector_data_mode_p (mode
)
7574 ? CEIL (lowest_size
, UNITS_PER_VREG
)
7575 : CEIL (lowest_size
, UNITS_PER_WORD
));
7592 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
7594 if (regclass
== POINTER_REGS
)
7595 return GENERAL_REGS
;
7597 if (regclass
== STACK_REG
)
7600 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
7606 /* Register eliminiation can result in a request for
7607 SP+constant->FP_REGS. We cannot support such operations which
7608 use SP as source and an FP_REG as destination, so reject out
7610 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
7612 rtx lhs
= XEXP (x
, 0);
7614 /* Look through a possible SUBREG introduced by ILP32. */
7615 if (GET_CODE (lhs
) == SUBREG
)
7616 lhs
= SUBREG_REG (lhs
);
7618 gcc_assert (REG_P (lhs
));
7619 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
/* Print NAME to F, prefixed with the user-label prefix ('%U').  */
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
7634 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
7636 if (priority
== DEFAULT_INIT_PRIORITY
)
7637 default_ctor_section_asm_out_constructor (symbol
, priority
);
7641 /* While priority is known to be in range [0, 65535], so 18 bytes
7642 would be enough, the compiler might not know that. To avoid
7643 -Wformat-truncation false positive, use a larger size. */
7645 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
7646 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7647 switch_to_section (s
);
7648 assemble_align (POINTER_SIZE
);
7649 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7654 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
7656 if (priority
== DEFAULT_INIT_PRIORITY
)
7657 default_dtor_section_asm_out_destructor (symbol
, priority
);
7661 /* While priority is known to be in range [0, 65535], so 18 bytes
7662 would be enough, the compiler might not know that. To avoid
7663 -Wformat-truncation false positive, use a larger size. */
7665 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
7666 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7667 switch_to_section (s
);
7668 assemble_align (POINTER_SIZE
);
7669 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7674 aarch64_output_casesi (rtx
*operands
)
7678 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
7680 static const char *const patterns
[4][2] =
7683 "ldrb\t%w3, [%0,%w1,uxtw]",
7684 "add\t%3, %4, %w3, sxtb #2"
7687 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7688 "add\t%3, %4, %w3, sxth #2"
7691 "ldr\t%w3, [%0,%w1,uxtw #2]",
7692 "add\t%3, %4, %w3, sxtw #2"
7694 /* We assume that DImode is only generated when not optimizing and
7695 that we don't really need 64-bit address offsets. That would
7696 imply an object file with 8GB of code in a single function! */
7698 "ldr\t%w3, [%0,%w1,uxtw #2]",
7699 "add\t%3, %4, %w3, sxtw #2"
7703 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
7705 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
7706 index
= exact_log2 (GET_MODE_SIZE (mode
));
7708 gcc_assert (index
>= 0 && index
<= 3);
7710 /* Need to implement table size reduction, by chaning the code below. */
7711 output_asm_insn (patterns
[index
][0], operands
);
7712 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
7713 snprintf (buf
, sizeof (buf
),
7714 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
7715 output_asm_insn (buf
, operands
);
7716 output_asm_insn (patterns
[index
][1], operands
);
7717 output_asm_insn ("br\t%3", operands
);
7718 assemble_label (asm_out_file
, label
);
7723 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7724 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7728 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
7730 if (shift
>= 0 && shift
<= 3)
7733 for (size
= 8; size
<= 32; size
*= 2)
7735 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
7736 if (mask
== bits
<< shift
)
7743 /* Constant pools are per function only when PC relative
7744 literal loads are true or we are in the large memory
7748 aarch64_can_use_per_function_literal_pools_p (void)
7750 return (aarch64_pcrelative_literal_loads
7751 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
7755 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
7757 /* We can't use blocks for constants when we're using a per-function
7759 return !aarch64_can_use_per_function_literal_pools_p ();
7762 /* Select appropriate section for constants depending
7763 on where we place literal pools. */
7766 aarch64_select_rtx_section (machine_mode mode
,
7768 unsigned HOST_WIDE_INT align
)
7770 if (aarch64_can_use_per_function_literal_pools_p ())
7771 return function_section (current_function_decl
);
7773 return default_elf_select_rtx_section (mode
, x
, align
);
7776 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7778 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
7779 HOST_WIDE_INT offset
)
7781 /* When using per-function literal pools, we must ensure that any code
7782 section is aligned to the minimal instruction length, lest we get
7783 errors from the assembler re "unaligned instructions". */
7784 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
7785 ASM_OUTPUT_ALIGN (f
, 2);
7790 /* Helper function for rtx cost calculation. Strip a shift expression
7791 from X. Returns the inner operand if successful, or the original
7792 expression on failure. */
7794 aarch64_strip_shift (rtx x
)
7798 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7799 we can convert both to ROR during final output. */
7800 if ((GET_CODE (op
) == ASHIFT
7801 || GET_CODE (op
) == ASHIFTRT
7802 || GET_CODE (op
) == LSHIFTRT
7803 || GET_CODE (op
) == ROTATERT
7804 || GET_CODE (op
) == ROTATE
)
7805 && CONST_INT_P (XEXP (op
, 1)))
7806 return XEXP (op
, 0);
7808 if (GET_CODE (op
) == MULT
7809 && CONST_INT_P (XEXP (op
, 1))
7810 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
7811 return XEXP (op
, 0);
7816 /* Helper function for rtx cost calculation. Strip an extend
7817 expression from X. Returns the inner operand if successful, or the
7818 original expression on failure. We deal with a number of possible
7819 canonicalization variations here. If STRIP_SHIFT is true, then
7820 we can strip off a shift also. */
7822 aarch64_strip_extend (rtx x
, bool strip_shift
)
7824 scalar_int_mode mode
;
7827 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
7830 /* Zero and sign extraction of a widened value. */
7831 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
7832 && XEXP (op
, 2) == const0_rtx
7833 && GET_CODE (XEXP (op
, 0)) == MULT
7834 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
7836 return XEXP (XEXP (op
, 0), 0);
7838 /* It can also be represented (for zero-extend) as an AND with an
7840 if (GET_CODE (op
) == AND
7841 && GET_CODE (XEXP (op
, 0)) == MULT
7842 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
7843 && CONST_INT_P (XEXP (op
, 1))
7844 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
7845 INTVAL (XEXP (op
, 1))) != 0)
7846 return XEXP (XEXP (op
, 0), 0);
7848 /* Now handle extended register, as this may also have an optional
7849 left shift by 1..4. */
7851 && GET_CODE (op
) == ASHIFT
7852 && CONST_INT_P (XEXP (op
, 1))
7853 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
7856 if (GET_CODE (op
) == ZERO_EXTEND
7857 || GET_CODE (op
) == SIGN_EXTEND
)
7866 /* Return true iff CODE is a shift supported in combination
7867 with arithmetic instructions. */
7870 aarch64_shift_p (enum rtx_code code
)
7872 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
7876 /* Return true iff X is a cheap shift without a sign extend. */
7879 aarch64_cheap_mult_shift_p (rtx x
)
7886 if (!(aarch64_tune_params
.extra_tuning_flags
7887 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
7890 if (GET_CODE (op0
) == SIGN_EXTEND
)
7893 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
7894 && UINTVAL (op1
) <= 4)
7897 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
7900 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
7902 if (l2
> 0 && l2
<= 4)
7908 /* Helper function for rtx cost calculation. Calculate the cost of
7909 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7910 Return the calculated cost of the expression, recursing manually in to
7911 operands where needed. */
7914 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
7917 const struct cpu_cost_table
*extra_cost
7918 = aarch64_tune_params
.insn_extra_cost
;
7920 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
7921 machine_mode mode
= GET_MODE (x
);
7923 gcc_checking_assert (code
== MULT
);
7928 if (VECTOR_MODE_P (mode
))
7929 mode
= GET_MODE_INNER (mode
);
7931 /* Integer multiply/fma. */
7932 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7934 /* The multiply will be canonicalized as a shift, cost it as such. */
7935 if (aarch64_shift_p (GET_CODE (x
))
7936 || (CONST_INT_P (op1
)
7937 && exact_log2 (INTVAL (op1
)) > 0))
7939 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
7940 || GET_CODE (op0
) == SIGN_EXTEND
;
7945 /* If the shift is considered cheap,
7946 then don't add any cost. */
7947 if (aarch64_cheap_mult_shift_p (x
))
7949 else if (REG_P (op1
))
7950 /* ARITH + shift-by-register. */
7951 cost
+= extra_cost
->alu
.arith_shift_reg
;
7953 /* ARITH + extended register. We don't have a cost field
7954 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7955 cost
+= extra_cost
->alu
.extend_arith
;
7957 /* ARITH + shift-by-immediate. */
7958 cost
+= extra_cost
->alu
.arith_shift
;
7961 /* LSL (immediate). */
7962 cost
+= extra_cost
->alu
.shift
;
7965 /* Strip extends as we will have costed them in the case above. */
7967 op0
= aarch64_strip_extend (op0
, true);
7969 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
7974 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7975 compound and let the below cases handle it. After all, MNEG is a
7976 special-case alias of MSUB. */
7977 if (GET_CODE (op0
) == NEG
)
7979 op0
= XEXP (op0
, 0);
7983 /* Integer multiplies or FMAs have zero/sign extending variants. */
7984 if ((GET_CODE (op0
) == ZERO_EXTEND
7985 && GET_CODE (op1
) == ZERO_EXTEND
)
7986 || (GET_CODE (op0
) == SIGN_EXTEND
7987 && GET_CODE (op1
) == SIGN_EXTEND
))
7989 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
7990 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
7995 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7996 cost
+= extra_cost
->mult
[0].extend_add
;
7998 /* MUL/SMULL/UMULL. */
7999 cost
+= extra_cost
->mult
[0].extend
;
8005 /* This is either an integer multiply or a MADD. In both cases
8006 we want to recurse and cost the operands. */
8007 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8008 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8014 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
8017 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
8026 /* Floating-point FMA/FMUL can also support negations of the
8027 operands, unless the rounding mode is upward or downward in
8028 which case FNMUL is different than FMUL with operand negation. */
8029 bool neg0
= GET_CODE (op0
) == NEG
;
8030 bool neg1
= GET_CODE (op1
) == NEG
;
8031 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8034 op0
= XEXP (op0
, 0);
8036 op1
= XEXP (op1
, 0);
8040 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8041 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
8044 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
8047 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8048 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8054 aarch64_address_cost (rtx x
,
8056 addr_space_t as ATTRIBUTE_UNUSED
,
8059 enum rtx_code c
= GET_CODE (x
);
8060 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
8061 struct aarch64_address_info info
;
8065 if (!aarch64_classify_address (&info
, x
, mode
, false))
8067 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
8069 /* This is a CONST or SYMBOL ref which will be split
8070 in a different way depending on the code model in use.
8071 Cost it through the generic infrastructure. */
8072 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
8073 /* Divide through by the cost of one instruction to
8074 bring it to the same units as the address costs. */
8075 cost_symbol_ref
/= COSTS_N_INSNS (1);
8076 /* The cost is then the cost of preparing the address,
8077 followed by an immediate (possibly 0) offset. */
8078 return cost_symbol_ref
+ addr_cost
->imm_offset
;
8082 /* This is most likely a jump table from a case
8084 return addr_cost
->register_offset
;
8090 case ADDRESS_LO_SUM
:
8091 case ADDRESS_SYMBOLIC
:
8092 case ADDRESS_REG_IMM
:
8093 cost
+= addr_cost
->imm_offset
;
8096 case ADDRESS_REG_WB
:
8097 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
8098 cost
+= addr_cost
->pre_modify
;
8099 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
8100 cost
+= addr_cost
->post_modify
;
8106 case ADDRESS_REG_REG
:
8107 cost
+= addr_cost
->register_offset
;
8110 case ADDRESS_REG_SXTW
:
8111 cost
+= addr_cost
->register_sextend
;
8114 case ADDRESS_REG_UXTW
:
8115 cost
+= addr_cost
->register_zextend
;
8125 /* For the sake of calculating the cost of the shifted register
8126 component, we can treat same sized modes in the same way. */
8127 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
8128 cost
+= addr_cost
->addr_scale_costs
.hi
;
8129 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
8130 cost
+= addr_cost
->addr_scale_costs
.si
;
8131 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
8132 cost
+= addr_cost
->addr_scale_costs
.di
;
8134 /* We can't tell, or this is a 128-bit vector. */
8135 cost
+= addr_cost
->addr_scale_costs
.ti
;
8141 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8142 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8146 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
8148 /* When optimizing for speed, use the cost of unpredictable branches. */
8149 const struct cpu_branch_cost
*branch_costs
=
8150 aarch64_tune_params
.branch_costs
;
8152 if (!speed_p
|| predictable_p
)
8153 return branch_costs
->predictable
;
8155 return branch_costs
->unpredictable
;
8158 /* Return true if the RTX X in mode MODE is a zero or sign extract
8159 usable in an ADD or SUB (extended register) instruction. */
8161 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
8163 /* Catch add with a sign extract.
8164 This is add_<optab><mode>_multp2. */
8165 if (GET_CODE (x
) == SIGN_EXTRACT
8166 || GET_CODE (x
) == ZERO_EXTRACT
)
8168 rtx op0
= XEXP (x
, 0);
8169 rtx op1
= XEXP (x
, 1);
8170 rtx op2
= XEXP (x
, 2);
8172 if (GET_CODE (op0
) == MULT
8173 && CONST_INT_P (op1
)
8174 && op2
== const0_rtx
8175 && CONST_INT_P (XEXP (op0
, 1))
8176 && aarch64_is_extend_from_extract (mode
,
8183 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8185 else if (GET_CODE (x
) == SIGN_EXTEND
8186 || GET_CODE (x
) == ZERO_EXTEND
)
8187 return REG_P (XEXP (x
, 0));
8193 aarch64_frint_unspec_p (unsigned int u
)
8211 /* Return true iff X is an rtx that will match an extr instruction
8212 i.e. as described in the *extr<mode>5_insn family of patterns.
8213 OP0 and OP1 will be set to the operands of the shifts involved
8214 on success and will be NULL_RTX otherwise. */
8217 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
8220 scalar_int_mode mode
;
8221 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
8224 *res_op0
= NULL_RTX
;
8225 *res_op1
= NULL_RTX
;
8227 if (GET_CODE (x
) != IOR
)
8233 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
8234 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
8236 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8237 if (GET_CODE (op1
) == ASHIFT
)
8238 std::swap (op0
, op1
);
8240 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
8243 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
8244 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
8246 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
8247 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
8249 *res_op0
= XEXP (op0
, 0);
8250 *res_op1
= XEXP (op1
, 0);
8258 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8259 storing it in *COST. Result is true if the total cost of the operation
8260 has now been calculated. */
8262 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
8266 enum rtx_code cmpcode
;
8268 if (COMPARISON_P (op0
))
8270 inner
= XEXP (op0
, 0);
8271 comparator
= XEXP (op0
, 1);
8272 cmpcode
= GET_CODE (op0
);
8277 comparator
= const0_rtx
;
8281 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
8283 /* Conditional branch. */
8284 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8288 if (cmpcode
== NE
|| cmpcode
== EQ
)
8290 if (comparator
== const0_rtx
)
8292 /* TBZ/TBNZ/CBZ/CBNZ. */
8293 if (GET_CODE (inner
) == ZERO_EXTRACT
)
8295 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
8296 ZERO_EXTRACT
, 0, speed
);
8299 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
8304 else if (cmpcode
== LT
|| cmpcode
== GE
)
8307 if (comparator
== const0_rtx
)
8312 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8315 if (GET_CODE (op1
) == COMPARE
)
8317 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8318 if (XEXP (op1
, 1) == const0_rtx
)
8322 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
8323 const struct cpu_cost_table
*extra_cost
8324 = aarch64_tune_params
.insn_extra_cost
;
8326 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8327 *cost
+= extra_cost
->alu
.arith
;
8329 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8334 /* It's a conditional operation based on the status flags,
8335 so it must be some flavor of CSEL. */
8337 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8338 if (GET_CODE (op1
) == NEG
8339 || GET_CODE (op1
) == NOT
8340 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
8341 op1
= XEXP (op1
, 0);
8342 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
8344 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8345 op1
= XEXP (op1
, 0);
8346 op2
= XEXP (op2
, 0);
8349 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
8350 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
8354 /* We don't know what this is, cost all operands. */
8358 /* Check whether X is a bitfield operation of the form shift + extend that
8359 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8360 operand to which the bitfield operation is applied. Otherwise return
8364 aarch64_extend_bitfield_pattern_p (rtx x
)
8366 rtx_code outer_code
= GET_CODE (x
);
8367 machine_mode outer_mode
= GET_MODE (x
);
8369 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
8370 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
8373 rtx inner
= XEXP (x
, 0);
8374 rtx_code inner_code
= GET_CODE (inner
);
8375 machine_mode inner_mode
= GET_MODE (inner
);
8381 if (CONST_INT_P (XEXP (inner
, 1))
8382 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8383 op
= XEXP (inner
, 0);
8386 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8387 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8388 op
= XEXP (inner
, 0);
8391 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8392 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8393 op
= XEXP (inner
, 0);
8402 /* Return true if the mask and a shift amount from an RTX of the form
8403 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8404 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8407 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
8410 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
8411 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
8412 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
8413 && (INTVAL (mask
) & ((1 << INTVAL (shft_amnt
)) - 1)) == 0;
8416 /* Calculate the cost of calculating X, storing it in *COST. Result
8417 is true if the total cost of the operation has now been calculated. */
8419 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
8420 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
8423 const struct cpu_cost_table
*extra_cost
8424 = aarch64_tune_params
.insn_extra_cost
;
8425 int code
= GET_CODE (x
);
8426 scalar_int_mode int_mode
;
8428 /* By default, assume that everything has equivalent cost to the
8429 cheapest instruction. Any additional costs are applied as a delta
8430 above this default. */
8431 *cost
= COSTS_N_INSNS (1);
8436 /* The cost depends entirely on the operands to SET. */
8441 switch (GET_CODE (op0
))
8446 rtx address
= XEXP (op0
, 0);
8447 if (VECTOR_MODE_P (mode
))
8448 *cost
+= extra_cost
->ldst
.storev
;
8449 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8450 *cost
+= extra_cost
->ldst
.store
;
8451 else if (mode
== SFmode
)
8452 *cost
+= extra_cost
->ldst
.storef
;
8453 else if (mode
== DFmode
)
8454 *cost
+= extra_cost
->ldst
.stored
;
8457 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8461 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8465 if (! REG_P (SUBREG_REG (op0
)))
8466 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
8470 /* The cost is one per vector-register copied. */
8471 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
8473 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
8474 *cost
= COSTS_N_INSNS (nregs
);
8476 /* const0_rtx is in general free, but we will use an
8477 instruction to set a register to 0. */
8478 else if (REG_P (op1
) || op1
== const0_rtx
)
8480 /* The cost is 1 per register copied. */
8481 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
8482 *cost
= COSTS_N_INSNS (nregs
);
8485 /* Cost is just the cost of the RHS of the set. */
8486 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8491 /* Bit-field insertion. Strip any redundant widening of
8492 the RHS to meet the width of the target. */
8493 if (GET_CODE (op1
) == SUBREG
)
8494 op1
= SUBREG_REG (op1
);
8495 if ((GET_CODE (op1
) == ZERO_EXTEND
8496 || GET_CODE (op1
) == SIGN_EXTEND
)
8497 && CONST_INT_P (XEXP (op0
, 1))
8498 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
8499 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
8500 op1
= XEXP (op1
, 0);
8502 if (CONST_INT_P (op1
))
8504 /* MOV immediate is assumed to always be cheap. */
8505 *cost
= COSTS_N_INSNS (1);
8511 *cost
+= extra_cost
->alu
.bfi
;
8512 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
8518 /* We can't make sense of this, assume default cost. */
8519 *cost
= COSTS_N_INSNS (1);
8525 /* If an instruction can incorporate a constant within the
8526 instruction, the instruction's expression avoids calling
8527 rtx_cost() on the constant. If rtx_cost() is called on a
8528 constant, then it is usually because the constant must be
8529 moved into a register by one or more instructions.
8531 The exception is constant 0, which can be expressed
8532 as XZR/WZR and is therefore free. The exception to this is
8533 if we have (set (reg) (const0_rtx)) in which case we must cost
8534 the move. However, we can catch that when we cost the SET, so
8535 we don't need to consider that here. */
8536 if (x
== const0_rtx
)
8540 /* To an approximation, building any other constant is
8541 proportionally expensive to the number of instructions
8542 required to build that constant. This is true whether we
8543 are compiling for SPEED or otherwise. */
8544 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8545 int_mode
= word_mode
;
8546 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
8547 (NULL_RTX
, x
, false, int_mode
));
8553 /* First determine number of instructions to do the move
8554 as an integer constant. */
8555 if (!aarch64_float_const_representable_p (x
)
8556 && !aarch64_can_const_movi_rtx_p (x
, mode
)
8557 && aarch64_float_const_rtx_p (x
))
8559 unsigned HOST_WIDE_INT ival
;
8560 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
8561 gcc_assert (succeed
);
8563 scalar_int_mode imode
= (mode
== HFmode
8565 : int_mode_for_mode (mode
).require ());
8566 int ncost
= aarch64_internal_mov_immediate
8567 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
8568 *cost
+= COSTS_N_INSNS (ncost
);
8574 /* mov[df,sf]_aarch64. */
8575 if (aarch64_float_const_representable_p (x
))
8576 /* FMOV (scalar immediate). */
8577 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
8578 else if (!aarch64_float_const_zero_rtx_p (x
))
8580 /* This will be a load from memory. */
8582 *cost
+= extra_cost
->ldst
.loadd
;
8584 *cost
+= extra_cost
->ldst
.loadf
;
8587 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8588 or MOV v0.s[0], wzr - neither of which are modeled by the
8589 cost tables. Just use the default cost. */
8599 /* For loads we want the base cost of a load, plus an
8600 approximation for the additional cost of the addressing
8602 rtx address
= XEXP (x
, 0);
8603 if (VECTOR_MODE_P (mode
))
8604 *cost
+= extra_cost
->ldst
.loadv
;
8605 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8606 *cost
+= extra_cost
->ldst
.load
;
8607 else if (mode
== SFmode
)
8608 *cost
+= extra_cost
->ldst
.loadf
;
8609 else if (mode
== DFmode
)
8610 *cost
+= extra_cost
->ldst
.loadd
;
8613 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8622 if (VECTOR_MODE_P (mode
))
8627 *cost
+= extra_cost
->vect
.alu
;
8632 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8634 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8635 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8638 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
8642 /* Cost this as SUB wzr, X. */
8643 op0
= CONST0_RTX (mode
);
8648 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8650 /* Support (neg(fma...)) as a single instruction only if
8651 sign of zeros is unimportant. This matches the decision
8652 making in aarch64.md. */
8653 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
8656 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8659 if (GET_CODE (op0
) == MULT
)
8662 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8667 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8677 if (VECTOR_MODE_P (mode
))
8678 *cost
+= extra_cost
->vect
.alu
;
8680 *cost
+= extra_cost
->alu
.clz
;
8689 if (op1
== const0_rtx
8690 && GET_CODE (op0
) == AND
)
8693 mode
= GET_MODE (op0
);
8697 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
8699 /* TODO: A write to the CC flags possibly costs extra, this
8700 needs encoding in the cost tables. */
8702 mode
= GET_MODE (op0
);
8704 if (GET_CODE (op0
) == AND
)
8710 if (GET_CODE (op0
) == PLUS
)
8712 /* ADDS (and CMN alias). */
8717 if (GET_CODE (op0
) == MINUS
)
8724 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
8725 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
8726 && CONST_INT_P (XEXP (op0
, 2)))
8728 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8729 Handle it here directly rather than going to cost_logic
8730 since we know the immediate generated for the TST is valid
8731 so we can avoid creating an intermediate rtx for it only
8732 for costing purposes. */
8734 *cost
+= extra_cost
->alu
.logical
;
8736 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
8737 ZERO_EXTRACT
, 0, speed
);
8741 if (GET_CODE (op1
) == NEG
)
8745 *cost
+= extra_cost
->alu
.arith
;
8747 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
8748 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
8754 Compare can freely swap the order of operands, and
8755 canonicalization puts the more complex operation first.
8756 But the integer MINUS logic expects the shift/extend
8757 operation in op1. */
8759 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
8767 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
8771 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8773 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
8775 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
8776 /* FCMP supports constant 0.0 for no extra cost. */
8782 if (VECTOR_MODE_P (mode
))
8784 /* Vector compare. */
8786 *cost
+= extra_cost
->vect
.alu
;
8788 if (aarch64_float_const_zero_rtx_p (op1
))
8790 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8804 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
8806 /* Detect valid immediates. */
8807 if ((GET_MODE_CLASS (mode
) == MODE_INT
8808 || (GET_MODE_CLASS (mode
) == MODE_CC
8809 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
8810 && CONST_INT_P (op1
)
8811 && aarch64_uimm12_shift (INTVAL (op1
)))
8814 /* SUB(S) (immediate). */
8815 *cost
+= extra_cost
->alu
.arith
;
8819 /* Look for SUB (extended register). */
8820 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8821 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
8824 *cost
+= extra_cost
->alu
.extend_arith
;
8826 op1
= aarch64_strip_extend (op1
, true);
8827 *cost
+= rtx_cost (op1
, VOIDmode
,
8828 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
8832 rtx new_op1
= aarch64_strip_extend (op1
, false);
8834 /* Cost this as an FMA-alike operation. */
8835 if ((GET_CODE (new_op1
) == MULT
8836 || aarch64_shift_p (GET_CODE (new_op1
)))
8839 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
8840 (enum rtx_code
) code
,
8845 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
8849 if (VECTOR_MODE_P (mode
))
8852 *cost
+= extra_cost
->vect
.alu
;
8854 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8857 *cost
+= extra_cost
->alu
.arith
;
8859 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8862 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8876 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8877 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8880 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
8881 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8885 if (GET_MODE_CLASS (mode
) == MODE_INT
8886 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
8887 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
8889 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
8892 /* ADD (immediate). */
8893 *cost
+= extra_cost
->alu
.arith
;
8897 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8899 /* Look for ADD (extended register). */
8900 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8901 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
8904 *cost
+= extra_cost
->alu
.extend_arith
;
8906 op0
= aarch64_strip_extend (op0
, true);
8907 *cost
+= rtx_cost (op0
, VOIDmode
,
8908 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
8912 /* Strip any extend, leave shifts behind as we will
8913 cost them through mult_cost. */
8914 new_op0
= aarch64_strip_extend (op0
, false);
8916 if (GET_CODE (new_op0
) == MULT
8917 || aarch64_shift_p (GET_CODE (new_op0
)))
8919 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
8924 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
8928 if (VECTOR_MODE_P (mode
))
8931 *cost
+= extra_cost
->vect
.alu
;
8933 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8936 *cost
+= extra_cost
->alu
.arith
;
8938 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8941 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8948 *cost
= COSTS_N_INSNS (1);
8952 if (VECTOR_MODE_P (mode
))
8953 *cost
+= extra_cost
->vect
.alu
;
8955 *cost
+= extra_cost
->alu
.rev
;
8960 if (aarch_rev16_p (x
))
8962 *cost
= COSTS_N_INSNS (1);
8966 if (VECTOR_MODE_P (mode
))
8967 *cost
+= extra_cost
->vect
.alu
;
8969 *cost
+= extra_cost
->alu
.rev
;
8974 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
8976 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
8977 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
8979 *cost
+= extra_cost
->alu
.shift
;
8990 if (VECTOR_MODE_P (mode
))
8993 *cost
+= extra_cost
->vect
.alu
;
8998 && GET_CODE (op0
) == MULT
8999 && CONST_INT_P (XEXP (op0
, 1))
9000 && CONST_INT_P (op1
)
9001 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
9004 /* This is a UBFM/SBFM. */
9005 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
9007 *cost
+= extra_cost
->alu
.bfx
;
9011 if (is_int_mode (mode
, &int_mode
))
9013 if (CONST_INT_P (op1
))
9015 /* We have a mask + shift version of a UBFIZ
9016 i.e. the *andim_ashift<mode>_bfiz pattern. */
9017 if (GET_CODE (op0
) == ASHIFT
9018 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
9021 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
9022 (enum rtx_code
) code
, 0, speed
);
9024 *cost
+= extra_cost
->alu
.bfx
;
9028 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
9030 /* We possibly get the immediate for free, this is not
9032 *cost
+= rtx_cost (op0
, int_mode
,
9033 (enum rtx_code
) code
, 0, speed
);
9035 *cost
+= extra_cost
->alu
.logical
;
9044 /* Handle ORN, EON, or BIC. */
9045 if (GET_CODE (op0
) == NOT
)
9046 op0
= XEXP (op0
, 0);
9048 new_op0
= aarch64_strip_shift (op0
);
9050 /* If we had a shift on op0 then this is a logical-shift-
9051 by-register/immediate operation. Otherwise, this is just
9052 a logical operation. */
9057 /* Shift by immediate. */
9058 if (CONST_INT_P (XEXP (op0
, 1)))
9059 *cost
+= extra_cost
->alu
.log_shift
;
9061 *cost
+= extra_cost
->alu
.log_shift_reg
;
9064 *cost
+= extra_cost
->alu
.logical
;
9067 /* In both cases we want to cost both operands. */
9068 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
9070 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
9080 op0
= aarch64_strip_shift (x
);
9082 if (VECTOR_MODE_P (mode
))
9085 *cost
+= extra_cost
->vect
.alu
;
9089 /* MVN-shifted-reg. */
9092 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9095 *cost
+= extra_cost
->alu
.log_shift
;
9099 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9100 Handle the second form here taking care that 'a' in the above can
9102 else if (GET_CODE (op0
) == XOR
)
9104 rtx newop0
= XEXP (op0
, 0);
9105 rtx newop1
= XEXP (op0
, 1);
9106 rtx op0_stripped
= aarch64_strip_shift (newop0
);
9108 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
9109 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
9113 if (op0_stripped
!= newop0
)
9114 *cost
+= extra_cost
->alu
.log_shift
;
9116 *cost
+= extra_cost
->alu
.logical
;
9123 *cost
+= extra_cost
->alu
.logical
;
9130 /* If a value is written in SI mode, then zero extended to DI
9131 mode, the operation will in general be free as a write to
9132 a 'w' register implicitly zeroes the upper bits of an 'x'
9133 register. However, if this is
9135 (set (reg) (zero_extend (reg)))
9137 we must cost the explicit register move. */
9139 && GET_MODE (op0
) == SImode
9142 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
9144 /* If OP_COST is non-zero, then the cost of the zero extend
9145 is effectively the cost of the inner operation. Otherwise
9146 we have a MOV instruction and we take the cost from the MOV
9147 itself. This is true independently of whether we are
9148 optimizing for space or time. */
9154 else if (MEM_P (op0
))
9156 /* All loads can zero extend to any size for free. */
9157 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
9161 op0
= aarch64_extend_bitfield_pattern_p (x
);
9164 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
9166 *cost
+= extra_cost
->alu
.bfx
;
9172 if (VECTOR_MODE_P (mode
))
9175 *cost
+= extra_cost
->vect
.alu
;
9179 /* We generate an AND instead of UXTB/UXTH. */
9180 *cost
+= extra_cost
->alu
.logical
;
9186 if (MEM_P (XEXP (x
, 0)))
9191 rtx address
= XEXP (XEXP (x
, 0), 0);
9192 *cost
+= extra_cost
->ldst
.load_sign_extend
;
9195 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9201 op0
= aarch64_extend_bitfield_pattern_p (x
);
9204 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
9206 *cost
+= extra_cost
->alu
.bfx
;
9212 if (VECTOR_MODE_P (mode
))
9213 *cost
+= extra_cost
->vect
.alu
;
9215 *cost
+= extra_cost
->alu
.extend
;
9223 if (CONST_INT_P (op1
))
9227 if (VECTOR_MODE_P (mode
))
9229 /* Vector shift (immediate). */
9230 *cost
+= extra_cost
->vect
.alu
;
9234 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9236 *cost
+= extra_cost
->alu
.shift
;
9240 /* We can incorporate zero/sign extend for free. */
9241 if (GET_CODE (op0
) == ZERO_EXTEND
9242 || GET_CODE (op0
) == SIGN_EXTEND
)
9243 op0
= XEXP (op0
, 0);
9245 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
9250 if (VECTOR_MODE_P (mode
))
9253 /* Vector shift (register). */
9254 *cost
+= extra_cost
->vect
.alu
;
9260 *cost
+= extra_cost
->alu
.shift_reg
;
9262 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9263 && CONST_INT_P (XEXP (op1
, 1))
9264 && known_eq (INTVAL (XEXP (op1
, 1)),
9265 GET_MODE_BITSIZE (mode
) - 1))
9267 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9268 /* We already demanded XEXP (op1, 0) to be REG_P, so
9269 don't recurse into it. */
9273 return false; /* All arguments need to be in registers. */
9283 if (CONST_INT_P (op1
))
9285 /* ASR (immediate) and friends. */
9288 if (VECTOR_MODE_P (mode
))
9289 *cost
+= extra_cost
->vect
.alu
;
9291 *cost
+= extra_cost
->alu
.shift
;
9294 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9299 if (VECTOR_MODE_P (mode
))
9302 /* Vector shift (register). */
9303 *cost
+= extra_cost
->vect
.alu
;
9308 /* ASR (register) and friends. */
9309 *cost
+= extra_cost
->alu
.shift_reg
;
9311 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9312 && CONST_INT_P (XEXP (op1
, 1))
9313 && known_eq (INTVAL (XEXP (op1
, 1)),
9314 GET_MODE_BITSIZE (mode
) - 1))
9316 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9317 /* We already demanded XEXP (op1, 0) to be REG_P, so
9318 don't recurse into it. */
9322 return false; /* All arguments need to be in registers. */
9327 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
9328 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
9332 *cost
+= extra_cost
->ldst
.load
;
9334 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
9335 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
9337 /* ADRP, followed by ADD. */
9338 *cost
+= COSTS_N_INSNS (1);
9340 *cost
+= 2 * extra_cost
->alu
.arith
;
9342 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
9343 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
9347 *cost
+= extra_cost
->alu
.arith
;
9352 /* One extra load instruction, after accessing the GOT. */
9353 *cost
+= COSTS_N_INSNS (1);
9355 *cost
+= extra_cost
->ldst
.load
;
9361 /* ADRP/ADD (immediate). */
9363 *cost
+= extra_cost
->alu
.arith
;
9371 if (VECTOR_MODE_P (mode
))
9372 *cost
+= extra_cost
->vect
.alu
;
9374 *cost
+= extra_cost
->alu
.bfx
;
9377 /* We can trust that the immediates used will be correct (there
9378 are no by-register forms), so we need only cost op0. */
9379 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9383 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
9384 /* aarch64_rtx_mult_cost always handles recursion to its
9389 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9390 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9391 an unconditional negate. This case should only ever be reached through
9392 the set_smod_pow2_cheap check in expmed.c. */
9393 if (CONST_INT_P (XEXP (x
, 1))
9394 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
9395 && (mode
== SImode
|| mode
== DImode
))
9397 /* We expand to 4 instructions. Reset the baseline. */
9398 *cost
= COSTS_N_INSNS (4);
9401 *cost
+= 2 * extra_cost
->alu
.logical
9402 + 2 * extra_cost
->alu
.arith
;
9411 /* Slighly prefer UMOD over SMOD. */
9412 if (VECTOR_MODE_P (mode
))
9413 *cost
+= extra_cost
->vect
.alu
;
9414 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9415 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
9416 + extra_cost
->mult
[mode
== DImode
].idiv
9417 + (code
== MOD
? 1 : 0));
9419 return false; /* All arguments need to be in registers. */
9426 if (VECTOR_MODE_P (mode
))
9427 *cost
+= extra_cost
->vect
.alu
;
9428 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9429 /* There is no integer SQRT, so only DIV and UDIV can get
9431 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
9432 /* Slighly prefer UDIV over SDIV. */
9433 + (code
== DIV
? 1 : 0));
9435 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
9437 return false; /* All arguments need to be in registers. */
9440 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
9441 XEXP (x
, 2), cost
, speed
);
9454 return false; /* All arguments must be in registers. */
9463 if (VECTOR_MODE_P (mode
))
9464 *cost
+= extra_cost
->vect
.alu
;
9466 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9469 /* FMSUB, FNMADD, and FNMSUB are free. */
9470 if (GET_CODE (op0
) == NEG
)
9471 op0
= XEXP (op0
, 0);
9473 if (GET_CODE (op2
) == NEG
)
9474 op2
= XEXP (op2
, 0);
9476 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9477 and the by-element operand as operand 0. */
9478 if (GET_CODE (op1
) == NEG
)
9479 op1
= XEXP (op1
, 0);
9481 /* Catch vector-by-element operations. The by-element operand can
9482 either be (vec_duplicate (vec_select (x))) or just
9483 (vec_select (x)), depending on whether we are multiplying by
9484 a vector or a scalar.
9486 Canonicalization is not very good in these cases, FMA4 will put the
9487 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9488 if (GET_CODE (op0
) == VEC_DUPLICATE
)
9489 op0
= XEXP (op0
, 0);
9490 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
9491 op1
= XEXP (op1
, 0);
9493 if (GET_CODE (op0
) == VEC_SELECT
)
9494 op0
= XEXP (op0
, 0);
9495 else if (GET_CODE (op1
) == VEC_SELECT
)
9496 op1
= XEXP (op1
, 0);
9498 /* If the remaining parameters are not registers,
9499 get the cost to put them into registers. */
9500 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
9501 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
9502 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
9506 case UNSIGNED_FLOAT
:
9508 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
9514 if (VECTOR_MODE_P (mode
))
9516 /*Vector truncate. */
9517 *cost
+= extra_cost
->vect
.alu
;
9520 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
9524 case FLOAT_TRUNCATE
:
9527 if (VECTOR_MODE_P (mode
))
9529 /*Vector conversion. */
9530 *cost
+= extra_cost
->vect
.alu
;
9533 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
9540 /* Strip the rounding part. They will all be implemented
9541 by the fcvt* family of instructions anyway. */
9542 if (GET_CODE (x
) == UNSPEC
)
9544 unsigned int uns_code
= XINT (x
, 1);
9546 if (uns_code
== UNSPEC_FRINTA
9547 || uns_code
== UNSPEC_FRINTM
9548 || uns_code
== UNSPEC_FRINTN
9549 || uns_code
== UNSPEC_FRINTP
9550 || uns_code
== UNSPEC_FRINTZ
)
9551 x
= XVECEXP (x
, 0, 0);
9556 if (VECTOR_MODE_P (mode
))
9557 *cost
+= extra_cost
->vect
.alu
;
9559 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
9562 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9563 fixed-point fcvt. */
9564 if (GET_CODE (x
) == MULT
9565 && ((VECTOR_MODE_P (mode
)
9566 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
9567 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
9569 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
9574 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9578 if (VECTOR_MODE_P (mode
))
9582 *cost
+= extra_cost
->vect
.alu
;
9584 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9588 /* FABD, which is analogous to FADD. */
9589 if (GET_CODE (op0
) == MINUS
)
9591 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
9592 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
9594 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9598 /* Simple FABS is analogous to FNEG. */
9600 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9604 /* Integer ABS will either be split to
9605 two arithmetic instructions, or will be an ABS
9606 (scalar), which we don't model. */
9607 *cost
= COSTS_N_INSNS (2);
9609 *cost
+= 2 * extra_cost
->alu
.arith
;
9617 if (VECTOR_MODE_P (mode
))
9618 *cost
+= extra_cost
->vect
.alu
;
9621 /* FMAXNM/FMINNM/FMAX/FMIN.
9622 TODO: This may not be accurate for all implementations, but
9623 we do not model this in the cost tables. */
9624 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9630 /* The floating point round to integer frint* instructions. */
9631 if (aarch64_frint_unspec_p (XINT (x
, 1)))
9634 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
9639 if (XINT (x
, 1) == UNSPEC_RBIT
)
9642 *cost
+= extra_cost
->alu
.rev
;
9650 /* Decompose <su>muldi3_highpart. */
9651 if (/* (truncate:DI */
9654 && GET_MODE (XEXP (x
, 0)) == TImode
9655 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
9657 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
9658 /* (ANY_EXTEND:TI (reg:DI))
9659 (ANY_EXTEND:TI (reg:DI))) */
9660 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
9661 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
9662 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
9663 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
9664 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
9665 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
9666 /* (const_int 64) */
9667 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
9668 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
9672 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
9673 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
9674 mode
, MULT
, 0, speed
);
9675 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
9676 mode
, MULT
, 1, speed
);
9686 && flag_aarch64_verbose_cost
)
9688 "\nFailed to cost RTX. Assuming default cost.\n");
9693 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9694 calculated for X. This cost is stored in *COST. Returns true
9695 if the total cost of X was calculated. */
9697 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
9698 int param
, int *cost
, bool speed
)
9700 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
9703 && flag_aarch64_verbose_cost
)
9705 print_rtl_single (dump_file
, x
);
9706 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
9707 speed
? "Hot" : "Cold",
9708 *cost
, result
? "final" : "partial");
9715 aarch64_register_move_cost (machine_mode mode
,
9716 reg_class_t from_i
, reg_class_t to_i
)
9718 enum reg_class from
= (enum reg_class
) from_i
;
9719 enum reg_class to
= (enum reg_class
) to_i
;
9720 const struct cpu_regmove_cost
*regmove_cost
9721 = aarch64_tune_params
.regmove_cost
;
9723 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9724 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
9727 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
9728 from
= GENERAL_REGS
;
9730 /* Moving between GPR and stack cost is the same as GP2GP. */
9731 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
9732 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
9733 return regmove_cost
->GP2GP
;
9735 /* To/From the stack register, we move via the gprs. */
9736 if (to
== STACK_REG
|| from
== STACK_REG
)
9737 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
9738 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
9740 if (known_eq (GET_MODE_SIZE (mode
), 16))
9742 /* 128-bit operations on general registers require 2 instructions. */
9743 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
9744 return regmove_cost
->GP2GP
* 2;
9745 else if (from
== GENERAL_REGS
)
9746 return regmove_cost
->GP2FP
* 2;
9747 else if (to
== GENERAL_REGS
)
9748 return regmove_cost
->FP2GP
* 2;
9750 /* When AdvSIMD instructions are disabled it is not possible to move
9751 a 128-bit value directly between Q registers. This is handled in
9752 secondary reload. A general register is used as a scratch to move
9753 the upper DI value and the lower DI value is moved directly,
9754 hence the cost is the sum of three moves. */
9756 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
9758 return regmove_cost
->FP2FP
;
9761 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
9762 return regmove_cost
->GP2GP
;
9763 else if (from
== GENERAL_REGS
)
9764 return regmove_cost
->GP2FP
;
9765 else if (to
== GENERAL_REGS
)
9766 return regmove_cost
->FP2GP
;
9768 return regmove_cost
->FP2FP
;
9772 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
9773 reg_class_t rclass ATTRIBUTE_UNUSED
,
9774 bool in ATTRIBUTE_UNUSED
)
9776 return aarch64_tune_params
.memmov_cost
;
9779 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9780 to optimize 1.0/sqrt. */
9783 use_rsqrt_p (machine_mode mode
)
9785 return (!flag_trapping_math
9786 && flag_unsafe_math_optimizations
9787 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
9788 & AARCH64_APPROX_MODE (mode
))
9789 || flag_mrecip_low_precision_sqrt
));
9792 /* Function to decide when to use the approximate reciprocal square root
9796 aarch64_builtin_reciprocal (tree fndecl
)
9798 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
9800 if (!use_rsqrt_p (mode
))
9802 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
9805 typedef rtx (*rsqrte_type
) (rtx
, rtx
);
9807 /* Select reciprocal square root initial estimate insn depending on machine
9811 get_rsqrte_type (machine_mode mode
)
9815 case E_DFmode
: return gen_aarch64_rsqrtedf
;
9816 case E_SFmode
: return gen_aarch64_rsqrtesf
;
9817 case E_V2DFmode
: return gen_aarch64_rsqrtev2df
;
9818 case E_V2SFmode
: return gen_aarch64_rsqrtev2sf
;
9819 case E_V4SFmode
: return gen_aarch64_rsqrtev4sf
;
9820 default: gcc_unreachable ();
9824 typedef rtx (*rsqrts_type
) (rtx
, rtx
, rtx
);
9826 /* Select reciprocal square root series step insn depending on machine mode. */
9829 get_rsqrts_type (machine_mode mode
)
9833 case E_DFmode
: return gen_aarch64_rsqrtsdf
;
9834 case E_SFmode
: return gen_aarch64_rsqrtssf
;
9835 case E_V2DFmode
: return gen_aarch64_rsqrtsv2df
;
9836 case E_V2SFmode
: return gen_aarch64_rsqrtsv2sf
;
9837 case E_V4SFmode
: return gen_aarch64_rsqrtsv4sf
;
9838 default: gcc_unreachable ();
9842 /* Emit instruction sequence to compute either the approximate square root
9843 or its approximate reciprocal, depending on the flag RECP, and return
9844 whether the sequence was emitted or not. */
9847 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
9849 machine_mode mode
= GET_MODE (dst
);
9851 if (GET_MODE_INNER (mode
) == HFmode
)
9859 if (!(flag_mlow_precision_sqrt
9860 || (aarch64_tune_params
.approx_modes
->sqrt
9861 & AARCH64_APPROX_MODE (mode
))))
9864 if (flag_finite_math_only
9865 || flag_trapping_math
9866 || !flag_unsafe_math_optimizations
9867 || optimize_function_for_size_p (cfun
))
9871 /* Caller assumes we cannot fail. */
9872 gcc_assert (use_rsqrt_p (mode
));
9874 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
9875 rtx xmsk
= gen_reg_rtx (mmsk
);
9877 /* When calculating the approximate square root, compare the
9878 argument with 0.0 and create a mask. */
9879 emit_insn (gen_rtx_SET (xmsk
,
9881 gen_rtx_EQ (mmsk
, src
,
9882 CONST0_RTX (mode
)))));
9884 /* Estimate the approximate reciprocal square root. */
9885 rtx xdst
= gen_reg_rtx (mode
);
9886 emit_insn ((*get_rsqrte_type (mode
)) (xdst
, src
));
9888 /* Iterate over the series twice for SF and thrice for DF. */
9889 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
9891 /* Optionally iterate over the series once less for faster performance
9892 while sacrificing the accuracy. */
9893 if ((recp
&& flag_mrecip_low_precision_sqrt
)
9894 || (!recp
&& flag_mlow_precision_sqrt
))
9897 /* Iterate over the series to calculate the approximate reciprocal square
9899 rtx x1
= gen_reg_rtx (mode
);
9900 while (iterations
--)
9902 rtx x2
= gen_reg_rtx (mode
);
9903 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
9905 emit_insn ((*get_rsqrts_type (mode
)) (x1
, src
, x2
));
9908 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
9913 /* Qualify the approximate reciprocal square root when the argument is
9914 0.0 by squashing the intermediary result to 0.0. */
9915 rtx xtmp
= gen_reg_rtx (mmsk
);
9916 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
9917 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
9918 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
9920 /* Calculate the approximate square root. */
9921 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
9924 /* Finalize the approximation. */
9925 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
9930 typedef rtx (*recpe_type
) (rtx
, rtx
);
9932 /* Select reciprocal initial estimate insn depending on machine mode. */
9935 get_recpe_type (machine_mode mode
)
9939 case E_SFmode
: return (gen_aarch64_frecpesf
);
9940 case E_V2SFmode
: return (gen_aarch64_frecpev2sf
);
9941 case E_V4SFmode
: return (gen_aarch64_frecpev4sf
);
9942 case E_DFmode
: return (gen_aarch64_frecpedf
);
9943 case E_V2DFmode
: return (gen_aarch64_frecpev2df
);
9944 default: gcc_unreachable ();
9948 typedef rtx (*recps_type
) (rtx
, rtx
, rtx
);
9950 /* Select reciprocal series step insn depending on machine mode. */
9953 get_recps_type (machine_mode mode
)
9957 case E_SFmode
: return (gen_aarch64_frecpssf
);
9958 case E_V2SFmode
: return (gen_aarch64_frecpsv2sf
);
9959 case E_V4SFmode
: return (gen_aarch64_frecpsv4sf
);
9960 case E_DFmode
: return (gen_aarch64_frecpsdf
);
9961 case E_V2DFmode
: return (gen_aarch64_frecpsv2df
);
9962 default: gcc_unreachable ();
9966 /* Emit the instruction sequence to compute the approximation for the division
9967 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9970 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
9972 machine_mode mode
= GET_MODE (quo
);
9974 if (GET_MODE_INNER (mode
) == HFmode
)
9977 bool use_approx_division_p
= (flag_mlow_precision_div
9978 || (aarch64_tune_params
.approx_modes
->division
9979 & AARCH64_APPROX_MODE (mode
)));
9981 if (!flag_finite_math_only
9982 || flag_trapping_math
9983 || !flag_unsafe_math_optimizations
9984 || optimize_function_for_size_p (cfun
)
9985 || !use_approx_division_p
)
9988 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
9991 /* Estimate the approximate reciprocal. */
9992 rtx xrcp
= gen_reg_rtx (mode
);
9993 emit_insn ((*get_recpe_type (mode
)) (xrcp
, den
));
9995 /* Iterate over the series twice for SF and thrice for DF. */
9996 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
9998 /* Optionally iterate over the series once less for faster performance,
9999 while sacrificing the accuracy. */
10000 if (flag_mlow_precision_div
)
10003 /* Iterate over the series to calculate the approximate reciprocal. */
10004 rtx xtmp
= gen_reg_rtx (mode
);
10005 while (iterations
--)
10007 emit_insn ((*get_recps_type (mode
)) (xtmp
, xrcp
, den
));
10009 if (iterations
> 0)
10010 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10013 if (num
!= CONST1_RTX (mode
))
10015 /* As the approximate reciprocal of DEN is already calculated, only
10016 calculate the approximate division when NUM is not 1.0. */
10017 rtx xnum
= force_reg (mode
, num
);
10018 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
10021 /* Finalize the approximation. */
10022 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
10026 /* Return the number of instructions that can be issued per cycle. */
10028 aarch64_sched_issue_rate (void)
10030 return aarch64_tune_params
.issue_rate
;
10034 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10036 int issue_rate
= aarch64_sched_issue_rate ();
10038 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
10042 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10043 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10044 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10047 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
10050 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
10054 /* Vectorizer cost model target hooks. */
10056 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10058 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
10060 int misalign ATTRIBUTE_UNUSED
)
10063 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
10066 if (vectype
!= NULL
)
10067 fp
= FLOAT_TYPE_P (vectype
);
10069 switch (type_of_cost
)
10072 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
10075 return costs
->scalar_load_cost
;
10078 return costs
->scalar_store_cost
;
10081 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10084 return costs
->vec_align_load_cost
;
10087 return costs
->vec_store_cost
;
10089 case vec_to_scalar
:
10090 return costs
->vec_to_scalar_cost
;
10092 case scalar_to_vec
:
10093 return costs
->scalar_to_vec_cost
;
10095 case unaligned_load
:
10096 case vector_gather_load
:
10097 return costs
->vec_unalign_load_cost
;
10099 case unaligned_store
:
10100 case vector_scatter_store
:
10101 return costs
->vec_unalign_store_cost
;
10103 case cond_branch_taken
:
10104 return costs
->cond_taken_branch_cost
;
10106 case cond_branch_not_taken
:
10107 return costs
->cond_not_taken_branch_cost
;
10110 return costs
->vec_permute_cost
;
10112 case vec_promote_demote
:
10113 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10115 case vec_construct
:
10116 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
10117 return elements
/ 2 + 1;
10120 gcc_unreachable ();
10124 /* Implement targetm.vectorize.add_stmt_cost. */
10126 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
10127 struct _stmt_vec_info
*stmt_info
, int misalign
,
10128 enum vect_cost_model_location where
)
10130 unsigned *cost
= (unsigned *) data
;
10131 unsigned retval
= 0;
10133 if (flag_vect_cost_model
)
10135 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
10137 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
10139 /* Statements in an inner loop relative to the loop being
10140 vectorized are weighted more heavily. The value here is
10141 arbitrary and could potentially be improved with analysis. */
10142 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
10143 count
*= 50; /* FIXME */
10145 retval
= (unsigned) (count
* stmt_cost
);
10146 cost
[where
] += retval
;
10152 static void initialize_aarch64_code_model (struct gcc_options
*);
10154 /* Parse the TO_PARSE string and put the architecture struct that it
10155 selects into RES and the architectural features into ISA_FLAGS.
10156 Return an aarch64_parse_opt_result describing the parse result.
10157 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10159 static enum aarch64_parse_opt_result
10160 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
10161 unsigned long *isa_flags
)
10164 const struct processor
*arch
;
10165 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10168 strcpy (str
, to_parse
);
10170 ext
= strchr (str
, '+');
10175 len
= strlen (str
);
10178 return AARCH64_PARSE_MISSING_ARG
;
10181 /* Loop through the list of supported ARCHes to find a match. */
10182 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
10184 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
10186 unsigned long isa_temp
= arch
->flags
;
10190 /* TO_PARSE string contains at least one extension. */
10191 enum aarch64_parse_opt_result ext_res
10192 = aarch64_parse_extension (ext
, &isa_temp
);
10194 if (ext_res
!= AARCH64_PARSE_OK
)
10197 /* Extension parsing was successful. Confirm the result
10198 arch and ISA flags. */
10200 *isa_flags
= isa_temp
;
10201 return AARCH64_PARSE_OK
;
10205 /* ARCH name not found in list. */
10206 return AARCH64_PARSE_INVALID_ARG
;
10209 /* Parse the TO_PARSE string and put the result tuning in RES and the
10210 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10211 describing the parse result. If there is an error parsing, RES and
10212 ISA_FLAGS are left unchanged. */
10214 static enum aarch64_parse_opt_result
10215 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
10216 unsigned long *isa_flags
)
10219 const struct processor
*cpu
;
10220 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10223 strcpy (str
, to_parse
);
10225 ext
= strchr (str
, '+');
10230 len
= strlen (str
);
10233 return AARCH64_PARSE_MISSING_ARG
;
10236 /* Loop through the list of supported CPUs to find a match. */
10237 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10239 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
10241 unsigned long isa_temp
= cpu
->flags
;
10246 /* TO_PARSE string contains at least one extension. */
10247 enum aarch64_parse_opt_result ext_res
10248 = aarch64_parse_extension (ext
, &isa_temp
);
10250 if (ext_res
!= AARCH64_PARSE_OK
)
10253 /* Extension parsing was successfull. Confirm the result
10254 cpu and ISA flags. */
10256 *isa_flags
= isa_temp
;
10257 return AARCH64_PARSE_OK
;
10261 /* CPU name not found in list. */
10262 return AARCH64_PARSE_INVALID_ARG
;
10265 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10266 Return an aarch64_parse_opt_result describing the parse result.
10267 If the parsing fails the RES does not change. */
10269 static enum aarch64_parse_opt_result
10270 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
10272 const struct processor
*cpu
;
10273 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10275 strcpy (str
, to_parse
);
10277 /* Loop through the list of supported CPUs to find a match. */
10278 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10280 if (strcmp (cpu
->name
, str
) == 0)
10283 return AARCH64_PARSE_OK
;
10287 /* CPU name not found in list. */
10288 return AARCH64_PARSE_INVALID_ARG
;
10291 /* Parse TOKEN, which has length LENGTH to see if it is an option
10292 described in FLAG. If it is, return the index bit for that fusion type.
10293 If not, error (printing OPTION_NAME) and return zero. */
10295 static unsigned int
10296 aarch64_parse_one_option_token (const char *token
,
10298 const struct aarch64_flag_desc
*flag
,
10299 const char *option_name
)
10301 for (; flag
->name
!= NULL
; flag
++)
10303 if (length
== strlen (flag
->name
)
10304 && !strncmp (flag
->name
, token
, length
))
10308 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
10312 /* Parse OPTION which is a comma-separated list of flags to enable.
10313 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10314 default state we inherit from the CPU tuning structures. OPTION_NAME
10315 gives the top-level option we are parsing in the -moverride string,
10316 for use in error messages. */
10318 static unsigned int
10319 aarch64_parse_boolean_options (const char *option
,
10320 const struct aarch64_flag_desc
*flags
,
10321 unsigned int initial_state
,
10322 const char *option_name
)
10324 const char separator
= '.';
10325 const char* specs
= option
;
10326 const char* ntoken
= option
;
10327 unsigned int found_flags
= initial_state
;
10329 while ((ntoken
= strchr (specs
, separator
)))
10331 size_t token_length
= ntoken
- specs
;
10332 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10336 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10337 in the token stream, reset the supported operations. So:
10339 adrp+add.cmp+branch.none.adrp+add
10341 would have the result of turning on only adrp+add fusion. */
10345 found_flags
|= token_ops
;
10349 /* We ended with a comma, print something. */
10352 error ("%s string ill-formed\n", option_name
);
10356 /* We still have one more token to parse. */
10357 size_t token_length
= strlen (specs
);
10358 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10365 found_flags
|= token_ops
;
10366 return found_flags
;
10369 /* Support for overriding instruction fusion. */
10372 aarch64_parse_fuse_string (const char *fuse_string
,
10373 struct tune_params
*tune
)
10375 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
10376 aarch64_fusible_pairs
,
10381 /* Support for overriding other tuning flags. */
10384 aarch64_parse_tune_string (const char *tune_string
,
10385 struct tune_params
*tune
)
10387 tune
->extra_tuning_flags
10388 = aarch64_parse_boolean_options (tune_string
,
10389 aarch64_tuning_flags
,
10390 tune
->extra_tuning_flags
,
10394 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10395 we understand. If it is, extract the option string and handoff to
10396 the appropriate function. */
10399 aarch64_parse_one_override_token (const char* token
,
10401 struct tune_params
*tune
)
10403 const struct aarch64_tuning_override_function
*fn
10404 = aarch64_tuning_override_functions
;
10406 const char *option_part
= strchr (token
, '=');
10409 error ("tuning string missing in option (%s)", token
);
10413 /* Get the length of the option name. */
10414 length
= option_part
- token
;
10415 /* Skip the '=' to get to the option string. */
10418 for (; fn
->name
!= NULL
; fn
++)
10420 if (!strncmp (fn
->name
, token
, length
))
10422 fn
->parse_override (option_part
, tune
);
10427 error ("unknown tuning option (%s)",token
);
10431 /* A checking mechanism for the implementation of the tls size. */
10434 initialize_aarch64_tls_size (struct gcc_options
*opts
)
10436 if (aarch64_tls_size
== 0)
10437 aarch64_tls_size
= 24;
10439 switch (opts
->x_aarch64_cmodel_var
)
10441 case AARCH64_CMODEL_TINY
:
10442 /* Both the default and maximum TLS size allowed under tiny is 1M which
10443 needs two instructions to address, so we clamp the size to 24. */
10444 if (aarch64_tls_size
> 24)
10445 aarch64_tls_size
= 24;
10447 case AARCH64_CMODEL_SMALL
:
10448 /* The maximum TLS size allowed under small is 4G. */
10449 if (aarch64_tls_size
> 32)
10450 aarch64_tls_size
= 32;
10452 case AARCH64_CMODEL_LARGE
:
10453 /* The maximum TLS size allowed under large is 16E.
10454 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10455 if (aarch64_tls_size
> 48)
10456 aarch64_tls_size
= 48;
10459 gcc_unreachable ();
10465 /* Parse STRING looking for options in the format:
10466 string :: option:string
10467 option :: name=substring
10469 substring :: defined by option. */
10472 aarch64_parse_override_string (const char* input_string
,
10473 struct tune_params
* tune
)
10475 const char separator
= ':';
10476 size_t string_length
= strlen (input_string
) + 1;
10477 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
10478 char *string
= string_root
;
10479 strncpy (string
, input_string
, string_length
);
10480 string
[string_length
- 1] = '\0';
10482 char* ntoken
= string
;
10484 while ((ntoken
= strchr (string
, separator
)))
10486 size_t token_length
= ntoken
- string
;
10487 /* Make this substring look like a string. */
10489 aarch64_parse_one_override_token (string
, token_length
, tune
);
10493 /* One last option to parse. */
10494 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
10495 free (string_root
);
10500 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
10502 /* PR 70044: We have to be careful about being called multiple times for the
10503 same function. This means all changes should be repeatable. */
10505 /* If the frame pointer is enabled, set it to a special value that behaves
10506 similar to frame pointer omission. If we don't do this all leaf functions
10507 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10508 If flag_omit_frame_pointer has this special value, we must force the
10509 frame pointer if not in a leaf function. We also need to force it in a
10510 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10511 if (opts
->x_flag_omit_frame_pointer
== 0)
10512 opts
->x_flag_omit_frame_pointer
= 2;
10514 /* If not optimizing for size, set the default
10515 alignment to what the target wants. */
10516 if (!opts
->x_optimize_size
)
10518 if (opts
->x_align_loops
<= 0)
10519 opts
->x_align_loops
= aarch64_tune_params
.loop_align
;
10520 if (opts
->x_align_jumps
<= 0)
10521 opts
->x_align_jumps
= aarch64_tune_params
.jump_align
;
10522 if (opts
->x_align_functions
<= 0)
10523 opts
->x_align_functions
= aarch64_tune_params
.function_align
;
10526 /* We default to no pc-relative literal loads. */
10528 aarch64_pcrelative_literal_loads
= false;
10530 /* If -mpc-relative-literal-loads is set on the command line, this
10531 implies that the user asked for PC relative literal loads. */
10532 if (opts
->x_pcrelative_literal_loads
== 1)
10533 aarch64_pcrelative_literal_loads
= true;
10535 /* In the tiny memory model it makes no sense to disallow PC relative
10536 literal pool loads. */
10537 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10538 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10539 aarch64_pcrelative_literal_loads
= true;
10541 /* When enabling the lower precision Newton series for the square root, also
10542 enable it for the reciprocal square root, since the latter is an
10543 intermediary step for the former. */
10544 if (flag_mlow_precision_sqrt
)
10545 flag_mrecip_low_precision_sqrt
= true;
10548 /* 'Unpack' up the internal tuning structs and update the options
10549 in OPTS. The caller must have set up selected_tune and selected_arch
10550 as all the other target-specific codegen decisions are
10551 derived from them. */
10554 aarch64_override_options_internal (struct gcc_options
*opts
)
10556 aarch64_tune_flags
= selected_tune
->flags
;
10557 aarch64_tune
= selected_tune
->sched_core
;
10558 /* Make a copy of the tuning parameters attached to the core, which
10559 we may later overwrite. */
10560 aarch64_tune_params
= *(selected_tune
->tune
);
10561 aarch64_architecture_version
= selected_arch
->architecture_version
;
10563 if (opts
->x_aarch64_override_tune_string
)
10564 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
10565 &aarch64_tune_params
);
10567 /* This target defaults to strict volatile bitfields. */
10568 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
10569 opts
->x_flag_strict_volatile_bitfields
= 1;
10571 initialize_aarch64_code_model (opts
);
10572 initialize_aarch64_tls_size (opts
);
10574 int queue_depth
= 0;
10575 switch (aarch64_tune_params
.autoprefetcher_model
)
10577 case tune_params::AUTOPREFETCHER_OFF
:
10580 case tune_params::AUTOPREFETCHER_WEAK
:
10583 case tune_params::AUTOPREFETCHER_STRONG
:
10584 queue_depth
= max_insn_queue_index
+ 1;
10587 gcc_unreachable ();
10590 /* We don't mind passing in global_options_set here as we don't use
10591 the *options_set structs anyway. */
10592 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
10594 opts
->x_param_values
,
10595 global_options_set
.x_param_values
);
10597 /* Set up parameters to be used in prefetching algorithm. Do not
10598 override the defaults unless we are tuning for a core we have
10599 researched values for. */
10600 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
10601 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
10602 aarch64_tune_params
.prefetch
->num_slots
,
10603 opts
->x_param_values
,
10604 global_options_set
.x_param_values
);
10605 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
10606 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
10607 aarch64_tune_params
.prefetch
->l1_cache_size
,
10608 opts
->x_param_values
,
10609 global_options_set
.x_param_values
);
10610 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
10611 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
10612 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
10613 opts
->x_param_values
,
10614 global_options_set
.x_param_values
);
10615 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
10616 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
10617 aarch64_tune_params
.prefetch
->l2_cache_size
,
10618 opts
->x_param_values
,
10619 global_options_set
.x_param_values
);
10621 /* Use the alternative scheduling-pressure algorithm by default. */
10622 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
10623 opts
->x_param_values
,
10624 global_options_set
.x_param_values
);
10626 /* Enable sw prefetching at specified optimization level for
10627 CPUS that have prefetch. Lower optimization level threshold by 1
10628 when profiling is enabled. */
10629 if (opts
->x_flag_prefetch_loop_arrays
< 0
10630 && !opts
->x_optimize_size
10631 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
10632 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
10633 opts
->x_flag_prefetch_loop_arrays
= 1;
10635 aarch64_override_options_after_change_1 (opts
);
10638 /* Print a hint with a suggestion for a core or architecture name that
10639 most closely resembles what the user passed in STR. ARCH is true if
10640 the user is asking for an architecture name. ARCH is false if the user
10641 is asking for a core name. */
10644 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
10646 auto_vec
<const char *> candidates
;
10647 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
10648 for (; entry
->name
!= NULL
; entry
++)
10649 candidates
.safe_push (entry
->name
);
10651 #ifdef HAVE_LOCAL_CPU_DETECT
10652 /* Add also "native" as possible value. */
10654 candidates
.safe_push ("native");
10658 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
10660 inform (input_location
, "valid arguments are: %s;"
10661 " did you mean %qs?", s
, hint
);
10663 inform (input_location
, "valid arguments are: %s", s
);
10668 /* Print a hint with a suggestion for a core name that most closely resembles
10669 what the user passed in STR. */
10672 aarch64_print_hint_for_core (const char *str
)
10674 aarch64_print_hint_for_core_or_arch (str
, false);
10677 /* Print a hint with a suggestion for an architecture name that most closely
10678 resembles what the user passed in STR. */
10681 aarch64_print_hint_for_arch (const char *str
)
10683 aarch64_print_hint_for_core_or_arch (str
, true);
10686 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10687 specified in STR and throw errors if appropriate. Put the results if
10688 they are valid in RES and ISA_FLAGS. Return whether the option is
10692 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
10693 unsigned long *isa_flags
)
10695 enum aarch64_parse_opt_result parse_res
10696 = aarch64_parse_cpu (str
, res
, isa_flags
);
10698 if (parse_res
== AARCH64_PARSE_OK
)
10703 case AARCH64_PARSE_MISSING_ARG
:
10704 error ("missing cpu name in %<-mcpu=%s%>", str
);
10706 case AARCH64_PARSE_INVALID_ARG
:
10707 error ("unknown value %qs for -mcpu", str
);
10708 aarch64_print_hint_for_core (str
);
10710 case AARCH64_PARSE_INVALID_FEATURE
:
10711 error ("invalid feature modifier in %<-mcpu=%s%>", str
);
10714 gcc_unreachable ();
10720 /* Validate a command-line -march option. Parse the arch and extensions
10721 (if any) specified in STR and throw errors if appropriate. Put the
10722 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10723 option is valid. */
10726 aarch64_validate_march (const char *str
, const struct processor
**res
,
10727 unsigned long *isa_flags
)
10729 enum aarch64_parse_opt_result parse_res
10730 = aarch64_parse_arch (str
, res
, isa_flags
);
10732 if (parse_res
== AARCH64_PARSE_OK
)
10737 case AARCH64_PARSE_MISSING_ARG
:
10738 error ("missing arch name in %<-march=%s%>", str
);
10740 case AARCH64_PARSE_INVALID_ARG
:
10741 error ("unknown value %qs for -march", str
);
10742 aarch64_print_hint_for_arch (str
);
10744 case AARCH64_PARSE_INVALID_FEATURE
:
10745 error ("invalid feature modifier in %<-march=%s%>", str
);
10748 gcc_unreachable ();
10754 /* Validate a command-line -mtune option. Parse the cpu
10755 specified in STR and throw errors if appropriate. Put the
10756 result, if it is valid, in RES. Return whether the option is
10760 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
10762 enum aarch64_parse_opt_result parse_res
10763 = aarch64_parse_tune (str
, res
);
10765 if (parse_res
== AARCH64_PARSE_OK
)
10770 case AARCH64_PARSE_MISSING_ARG
:
10771 error ("missing cpu name in %<-mtune=%s%>", str
);
10773 case AARCH64_PARSE_INVALID_ARG
:
10774 error ("unknown value %qs for -mtune", str
);
10775 aarch64_print_hint_for_core (str
);
10778 gcc_unreachable ();
10783 /* Return the CPU corresponding to the enum CPU.
10784 If it doesn't specify a cpu, return the default. */
10786 static const struct processor
*
10787 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
10789 if (cpu
!= aarch64_none
)
10790 return &all_cores
[cpu
];
10792 /* The & 0x3f is to extract the bottom 6 bits that encode the
10793 default cpu as selected by the --with-cpu GCC configure option
10795 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10796 flags mechanism should be reworked to make it more sane. */
10797 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
10800 /* Return the architecture corresponding to the enum ARCH.
10801 If it doesn't specify a valid architecture, return the default. */
10803 static const struct processor
*
10804 aarch64_get_arch (enum aarch64_arch arch
)
10806 if (arch
!= aarch64_no_arch
)
10807 return &all_architectures
[arch
];
10809 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
10811 return &all_architectures
[cpu
->arch
];
10814 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10817 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
10819 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10820 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10821 deciding which .md file patterns to use and when deciding whether
10822 something is a legitimate address or constant. */
10823 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
10824 return poly_uint16 (2, 2);
10826 return (int) value
/ 64;
10829 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10830 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10831 tuning structs. In particular it must set selected_tune and
10832 aarch64_isa_flags that define the available ISA features and tuning
10833 decisions. It must also set selected_arch as this will be used to
10834 output the .arch asm tags for each function. */
10837 aarch64_override_options (void)
10839 unsigned long cpu_isa
= 0;
10840 unsigned long arch_isa
= 0;
10841 aarch64_isa_flags
= 0;
10843 bool valid_cpu
= true;
10844 bool valid_tune
= true;
10845 bool valid_arch
= true;
10847 selected_cpu
= NULL
;
10848 selected_arch
= NULL
;
10849 selected_tune
= NULL
;
10851 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10852 If either of -march or -mtune is given, they override their
10853 respective component of -mcpu. */
10854 if (aarch64_cpu_string
)
10855 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
10858 if (aarch64_arch_string
)
10859 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
10862 if (aarch64_tune_string
)
10863 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
10865 /* If the user did not specify a processor, choose the default
10866 one for them. This will be the CPU set during configuration using
10867 --with-cpu, otherwise it is "generic". */
10872 selected_cpu
= &all_cores
[selected_arch
->ident
];
10873 aarch64_isa_flags
= arch_isa
;
10874 explicit_arch
= selected_arch
->arch
;
10878 /* Get default configure-time CPU. */
10879 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
10880 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
10884 explicit_tune_core
= selected_tune
->ident
;
10886 /* If both -mcpu and -march are specified check that they are architecturally
10887 compatible, warn if they're not and prefer the -march ISA flags. */
10888 else if (selected_arch
)
10890 if (selected_arch
->arch
!= selected_cpu
->arch
)
10892 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10893 all_architectures
[selected_cpu
->arch
].name
,
10894 selected_arch
->name
);
10896 aarch64_isa_flags
= arch_isa
;
10897 explicit_arch
= selected_arch
->arch
;
10898 explicit_tune_core
= selected_tune
? selected_tune
->ident
10899 : selected_cpu
->ident
;
10903 /* -mcpu but no -march. */
10904 aarch64_isa_flags
= cpu_isa
;
10905 explicit_tune_core
= selected_tune
? selected_tune
->ident
10906 : selected_cpu
->ident
;
10907 gcc_assert (selected_cpu
);
10908 selected_arch
= &all_architectures
[selected_cpu
->arch
];
10909 explicit_arch
= selected_arch
->arch
;
10912 /* Set the arch as well as we will need it when outputing
10913 the .arch directive in assembly. */
10914 if (!selected_arch
)
10916 gcc_assert (selected_cpu
);
10917 selected_arch
= &all_architectures
[selected_cpu
->arch
];
10920 if (!selected_tune
)
10921 selected_tune
= selected_cpu
;
10923 #ifndef HAVE_AS_MABI_OPTION
10924 /* The compiler may have been configured with 2.23.* binutils, which does
10925 not have support for ILP32. */
10927 error ("assembler does not support -mabi=ilp32");
10930 /* Convert -msve-vector-bits to a VG count. */
10931 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
10933 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
10934 sorry ("return address signing is only supported for -mabi=lp64");
10936 /* Make sure we properly set up the explicit options. */
10937 if ((aarch64_cpu_string
&& valid_cpu
)
10938 || (aarch64_tune_string
&& valid_tune
))
10939 gcc_assert (explicit_tune_core
!= aarch64_none
);
10941 if ((aarch64_cpu_string
&& valid_cpu
)
10942 || (aarch64_arch_string
&& valid_arch
))
10943 gcc_assert (explicit_arch
!= aarch64_no_arch
);
10945 aarch64_override_options_internal (&global_options
);
10947 /* Save these options as the default ones in case we push and pop them later
10948 while processing functions with potential target attributes. */
10949 target_option_default_node
= target_option_current_node
10950 = build_target_option_node (&global_options
);
10953 /* Implement targetm.override_options_after_change. */
10956 aarch64_override_options_after_change (void)
10958 aarch64_override_options_after_change_1 (&global_options
);
10961 static struct machine_function
*
10962 aarch64_init_machine_status (void)
10964 struct machine_function
*machine
;
10965 machine
= ggc_cleared_alloc
<machine_function
> ();
10970 aarch64_init_expanders (void)
10972 init_machine_status
= aarch64_init_machine_status
;
10975 /* A checking mechanism for the implementation of the various code models. */
10977 initialize_aarch64_code_model (struct gcc_options
*opts
)
10979 if (opts
->x_flag_pic
)
10981 switch (opts
->x_aarch64_cmodel_var
)
10983 case AARCH64_CMODEL_TINY
:
10984 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
10986 case AARCH64_CMODEL_SMALL
:
10987 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10988 aarch64_cmodel
= (flag_pic
== 2
10989 ? AARCH64_CMODEL_SMALL_PIC
10990 : AARCH64_CMODEL_SMALL_SPIC
);
10992 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
10995 case AARCH64_CMODEL_LARGE
:
10996 sorry ("code model %qs with -f%s", "large",
10997 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
11000 gcc_unreachable ();
11004 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
11007 /* Implement TARGET_OPTION_SAVE. */
11010 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
11012 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
11015 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11016 using the information saved in PTR. */
11019 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
11021 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
11022 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
11023 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
11024 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
11025 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
11027 aarch64_override_options_internal (opts
);
11030 /* Implement TARGET_OPTION_PRINT. */
11033 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
11035 const struct processor
*cpu
11036 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
11037 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
11038 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
11039 std::string extension
11040 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
11042 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
11043 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
11044 arch
->name
, extension
.c_str ());
11047 static GTY(()) tree aarch64_previous_fndecl
;
11050 aarch64_reset_previous_fndecl (void)
11052 aarch64_previous_fndecl
= NULL
;
11055 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11056 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11057 make sure optab availability predicates are recomputed when necessary. */
11060 aarch64_save_restore_target_globals (tree new_tree
)
11062 if (TREE_TARGET_GLOBALS (new_tree
))
11063 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
11064 else if (new_tree
== target_option_default_node
)
11065 restore_target_globals (&default_target_globals
);
11067 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
11070 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11071 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11072 of the function, if such exists. This function may be called multiple
11073 times on a single function so use aarch64_previous_fndecl to avoid
11074 setting up identical state. */
11077 aarch64_set_current_function (tree fndecl
)
11079 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
11082 tree old_tree
= (aarch64_previous_fndecl
11083 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
11086 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11088 /* If current function has no attributes but the previous one did,
11089 use the default node. */
11090 if (!new_tree
&& old_tree
)
11091 new_tree
= target_option_default_node
;
11093 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11094 the default have been handled by aarch64_save_restore_target_globals from
11095 aarch64_pragma_target_parse. */
11096 if (old_tree
== new_tree
)
11099 aarch64_previous_fndecl
= fndecl
;
11101 /* First set the target options. */
11102 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
11104 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
11118 /* All the information needed to handle a target attribute.
11119 NAME is the name of the attribute.
11120 ATTR_TYPE specifies the type of behavior of the attribute as described
11121 in the definition of enum aarch64_attr_opt_type.
11122 ALLOW_NEG is true if the attribute supports a "no-" form.
11123 HANDLER is the function that takes the attribute string as an argument
11124 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11125 OPT_NUM is the enum specifying the option that the attribute modifies.
11126 This is needed for attributes that mirror the behavior of a command-line
11127 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11128 aarch64_attr_enum. */
11130 struct aarch64_attribute_info
11133 enum aarch64_attr_opt_type attr_type
;
11135 bool (*handler
) (const char *);
11136 enum opt_code opt_num
;
11139 /* Handle the ARCH_STR argument to the arch= target attribute. */
11142 aarch64_handle_attr_arch (const char *str
)
11144 const struct processor
*tmp_arch
= NULL
;
11145 enum aarch64_parse_opt_result parse_res
11146 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
11148 if (parse_res
== AARCH64_PARSE_OK
)
11150 gcc_assert (tmp_arch
);
11151 selected_arch
= tmp_arch
;
11152 explicit_arch
= selected_arch
->arch
;
11158 case AARCH64_PARSE_MISSING_ARG
:
11159 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11161 case AARCH64_PARSE_INVALID_ARG
:
11162 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
11163 aarch64_print_hint_for_arch (str
);
11165 case AARCH64_PARSE_INVALID_FEATURE
:
11166 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11169 gcc_unreachable ();
11175 /* Handle the argument CPU_STR to the cpu= target attribute. */
11178 aarch64_handle_attr_cpu (const char *str
)
11180 const struct processor
*tmp_cpu
= NULL
;
11181 enum aarch64_parse_opt_result parse_res
11182 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
11184 if (parse_res
== AARCH64_PARSE_OK
)
11186 gcc_assert (tmp_cpu
);
11187 selected_tune
= tmp_cpu
;
11188 explicit_tune_core
= selected_tune
->ident
;
11190 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
11191 explicit_arch
= selected_arch
->arch
;
11197 case AARCH64_PARSE_MISSING_ARG
:
11198 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11200 case AARCH64_PARSE_INVALID_ARG
:
11201 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
11202 aarch64_print_hint_for_core (str
);
11204 case AARCH64_PARSE_INVALID_FEATURE
:
11205 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11208 gcc_unreachable ();
11214 /* Handle the argument STR to the tune= target attribute. */
11217 aarch64_handle_attr_tune (const char *str
)
11219 const struct processor
*tmp_tune
= NULL
;
11220 enum aarch64_parse_opt_result parse_res
11221 = aarch64_parse_tune (str
, &tmp_tune
);
11223 if (parse_res
== AARCH64_PARSE_OK
)
11225 gcc_assert (tmp_tune
);
11226 selected_tune
= tmp_tune
;
11227 explicit_tune_core
= selected_tune
->ident
;
11233 case AARCH64_PARSE_INVALID_ARG
:
11234 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
11235 aarch64_print_hint_for_core (str
);
11238 gcc_unreachable ();
11244 /* Parse an architecture extensions target attribute string specified in STR.
11245 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11246 if successful. Update aarch64_isa_flags to reflect the ISA features
11250 aarch64_handle_attr_isa_flags (char *str
)
11252 enum aarch64_parse_opt_result parse_res
;
11253 unsigned long isa_flags
= aarch64_isa_flags
;
11255 /* We allow "+nothing" in the beginning to clear out all architectural
11256 features if the user wants to handpick specific features. */
11257 if (strncmp ("+nothing", str
, 8) == 0)
11263 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
11265 if (parse_res
== AARCH64_PARSE_OK
)
11267 aarch64_isa_flags
= isa_flags
;
11273 case AARCH64_PARSE_MISSING_ARG
:
11274 error ("missing value in %<target()%> pragma or attribute");
11277 case AARCH64_PARSE_INVALID_FEATURE
:
11278 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11282 gcc_unreachable ();
11288 /* The target attributes that we support. On top of these we also support just
11289 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11290 handled explicitly in aarch64_process_one_target_attr. */
11292 static const struct aarch64_attribute_info aarch64_attributes
[] =
11294 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
11295 OPT_mgeneral_regs_only
},
11296 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
11297 OPT_mfix_cortex_a53_835769
},
11298 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
11299 OPT_mfix_cortex_a53_843419
},
11300 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
11301 { "strict-align", aarch64_attr_mask
, false, NULL
, OPT_mstrict_align
},
11302 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
11303 OPT_momit_leaf_frame_pointer
},
11304 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
11305 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
11307 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
11308 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
11310 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
11311 OPT_msign_return_address_
},
11312 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
11315 /* Parse ARG_STR which contains the definition of one target attribute.
11316 Show appropriate errors if any or return true if the attribute is valid. */
11319 aarch64_process_one_target_attr (char *arg_str
)
11321 bool invert
= false;
11323 size_t len
= strlen (arg_str
);
11327 error ("malformed %<target()%> pragma or attribute");
11331 char *str_to_check
= (char *) alloca (len
+ 1);
11332 strcpy (str_to_check
, arg_str
);
11334 /* Skip leading whitespace. */
11335 while (*str_to_check
== ' ' || *str_to_check
== '\t')
11338 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11339 It is easier to detect and handle it explicitly here rather than going
11340 through the machinery for the rest of the target attributes in this
11342 if (*str_to_check
== '+')
11343 return aarch64_handle_attr_isa_flags (str_to_check
);
11345 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
11350 char *arg
= strchr (str_to_check
, '=');
11352 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11353 and point ARG to "foo". */
11359 const struct aarch64_attribute_info
*p_attr
;
11360 bool found
= false;
11361 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
11363 /* If the names don't match up, or the user has given an argument
11364 to an attribute that doesn't accept one, or didn't give an argument
11365 to an attribute that expects one, fail to match. */
11366 if (strcmp (str_to_check
, p_attr
->name
) != 0)
11370 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
11371 || p_attr
->attr_type
== aarch64_attr_enum
;
11373 if (attr_need_arg_p
^ (arg
!= NULL
))
11375 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
11379 /* If the name matches but the attribute does not allow "no-" versions
11380 then we can't match. */
11381 if (invert
&& !p_attr
->allow_neg
)
11383 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
11387 switch (p_attr
->attr_type
)
11389 /* Has a custom handler registered.
11390 For example, cpu=, arch=, tune=. */
11391 case aarch64_attr_custom
:
11392 gcc_assert (p_attr
->handler
);
11393 if (!p_attr
->handler (arg
))
11397 /* Either set or unset a boolean option. */
11398 case aarch64_attr_bool
:
11400 struct cl_decoded_option decoded
;
11402 generate_option (p_attr
->opt_num
, NULL
, !invert
,
11403 CL_TARGET
, &decoded
);
11404 aarch64_handle_option (&global_options
, &global_options_set
,
11405 &decoded
, input_location
);
11408 /* Set or unset a bit in the target_flags. aarch64_handle_option
11409 should know what mask to apply given the option number. */
11410 case aarch64_attr_mask
:
11412 struct cl_decoded_option decoded
;
11413 /* We only need to specify the option number.
11414 aarch64_handle_option will know which mask to apply. */
11415 decoded
.opt_index
= p_attr
->opt_num
;
11416 decoded
.value
= !invert
;
11417 aarch64_handle_option (&global_options
, &global_options_set
,
11418 &decoded
, input_location
);
11421 /* Use the option setting machinery to set an option to an enum. */
11422 case aarch64_attr_enum
:
11427 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
11428 &value
, CL_TARGET
);
11431 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
11432 NULL
, DK_UNSPECIFIED
, input_location
,
11437 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
11442 gcc_unreachable ();
11446 /* If we reached here we either have found an attribute and validated
11447 it or didn't match any. If we matched an attribute but its arguments
11448 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  for (; *str != '\0'; str++)
    if (*str == c)
      res++;

  return res;
}
11470 /* Parse the tree in ARGS that contains the target attribute information
11471 and update the global target options space. */
11474 aarch64_process_target_attr (tree args
)
11476 if (TREE_CODE (args
) == TREE_LIST
)
11480 tree head
= TREE_VALUE (args
);
11483 if (!aarch64_process_target_attr (head
))
11486 args
= TREE_CHAIN (args
);
11492 if (TREE_CODE (args
) != STRING_CST
)
11494 error ("attribute %<target%> argument not a string");
11498 size_t len
= strlen (TREE_STRING_POINTER (args
));
11499 char *str_to_check
= (char *) alloca (len
+ 1);
11500 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
11504 error ("malformed %<target()%> pragma or attribute");
11508 /* Used to catch empty spaces between commas i.e.
11509 attribute ((target ("attr1,,attr2"))). */
11510 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
11512 /* Handle multiple target attributes separated by ','. */
11513 char *token
= strtok (str_to_check
, ",");
11515 unsigned int num_attrs
= 0;
11519 if (!aarch64_process_one_target_attr (token
))
11521 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
11525 token
= strtok (NULL
, ",");
11528 if (num_attrs
!= num_commas
+ 1)
11530 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
11537 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11538 process attribute ((target ("..."))). */
11541 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
11543 struct cl_target_option cur_target
;
11546 tree new_target
, new_optimize
;
11547 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11549 /* If what we're processing is the current pragma string then the
11550 target option node is already stored in target_option_current_node
11551 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11552 having to re-parse the string. This is especially useful to keep
11553 arm_neon.h compile times down since that header contains a lot
11554 of intrinsics enclosed in pragmas. */
11555 if (!existing_target
&& args
== current_target_pragma
)
11557 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
11560 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
11562 old_optimize
= build_optimization_node (&global_options
);
11563 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
11565 /* If the function changed the optimization levels as well as setting
11566 target options, start with the optimizations specified. */
11567 if (func_optimize
&& func_optimize
!= old_optimize
)
11568 cl_optimization_restore (&global_options
,
11569 TREE_OPTIMIZATION (func_optimize
));
11571 /* Save the current target options to restore at the end. */
11572 cl_target_option_save (&cur_target
, &global_options
);
11574 /* If fndecl already has some target attributes applied to it, unpack
11575 them so that we add this attribute on top of them, rather than
11576 overwriting them. */
11577 if (existing_target
)
11579 struct cl_target_option
*existing_options
11580 = TREE_TARGET_OPTION (existing_target
);
11582 if (existing_options
)
11583 cl_target_option_restore (&global_options
, existing_options
);
11586 cl_target_option_restore (&global_options
,
11587 TREE_TARGET_OPTION (target_option_current_node
));
11589 ret
= aarch64_process_target_attr (args
);
11591 /* Set up any additional state. */
11594 aarch64_override_options_internal (&global_options
);
11595 /* Initialize SIMD builtins if we haven't already.
11596 Set current_target_pragma to NULL for the duration so that
11597 the builtin initialization code doesn't try to tag the functions
11598 being built with the attributes specified by any current pragma, thus
11599 going into an infinite recursion. */
11602 tree saved_current_target_pragma
= current_target_pragma
;
11603 current_target_pragma
= NULL
;
11604 aarch64_init_simd_builtins ();
11605 current_target_pragma
= saved_current_target_pragma
;
11607 new_target
= build_target_option_node (&global_options
);
11612 new_optimize
= build_optimization_node (&global_options
);
11616 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
11618 if (old_optimize
!= new_optimize
)
11619 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
11622 cl_target_option_restore (&global_options
, &cur_target
);
11624 if (old_optimize
!= new_optimize
)
11625 cl_optimization_restore (&global_options
,
11626 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
11651 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11652 to inline CALLEE into CALLER based on target-specific info.
11653 Make sure that the caller and callee have compatible architectural
11654 features. Then go through the other possible target attributes
11655 and see if they can block inlining. Try not to reject always_inline
11656 callees unless they are incompatible architecturally. */
11659 aarch64_can_inline_p (tree caller
, tree callee
)
11661 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
11662 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
11664 /* If callee has no option attributes, then it is ok to inline. */
11668 struct cl_target_option
*caller_opts
11669 = TREE_TARGET_OPTION (caller_tree
? caller_tree
11670 : target_option_default_node
);
11672 struct cl_target_option
*callee_opts
= TREE_TARGET_OPTION (callee_tree
);
11675 /* Callee's ISA flags should be a subset of the caller's. */
11676 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
11677 != callee_opts
->x_aarch64_isa_flags
)
11680 /* Allow non-strict aligned functions inlining into strict
11682 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
11683 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
11684 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
11685 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
11688 bool always_inline
= lookup_attribute ("always_inline",
11689 DECL_ATTRIBUTES (callee
));
11691 /* If the architectural features match up and the callee is always_inline
11692 then the other attributes don't matter. */
11696 if (caller_opts
->x_aarch64_cmodel_var
11697 != callee_opts
->x_aarch64_cmodel_var
)
11700 if (caller_opts
->x_aarch64_tls_dialect
11701 != callee_opts
->x_aarch64_tls_dialect
)
11704 /* Honour explicit requests to workaround errata. */
11705 if (!aarch64_tribools_ok_for_inlining_p (
11706 caller_opts
->x_aarch64_fix_a53_err835769
,
11707 callee_opts
->x_aarch64_fix_a53_err835769
,
11708 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
11711 if (!aarch64_tribools_ok_for_inlining_p (
11712 caller_opts
->x_aarch64_fix_a53_err843419
,
11713 callee_opts
->x_aarch64_fix_a53_err843419
,
11714 2, TARGET_FIX_ERR_A53_843419
))
11717 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11718 caller and calle and they don't match up, reject inlining. */
11719 if (!aarch64_tribools_ok_for_inlining_p (
11720 caller_opts
->x_flag_omit_leaf_frame_pointer
,
11721 callee_opts
->x_flag_omit_leaf_frame_pointer
,
11725 /* If the callee has specific tuning overrides, respect them. */
11726 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
11727 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
11730 /* If the user specified tuning override strings for the
11731 caller and callee and they don't match up, reject inlining.
11732 We just do a string compare here, we don't analyze the meaning
11733 of the string, as it would be too costly for little gain. */
11734 if (callee_opts
->x_aarch64_override_tune_string
11735 && caller_opts
->x_aarch64_override_tune_string
11736 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
11737 caller_opts
->x_aarch64_override_tune_string
) != 0))
11743 /* Return true if SYMBOL_REF X binds locally. */
11746 aarch64_symbol_binds_local_p (const_rtx x
)
11748 return (SYMBOL_REF_DECL (x
)
11749 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
11750 : SYMBOL_REF_LOCAL_P (x
));
11753 /* Return true if SYMBOL_REF X is thread local */
11755 aarch64_tls_symbol_p (rtx x
)
11757 if (! TARGET_HAVE_TLS
)
11760 if (GET_CODE (x
) != SYMBOL_REF
)
11763 return SYMBOL_REF_TLS_MODEL (x
) != 0;
11766 /* Classify a TLS symbol into one of the TLS kinds. */
11767 enum aarch64_symbol_type
11768 aarch64_classify_tls_symbol (rtx x
)
11770 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
11774 case TLS_MODEL_GLOBAL_DYNAMIC
:
11775 case TLS_MODEL_LOCAL_DYNAMIC
:
11776 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
11778 case TLS_MODEL_INITIAL_EXEC
:
11779 switch (aarch64_cmodel
)
11781 case AARCH64_CMODEL_TINY
:
11782 case AARCH64_CMODEL_TINY_PIC
:
11783 return SYMBOL_TINY_TLSIE
;
11785 return SYMBOL_SMALL_TLSIE
;
11788 case TLS_MODEL_LOCAL_EXEC
:
11789 if (aarch64_tls_size
== 12)
11790 return SYMBOL_TLSLE12
;
11791 else if (aarch64_tls_size
== 24)
11792 return SYMBOL_TLSLE24
;
11793 else if (aarch64_tls_size
== 32)
11794 return SYMBOL_TLSLE32
;
11795 else if (aarch64_tls_size
== 48)
11796 return SYMBOL_TLSLE48
;
11798 gcc_unreachable ();
11800 case TLS_MODEL_EMULATED
:
11801 case TLS_MODEL_NONE
:
11802 return SYMBOL_FORCE_TO_MEM
;
11805 gcc_unreachable ();
11809 /* Return the correct method for accessing X + OFFSET, where X is either
11810 a SYMBOL_REF or LABEL_REF. */
11812 enum aarch64_symbol_type
11813 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
11815 if (GET_CODE (x
) == LABEL_REF
)
11817 switch (aarch64_cmodel
)
11819 case AARCH64_CMODEL_LARGE
:
11820 return SYMBOL_FORCE_TO_MEM
;
11822 case AARCH64_CMODEL_TINY_PIC
:
11823 case AARCH64_CMODEL_TINY
:
11824 return SYMBOL_TINY_ABSOLUTE
;
11826 case AARCH64_CMODEL_SMALL_SPIC
:
11827 case AARCH64_CMODEL_SMALL_PIC
:
11828 case AARCH64_CMODEL_SMALL
:
11829 return SYMBOL_SMALL_ABSOLUTE
;
11832 gcc_unreachable ();
11836 if (GET_CODE (x
) == SYMBOL_REF
)
11838 if (aarch64_tls_symbol_p (x
))
11839 return aarch64_classify_tls_symbol (x
);
11841 switch (aarch64_cmodel
)
11843 case AARCH64_CMODEL_TINY
:
11844 /* When we retrieve symbol + offset address, we have to make sure
11845 the offset does not cause overflow of the final address. But
11846 we have no way of knowing the address of symbol at compile time
11847 so we can't accurately say if the distance between the PC and
11848 symbol + offset is outside the addressible range of +/-1M in the
11849 TINY code model. So we rely on images not being greater than
11850 1M and cap the offset at 1M and anything beyond 1M will have to
11851 be loaded using an alternative mechanism. Furthermore if the
11852 symbol is a weak reference to something that isn't known to
11853 resolve to a symbol in this module, then force to memory. */
11854 if ((SYMBOL_REF_WEAK (x
)
11855 && !aarch64_symbol_binds_local_p (x
))
11856 || !IN_RANGE (offset
, -1048575, 1048575))
11857 return SYMBOL_FORCE_TO_MEM
;
11858 return SYMBOL_TINY_ABSOLUTE
;
11860 case AARCH64_CMODEL_SMALL
:
11861 /* Same reasoning as the tiny code model, but the offset cap here is
11863 if ((SYMBOL_REF_WEAK (x
)
11864 && !aarch64_symbol_binds_local_p (x
))
11865 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
11866 HOST_WIDE_INT_C (4294967264)))
11867 return SYMBOL_FORCE_TO_MEM
;
11868 return SYMBOL_SMALL_ABSOLUTE
;
11870 case AARCH64_CMODEL_TINY_PIC
:
11871 if (!aarch64_symbol_binds_local_p (x
))
11872 return SYMBOL_TINY_GOT
;
11873 return SYMBOL_TINY_ABSOLUTE
;
11875 case AARCH64_CMODEL_SMALL_SPIC
:
11876 case AARCH64_CMODEL_SMALL_PIC
:
11877 if (!aarch64_symbol_binds_local_p (x
))
11878 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
11879 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
11880 return SYMBOL_SMALL_ABSOLUTE
;
11882 case AARCH64_CMODEL_LARGE
:
11883 /* This is alright even in PIC code as the constant
11884 pool reference is always PC relative and within
11885 the same translation unit. */
11886 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
11887 return SYMBOL_SMALL_ABSOLUTE
;
11889 return SYMBOL_FORCE_TO_MEM
;
11892 gcc_unreachable ();
11896 /* By default push everything into the constant pool. */
11897 return SYMBOL_FORCE_TO_MEM
;
11901 aarch64_constant_address_p (rtx x
)
11903 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
11907 aarch64_legitimate_pic_operand_p (rtx x
)
11909 if (GET_CODE (x
) == SYMBOL_REF
11910 || (GET_CODE (x
) == CONST
11911 && GET_CODE (XEXP (x
, 0)) == PLUS
11912 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
11918 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11919 that should be rematerialized rather than spilled. */
11922 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
11924 /* Support CSE and rematerialization of common constants. */
11925 if (CONST_INT_P (x
)
11926 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11927 || GET_CODE (x
) == CONST_VECTOR
)
11930 /* Do not allow vector struct mode constants for Advanced SIMD.
11931 We could support 0 and -1 easily, but they need support in
11932 aarch64-simd.md. */
11933 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
11934 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
11937 /* Only accept variable-length vector constants if they can be
11940 ??? It would be possible to handle rematerialization of other
11941 constants via secondary reloads. */
11942 if (vec_flags
& VEC_ANY_SVE
)
11943 return aarch64_simd_valid_immediate (x
, NULL
);
11945 if (GET_CODE (x
) == HIGH
)
11948 /* Accept polynomial constants that can be calculated by using the
11949 destination of a move as the sole temporary. Constants that
11950 require a second temporary cannot be rematerialized (they can't be
11951 forced to memory and also aren't legitimate constants). */
11953 if (poly_int_rtx_p (x
, &offset
))
11954 return aarch64_offset_temporaries (false, offset
) <= 1;
11956 /* If an offset is being added to something else, we need to allow the
11957 base to be moved into the destination register, meaning that there
11958 are no free temporaries for the offset. */
11959 x
= strip_offset (x
, &offset
);
11960 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
11963 /* Do not allow const (plus (anchor_symbol, const_int)). */
11964 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
11967 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11968 so spilling them is better than rematerialization. */
11969 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
11972 /* Label references are always constant. */
11973 if (GET_CODE (x
) == LABEL_REF
)
11980 aarch64_load_tp (rtx target
)
11983 || GET_MODE (target
) != Pmode
11984 || !register_operand (target
, Pmode
))
11985 target
= gen_reg_rtx (Pmode
);
11987 /* Can return in any reg. */
11988 emit_insn (gen_aarch64_load_tp_hard (target
));
11992 /* On AAPCS systems, this is the "struct __va_list". */
11993 static GTY(()) tree va_list_type
;
11995 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11996 Return the type to use as __builtin_va_list.
11998 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12010 aarch64_build_builtin_va_list (void)
12013 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12015 /* Create the type. */
12016 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
12017 /* Give it the required name. */
12018 va_list_name
= build_decl (BUILTINS_LOCATION
,
12020 get_identifier ("__va_list"),
12022 DECL_ARTIFICIAL (va_list_name
) = 1;
12023 TYPE_NAME (va_list_type
) = va_list_name
;
12024 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
12026 /* Create the fields. */
12027 f_stack
= build_decl (BUILTINS_LOCATION
,
12028 FIELD_DECL
, get_identifier ("__stack"),
12030 f_grtop
= build_decl (BUILTINS_LOCATION
,
12031 FIELD_DECL
, get_identifier ("__gr_top"),
12033 f_vrtop
= build_decl (BUILTINS_LOCATION
,
12034 FIELD_DECL
, get_identifier ("__vr_top"),
12036 f_groff
= build_decl (BUILTINS_LOCATION
,
12037 FIELD_DECL
, get_identifier ("__gr_offs"),
12038 integer_type_node
);
12039 f_vroff
= build_decl (BUILTINS_LOCATION
,
12040 FIELD_DECL
, get_identifier ("__vr_offs"),
12041 integer_type_node
);
12043 /* Tell tree-stdarg pass about our internal offset fields.
12044 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
12045 purpose to identify whether the code is updating va_list internal
12046 offset fields through irregular way. */
12047 va_list_gpr_counter_field
= f_groff
;
12048 va_list_fpr_counter_field
= f_vroff
;
12050 DECL_ARTIFICIAL (f_stack
) = 1;
12051 DECL_ARTIFICIAL (f_grtop
) = 1;
12052 DECL_ARTIFICIAL (f_vrtop
) = 1;
12053 DECL_ARTIFICIAL (f_groff
) = 1;
12054 DECL_ARTIFICIAL (f_vroff
) = 1;
12056 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
12057 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
12058 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
12059 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
12060 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
12062 TYPE_FIELDS (va_list_type
) = f_stack
;
12063 DECL_CHAIN (f_stack
) = f_grtop
;
12064 DECL_CHAIN (f_grtop
) = f_vrtop
;
12065 DECL_CHAIN (f_vrtop
) = f_groff
;
12066 DECL_CHAIN (f_groff
) = f_vroff
;
12068 /* Compute its layout. */
12069 layout_type (va_list_type
);
12071 return va_list_type
;
12074 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12076 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
12078 const CUMULATIVE_ARGS
*cum
;
12079 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12080 tree stack
, grtop
, vrtop
, groff
, vroff
;
12082 int gr_save_area_size
= cfun
->va_list_gpr_size
;
12083 int vr_save_area_size
= cfun
->va_list_fpr_size
;
12086 cum
= &crtl
->args
.info
;
12087 if (cfun
->va_list_gpr_size
)
12088 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
12089 cfun
->va_list_gpr_size
);
12090 if (cfun
->va_list_fpr_size
)
12091 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
12092 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
12096 gcc_assert (cum
->aapcs_nvrn
== 0);
12097 vr_save_area_size
= 0;
12100 f_stack
= TYPE_FIELDS (va_list_type_node
);
12101 f_grtop
= DECL_CHAIN (f_stack
);
12102 f_vrtop
= DECL_CHAIN (f_grtop
);
12103 f_groff
= DECL_CHAIN (f_vrtop
);
12104 f_vroff
= DECL_CHAIN (f_groff
);
12106 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
12108 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
12110 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
12112 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
12114 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
12117 /* Emit code to initialize STACK, which points to the next varargs stack
12118 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12119 by named arguments. STACK is 8-byte aligned. */
12120 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
12121 if (cum
->aapcs_stack_size
> 0)
12122 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
12123 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
12124 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12126 /* Emit code to initialize GRTOP, the top of the GR save area.
12127 virtual_incoming_args_rtx should have been 16 byte aligned. */
12128 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
12129 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
12130 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12132 /* Emit code to initialize VRTOP, the top of the VR save area.
12133 This address is gr_save_area_bytes below GRTOP, rounded
12134 down to the next 16-byte boundary. */
12135 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
12136 vr_offset
= ROUND_UP (gr_save_area_size
,
12137 STACK_BOUNDARY
/ BITS_PER_UNIT
);
12140 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
12141 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
12142 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12144 /* Emit code to initialize GROFF, the offset from GRTOP of the
12145 next GPR argument. */
12146 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
12147 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
12148 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12150 /* Likewise emit code to initialize VROFF, the offset from FTOP
12151 of the next VR argument. */
12152 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
12153 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
12154 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12157 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12160 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
12161 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
12165 bool is_ha
; /* is HFA or HVA. */
12166 bool dw_align
; /* double-word align. */
12167 machine_mode ag_mode
= VOIDmode
;
12171 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12172 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
12173 HOST_WIDE_INT size
, rsize
, adjust
, align
;
12174 tree t
, u
, cond1
, cond2
;
12176 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
12178 type
= build_pointer_type (type
);
12180 mode
= TYPE_MODE (type
);
12182 f_stack
= TYPE_FIELDS (va_list_type_node
);
12183 f_grtop
= DECL_CHAIN (f_stack
);
12184 f_vrtop
= DECL_CHAIN (f_grtop
);
12185 f_groff
= DECL_CHAIN (f_vrtop
);
12186 f_vroff
= DECL_CHAIN (f_groff
);
12188 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
12189 f_stack
, NULL_TREE
);
12190 size
= int_size_in_bytes (type
);
12191 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
12195 if (aarch64_vfp_is_call_or_return_candidate (mode
,
12201 /* No frontends can create types with variable-sized modes, so we
12202 shouldn't be asked to pass or return them. */
12203 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
12205 /* TYPE passed in fp/simd registers. */
12207 aarch64_err_no_fpadvsimd (mode
, "varargs");
12209 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
12210 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
12211 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
12212 unshare_expr (valist
), f_vroff
, NULL_TREE
);
12214 rsize
= nregs
* UNITS_PER_VREG
;
12218 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
12219 adjust
= UNITS_PER_VREG
- ag_size
;
12221 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12222 && size
< UNITS_PER_VREG
)
12224 adjust
= UNITS_PER_VREG
- size
;
12229 /* TYPE passed in general registers. */
12230 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
12231 unshare_expr (valist
), f_grtop
, NULL_TREE
);
12232 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
12233 unshare_expr (valist
), f_groff
, NULL_TREE
);
12234 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
12235 nregs
= rsize
/ UNITS_PER_WORD
;
12240 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12241 && size
< UNITS_PER_WORD
)
12243 adjust
= UNITS_PER_WORD
- size
;
12247 /* Get a local temporary for the field value. */
12248 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
12250 /* Emit code to branch if off >= 0. */
12251 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
12252 build_int_cst (TREE_TYPE (off
), 0));
12253 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
12257 /* Emit: offs = (offs + 15) & -16. */
12258 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
12259 build_int_cst (TREE_TYPE (off
), 15));
12260 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
12261 build_int_cst (TREE_TYPE (off
), -16));
12262 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
12267 /* Update ap.__[g|v]r_offs */
12268 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
12269 build_int_cst (TREE_TYPE (off
), rsize
));
12270 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
12274 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
12276 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12277 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
12278 build_int_cst (TREE_TYPE (f_off
), 0));
12279 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
12281 /* String up: make sure the assignment happens before the use. */
12282 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
12283 COND_EXPR_ELSE (cond1
) = t
;
12285 /* Prepare the trees handling the argument that is passed on the stack;
12286 the top level node will store in ON_STACK. */
12287 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
12290 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12291 t
= fold_build_pointer_plus_hwi (arg
, 15);
12292 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12293 build_int_cst (TREE_TYPE (t
), -16));
12294 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
12298 /* Advance ap.__stack */
12299 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
12300 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12301 build_int_cst (TREE_TYPE (t
), -8));
12302 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
12303 /* String up roundup and advance. */
12305 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
12306 /* String up with arg */
12307 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
12308 /* Big-endianness related address adjustment. */
12309 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12310 && size
< UNITS_PER_WORD
)
12312 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
12313 size_int (UNITS_PER_WORD
- size
));
12314 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
12317 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
12318 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
12320 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12323 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
12324 build_int_cst (TREE_TYPE (off
), adjust
));
12326 t
= fold_convert (sizetype
, t
);
12327 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
12331 /* type ha; // treat as "struct {ftype field[n];}"
12332 ... [computing offs]
12333 for (i = 0; i <nregs; ++i, offs += 16)
12334 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12337 tree tmp_ha
, field_t
, field_ptr_t
;
12339 /* Declare a local variable. */
12340 tmp_ha
= create_tmp_var_raw (type
, "ha");
12341 gimple_add_tmp_var (tmp_ha
);
12343 /* Establish the base type. */
12347 field_t
= float_type_node
;
12348 field_ptr_t
= float_ptr_type_node
;
12351 field_t
= double_type_node
;
12352 field_ptr_t
= double_ptr_type_node
;
12355 field_t
= long_double_type_node
;
12356 field_ptr_t
= long_double_ptr_type_node
;
12359 field_t
= aarch64_fp16_type_node
;
12360 field_ptr_t
= aarch64_fp16_ptr_type_node
;
12365 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
12366 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
12367 field_ptr_t
= build_pointer_type (field_t
);
12374 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12375 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
12377 t
= fold_convert (field_ptr_t
, addr
);
12378 t
= build2 (MODIFY_EXPR
, field_t
,
12379 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
12380 build1 (INDIRECT_REF
, field_t
, t
));
12382 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12383 for (i
= 1; i
< nregs
; ++i
)
12385 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
12386 u
= fold_convert (field_ptr_t
, addr
);
12387 u
= build2 (MODIFY_EXPR
, field_t
,
12388 build2 (MEM_REF
, field_t
, tmp_ha
,
12389 build_int_cst (field_ptr_t
,
12391 int_size_in_bytes (field_t
)))),
12392 build1 (INDIRECT_REF
, field_t
, u
));
12393 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
12396 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
12397 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
12400 COND_EXPR_ELSE (cond2
) = t
;
12401 addr
= fold_convert (build_pointer_type (type
), cond1
);
12402 addr
= build_va_arg_indirect_ref (addr
);
12405 addr
= build_va_arg_indirect_ref (addr
);
12410 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12413 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
12414 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
12417 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
12418 CUMULATIVE_ARGS local_cum
;
12419 int gr_saved
= cfun
->va_list_gpr_size
;
12420 int vr_saved
= cfun
->va_list_fpr_size
;
12422 /* The caller has advanced CUM up to, but not beyond, the last named
12423 argument. Advance a local copy of CUM past the last "real" named
12424 argument, to find out how many registers are left over. */
12426 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
12428 /* Found out how many registers we need to save.
12429 Honor tree-stdvar analysis results. */
12430 if (cfun
->va_list_gpr_size
)
12431 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
12432 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
12433 if (cfun
->va_list_fpr_size
)
12434 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
12435 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
12439 gcc_assert (local_cum
.aapcs_nvrn
== 0);
12449 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12450 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
12451 - gr_saved
* UNITS_PER_WORD
);
12452 mem
= gen_frame_mem (BLKmode
, ptr
);
12453 set_mem_alias_set (mem
, get_varargs_alias_set ());
12455 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
12460 /* We can't use move_block_from_reg, because it will use
12461 the wrong mode, storing D regs only. */
12462 machine_mode mode
= TImode
;
12463 int off
, i
, vr_start
;
12465 /* Set OFF to the offset from virtual_incoming_args_rtx of
12466 the first vector register. The VR save area lies below
12467 the GR one, and is aligned to 16 bytes. */
12468 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12469 STACK_BOUNDARY
/ BITS_PER_UNIT
);
12470 off
-= vr_saved
* UNITS_PER_VREG
;
12472 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
12473 for (i
= 0; i
< vr_saved
; ++i
)
12477 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
12478 mem
= gen_frame_mem (mode
, ptr
);
12479 set_mem_alias_set (mem
, get_varargs_alias_set ());
12480 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
12481 off
+= UNITS_PER_VREG
;
12486 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12487 any complication of having crtl->args.pretend_args_size changed. */
12488 cfun
->machine
->frame
.saved_varargs_size
12489 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12490 STACK_BOUNDARY
/ BITS_PER_UNIT
)
12491 + vr_saved
* UNITS_PER_VREG
);
12495 aarch64_conditional_register_usage (void)
12500 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
12503 call_used_regs
[i
] = 1;
12507 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
12510 call_used_regs
[i
] = 1;
12514 /* Walk down the type tree of TYPE counting consecutive base elements.
12515 If *MODEP is VOIDmode, then set it to the first valid floating point
12516 type. If a non-floating point type is found, or if a floating point
12517 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12518 otherwise return the count in the sub-tree. */
12520 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
12523 HOST_WIDE_INT size
;
12525 switch (TREE_CODE (type
))
12528 mode
= TYPE_MODE (type
);
12529 if (mode
!= DFmode
&& mode
!= SFmode
12530 && mode
!= TFmode
&& mode
!= HFmode
)
12533 if (*modep
== VOIDmode
)
12536 if (*modep
== mode
)
12542 mode
= TYPE_MODE (TREE_TYPE (type
));
12543 if (mode
!= DFmode
&& mode
!= SFmode
12544 && mode
!= TFmode
&& mode
!= HFmode
)
12547 if (*modep
== VOIDmode
)
12550 if (*modep
== mode
)
12556 /* Use V2SImode and V4SImode as representatives of all 64-bit
12557 and 128-bit vector types. */
12558 size
= int_size_in_bytes (type
);
12571 if (*modep
== VOIDmode
)
12574 /* Vector modes are considered to be opaque: two vectors are
12575 equivalent for the purposes of being homogeneous aggregates
12576 if they are the same size. */
12577 if (*modep
== mode
)
12585 tree index
= TYPE_DOMAIN (type
);
12587 /* Can't handle incomplete types nor sizes that are not
12589 if (!COMPLETE_TYPE_P (type
)
12590 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12593 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
12596 || !TYPE_MAX_VALUE (index
)
12597 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
12598 || !TYPE_MIN_VALUE (index
)
12599 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
12603 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
12604 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
12606 /* There must be no padding. */
12607 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12608 count
* GET_MODE_BITSIZE (*modep
)))
12620 /* Can't handle incomplete types nor sizes that are not
12622 if (!COMPLETE_TYPE_P (type
)
12623 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12626 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12628 if (TREE_CODE (field
) != FIELD_DECL
)
12631 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12634 count
+= sub_count
;
12637 /* There must be no padding. */
12638 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12639 count
* GET_MODE_BITSIZE (*modep
)))
12646 case QUAL_UNION_TYPE
:
12648 /* These aren't very interesting except in a degenerate case. */
12653 /* Can't handle incomplete types nor sizes that are not
12655 if (!COMPLETE_TYPE_P (type
)
12656 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12659 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12661 if (TREE_CODE (field
) != FIELD_DECL
)
12664 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12667 count
= count
> sub_count
? count
: sub_count
;
12670 /* There must be no padding. */
12671 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12672 count
* GET_MODE_BITSIZE (*modep
)))
12685 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12686 type as described in AAPCS64 \S 4.1.2.
12688 See the comment above aarch64_composite_type_p for the notes on MODE. */
12691 aarch64_short_vector_p (const_tree type
,
12694 poly_int64 size
= -1;
12696 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
12697 size
= int_size_in_bytes (type
);
12698 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
12699 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
12700 size
= GET_MODE_SIZE (mode
);
12702 return known_eq (size
, 8) || known_eq (size
, 16);
12705 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12706 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12707 array types. The C99 floating-point complex types are also considered
12708 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12709 types, which are GCC extensions and out of the scope of AAPCS64, are
12710 treated as composite types here as well.
12712 Note that MODE itself is not sufficient in determining whether a type
12713 is such a composite type or not. This is because
12714 stor-layout.c:compute_record_mode may have already changed the MODE
12715 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12716 structure with only one field may have its MODE set to the mode of the
12717 field. Also an integer mode whose size matches the size of the
12718 RECORD_TYPE type may be used to substitute the original mode
12719 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12720 solely relied on. */
12723 aarch64_composite_type_p (const_tree type
,
12726 if (aarch64_short_vector_p (type
, mode
))
12729 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
12732 if (mode
== BLKmode
12733 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
12734 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
12740 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12741 shall be passed or returned in simd/fp register(s) (providing these
12742 parameter passing registers are available).
12744 Upon successful return, *COUNT returns the number of needed registers,
12745 *BASE_MODE returns the mode of the individual register and when IS_HAF
12746 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12747 floating-point aggregate or a homogeneous short-vector aggregate. */
12750 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
12752 machine_mode
*base_mode
,
12756 machine_mode new_mode
= VOIDmode
;
12757 bool composite_p
= aarch64_composite_type_p (type
, mode
);
12759 if (is_ha
!= NULL
) *is_ha
= false;
12761 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12762 || aarch64_short_vector_p (type
, mode
))
12767 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
12769 if (is_ha
!= NULL
) *is_ha
= true;
12771 new_mode
= GET_MODE_INNER (mode
);
12773 else if (type
&& composite_p
)
12775 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
12777 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
12779 if (is_ha
!= NULL
) *is_ha
= true;
12788 *base_mode
= new_mode
;
12792 /* Implement TARGET_STRUCT_VALUE_RTX. */
12795 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
12796 int incoming ATTRIBUTE_UNUSED
)
12798 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
12801 /* Implements target hook vector_mode_supported_p. */
12803 aarch64_vector_mode_supported_p (machine_mode mode
)
12805 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12806 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
12809 /* Return appropriate SIMD container
12810 for MODE within a vector of WIDTH bits. */
12811 static machine_mode
12812 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
12814 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
12830 return VNx16QImode
;
12835 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
12838 if (known_eq (width
, 128))
12878 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12879 static machine_mode
12880 aarch64_preferred_simd_mode (scalar_mode mode
)
12882 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
12883 return aarch64_simd_container_mode (mode
, bits
);
12886 /* Return a list of possible vector sizes for the vectorizer
12887 to iterate over. */
12889 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
)
12892 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
12893 sizes
->safe_push (16);
12894 sizes
->safe_push (8);
12897 /* Implement TARGET_MANGLE_TYPE. */
12899 static const char *
12900 aarch64_mangle_type (const_tree type
)
12902 /* The AArch64 ABI documents say that "__va_list" has to be
12903 managled as if it is in the "std" namespace. */
12904 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
12905 return "St9__va_list";
12907 /* Half-precision float. */
12908 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
12911 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12913 if (TYPE_NAME (type
) != NULL
)
12914 return aarch64_mangle_builtin_type (type
);
12916 /* Use the default mangling. */
12920 /* Find the first rtx_insn before insn that will generate an assembly
12924 aarch64_prev_real_insn (rtx_insn
*insn
)
12931 insn
= prev_real_insn (insn
);
12933 while (insn
&& recog_memoized (insn
) < 0);
12939 is_madd_op (enum attr_type t1
)
12942 /* A number of these may be AArch32 only. */
12943 enum attr_type mlatypes
[] = {
12944 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
12945 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
12946 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
12949 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
12951 if (t1
== mlatypes
[i
])
12958 /* Check if there is a register dependency between a load and the insn
12959 for which we hold recog_data. */
12962 dep_between_memop_and_curr (rtx memop
)
12967 gcc_assert (GET_CODE (memop
) == SET
);
12969 if (!REG_P (SET_DEST (memop
)))
12972 load_reg
= SET_DEST (memop
);
12973 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
12975 rtx operand
= recog_data
.operand
[opno
];
12976 if (REG_P (operand
)
12977 && reg_overlap_mentioned_p (load_reg
, operand
))
12985 /* When working around the Cortex-A53 erratum 835769,
12986 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12987 instruction and has a preceding memory instruction such that a NOP
12988 should be inserted between them. */
12991 aarch64_madd_needs_nop (rtx_insn
* insn
)
12993 enum attr_type attr_type
;
12997 if (!TARGET_FIX_ERR_A53_835769
)
13000 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
13003 attr_type
= get_attr_type (insn
);
13004 if (!is_madd_op (attr_type
))
13007 prev
= aarch64_prev_real_insn (insn
);
13008 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13009 Restore recog state to INSN to avoid state corruption. */
13010 extract_constrain_insn_cached (insn
);
13012 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
13015 body
= single_set (prev
);
13017 /* If the previous insn is a memory op and there is no dependency between
13018 it and the DImode madd, emit a NOP between them. If body is NULL then we
13019 have a complex memory operation, probably a load/store pair.
13020 Be conservative for now and emit a NOP. */
13021 if (GET_MODE (recog_data
.operand
[0]) == DImode
13022 && (!body
|| !dep_between_memop_and_curr (body
)))
13030 /* Implement FINAL_PRESCAN_INSN. */
13033 aarch64_final_prescan_insn (rtx_insn
*insn
)
13035 if (aarch64_madd_needs_nop (insn
))
13036 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
13040 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13044 aarch64_sve_index_immediate_p (rtx base_or_step
)
13046 return (CONST_INT_P (base_or_step
)
13047 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
13050 /* Return true if X is a valid immediate for the SVE ADD and SUB
13051 instructions. Negate X first if NEGATE_P is true. */
13054 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
13058 if (!const_vec_duplicate_p (x
, &elt
)
13059 || !CONST_INT_P (elt
))
13062 HOST_WIDE_INT val
= INTVAL (elt
);
13065 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
13068 return IN_RANGE (val
, 0, 0xff);
13069 return IN_RANGE (val
, 0, 0xff00);
13072 /* Return true if X is a valid immediate operand for an SVE logical
13073 instruction such as AND. */
13076 aarch64_sve_bitmask_immediate_p (rtx x
)
13080 return (const_vec_duplicate_p (x
, &elt
)
13081 && CONST_INT_P (elt
)
13082 && aarch64_bitmask_imm (INTVAL (elt
),
13083 GET_MODE_INNER (GET_MODE (x
))));
13086 /* Return true if X is a valid immediate for the SVE DUP and CPY
13090 aarch64_sve_dup_immediate_p (rtx x
)
13094 if (!const_vec_duplicate_p (x
, &elt
)
13095 || !CONST_INT_P (elt
))
13098 HOST_WIDE_INT val
= INTVAL (elt
);
13100 return IN_RANGE (val
, -0x80, 0x7f);
13101 return IN_RANGE (val
, -0x8000, 0x7f00);
13104 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13105 SIGNED_P says whether the operand is signed rather than unsigned. */
13108 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
13112 return (const_vec_duplicate_p (x
, &elt
)
13113 && CONST_INT_P (elt
)
13115 ? IN_RANGE (INTVAL (elt
), -16, 15)
13116 : IN_RANGE (INTVAL (elt
), 0, 127)));
13119 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13120 instruction. Negate X first if NEGATE_P is true. */
13123 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
13128 if (!const_vec_duplicate_p (x
, &elt
)
13129 || GET_CODE (elt
) != CONST_DOUBLE
)
13132 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
13135 r
= real_value_negate (&r
);
13137 if (real_equal (&r
, &dconst1
))
13139 if (real_equal (&r
, &dconsthalf
))
13144 /* Return true if X is a valid immediate operand for an SVE FMUL
13148 aarch64_sve_float_mul_immediate_p (rtx x
)
13152 /* GCC will never generate a multiply with an immediate of 2, so there is no
13153 point testing for it (even though it is a valid constant). */
13154 return (const_vec_duplicate_p (x
, &elt
)
13155 && GET_CODE (elt
) == CONST_DOUBLE
13156 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
13159 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13160 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13161 is nonnull, use it to describe valid immediates. */
13163 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
13164 simd_immediate_info
*info
,
13165 enum simd_immediate_check which
,
13166 simd_immediate_info::insn_type insn
)
13168 /* Try a 4-byte immediate with LSL. */
13169 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
13170 if ((val32
& (0xff << shift
)) == val32
)
13173 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13174 simd_immediate_info::LSL
, shift
);
13178 /* Try a 2-byte immediate with LSL. */
13179 unsigned int imm16
= val32
& 0xffff;
13180 if (imm16
== (val32
>> 16))
13181 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
13182 if ((imm16
& (0xff << shift
)) == imm16
)
13185 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
13186 simd_immediate_info::LSL
, shift
);
13190 /* Try a 4-byte immediate with MSL, except for cases that MVN
13192 if (which
== AARCH64_CHECK_MOV
)
13193 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
13195 unsigned int low
= (1 << shift
) - 1;
13196 if (((val32
& (0xff << shift
)) | low
) == val32
)
13199 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13200 simd_immediate_info::MSL
, shift
);
13208 /* Return true if replicating VAL64 is a valid immediate for the
13209 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13210 use it to describe valid immediates. */
13212 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
13213 simd_immediate_info
*info
,
13214 enum simd_immediate_check which
)
13216 unsigned int val32
= val64
& 0xffffffff;
13217 unsigned int val16
= val64
& 0xffff;
13218 unsigned int val8
= val64
& 0xff;
13220 if (val32
== (val64
>> 32))
13222 if ((which
& AARCH64_CHECK_ORR
) != 0
13223 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
13224 simd_immediate_info::MOV
))
13227 if ((which
& AARCH64_CHECK_BIC
) != 0
13228 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
13229 simd_immediate_info::MVN
))
13232 /* Try using a replicated byte. */
13233 if (which
== AARCH64_CHECK_MOV
13234 && val16
== (val32
>> 16)
13235 && val8
== (val16
>> 8))
13238 *info
= simd_immediate_info (QImode
, val8
);
13243 /* Try using a bit-to-bytemask. */
13244 if (which
== AARCH64_CHECK_MOV
)
13247 for (i
= 0; i
< 64; i
+= 8)
13249 unsigned char byte
= (val64
>> i
) & 0xff;
13250 if (byte
!= 0 && byte
!= 0xff)
13256 *info
= simd_immediate_info (DImode
, val64
);
13263 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13264 instruction. If INFO is nonnull, use it to describe valid immediates. */
13267 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
13268 simd_immediate_info
*info
)
13270 scalar_int_mode mode
= DImode
;
13271 unsigned int val32
= val64
& 0xffffffff;
13272 if (val32
== (val64
>> 32))
13275 unsigned int val16
= val32
& 0xffff;
13276 if (val16
== (val32
>> 16))
13279 unsigned int val8
= val16
& 0xff;
13280 if (val8
== (val16
>> 8))
13284 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
13285 if (IN_RANGE (val
, -0x80, 0x7f))
13287 /* DUP with no shift. */
13289 *info
= simd_immediate_info (mode
, val
);
13292 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
13294 /* DUP with LSL #8. */
13296 *info
= simd_immediate_info (mode
, val
);
13299 if (aarch64_bitmask_imm (val64
, mode
))
13303 *info
= simd_immediate_info (mode
, val
);
13309 /* Return true if OP is a valid SIMD immediate for the operation
13310 described by WHICH. If INFO is nonnull, use it to describe valid
13313 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
13314 enum simd_immediate_check which
)
13316 machine_mode mode
= GET_MODE (op
);
13317 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13318 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13321 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
13323 unsigned int n_elts
;
13324 if (GET_CODE (op
) == CONST_VECTOR
13325 && CONST_VECTOR_DUPLICATE_P (op
))
13326 n_elts
= CONST_VECTOR_NPATTERNS (op
);
13327 else if ((vec_flags
& VEC_SVE_DATA
)
13328 && const_vec_series_p (op
, &base
, &step
))
13330 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
13331 if (!aarch64_sve_index_immediate_p (base
)
13332 || !aarch64_sve_index_immediate_p (step
))
13336 *info
= simd_immediate_info (elt_mode
, base
, step
);
13339 else if (GET_CODE (op
) == CONST_VECTOR
13340 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
13341 /* N_ELTS set above. */;
13345 /* Handle PFALSE and PTRUE. */
13346 if (vec_flags
& VEC_SVE_PRED
)
13347 return (op
== CONST0_RTX (mode
)
13348 || op
== CONSTM1_RTX (mode
));
13350 scalar_float_mode elt_float_mode
;
13352 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
13354 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
13355 if (aarch64_float_const_zero_rtx_p (elt
)
13356 || aarch64_float_const_representable_p (elt
))
13359 *info
= simd_immediate_info (elt_float_mode
, elt
);
13364 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
13368 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
13370 /* Expand the vector constant out into a byte vector, with the least
13371 significant byte of the register first. */
13372 auto_vec
<unsigned char, 16> bytes
;
13373 bytes
.reserve (n_elts
* elt_size
);
13374 for (unsigned int i
= 0; i
< n_elts
; i
++)
13376 /* The vector is provided in gcc endian-neutral fashion.
13377 For aarch64_be Advanced SIMD, it must be laid out in the vector
13378 register in reverse order. */
13379 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
13380 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
13382 if (elt_mode
!= elt_int_mode
)
13383 elt
= gen_lowpart (elt_int_mode
, elt
);
13385 if (!CONST_INT_P (elt
))
13388 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
13389 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
13391 bytes
.quick_push (elt_val
& 0xff);
13392 elt_val
>>= BITS_PER_UNIT
;
13396 /* The immediate must repeat every eight bytes. */
13397 unsigned int nbytes
= bytes
.length ();
13398 for (unsigned i
= 8; i
< nbytes
; ++i
)
13399 if (bytes
[i
] != bytes
[i
- 8])
13402 /* Get the repeating 8-byte value as an integer. No endian correction
13403 is needed here because bytes is already in lsb-first order. */
13404 unsigned HOST_WIDE_INT val64
= 0;
13405 for (unsigned int i
= 0; i
< 8; i
++)
13406 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
13407 << (i
* BITS_PER_UNIT
));
13409 if (vec_flags
& VEC_SVE_DATA
)
13410 return aarch64_sve_valid_immediate (val64
, info
);
13412 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
13415 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13416 has a step in the range of INDEX. Return the index expression if so,
13417 otherwise return null. */
13419 aarch64_check_zero_based_sve_index_immediate (rtx x
)
13422 if (const_vec_series_p (x
, &base
, &step
)
13423 && base
== const0_rtx
13424 && aarch64_sve_index_immediate_p (step
))
13429 /* Check of immediate shift constants are within range. */
13431 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
13433 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
13435 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
13437 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
13440 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13441 operation of width WIDTH at bit position POS. */
13444 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
13446 gcc_assert (CONST_INT_P (width
));
13447 gcc_assert (CONST_INT_P (pos
));
13449 unsigned HOST_WIDE_INT mask
13450 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
13451 return GEN_INT (mask
<< UINTVAL (pos
));
13455 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
13457 if (GET_CODE (x
) == HIGH
13458 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
13461 if (CONST_INT_P (x
))
13464 if (VECTOR_MODE_P (GET_MODE (x
)))
13465 return aarch64_simd_valid_immediate (x
, NULL
);
13467 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
13470 if (aarch64_sve_cnt_immediate_p (x
))
13473 return aarch64_classify_symbolic_expression (x
)
13474 == SYMBOL_TINY_ABSOLUTE
;
13477 /* Return a const_int vector of VAL. */
13479 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
13481 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
13482 return gen_const_vec_duplicate (mode
, c
);
13485 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13488 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
13490 machine_mode vmode
;
13492 vmode
= aarch64_simd_container_mode (mode
, 64);
13493 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
13494 return aarch64_simd_valid_immediate (op_v
, NULL
);
13497 /* Construct and return a PARALLEL RTX vector with elements numbering the
13498 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13499 the vector - from the perspective of the architecture. This does not
13500 line up with GCC's perspective on lane numbers, so we end up with
13501 different masks depending on our target endian-ness. The diagram
13502 below may help. We must draw the distinction when building masks
13503 which select one half of the vector. An instruction selecting
13504 architectural low-lanes for a big-endian target, must be described using
13505 a mask selecting GCC high-lanes.
13507 Big-Endian Little-Endian
13509 GCC 0 1 2 3 3 2 1 0
13510 | x | x | x | x | | x | x | x | x |
13511 Architecture 3 2 1 0 3 2 1 0
13513 Low Mask: { 2, 3 } { 0, 1 }
13514 High Mask: { 0, 1 } { 2, 3 }
13516 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13519 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
13521 rtvec v
= rtvec_alloc (nunits
/ 2);
13522 int high_base
= nunits
/ 2;
13528 if (BYTES_BIG_ENDIAN
)
13529 base
= high
? low_base
: high_base
;
13531 base
= high
? high_base
: low_base
;
13533 for (i
= 0; i
< nunits
/ 2; i
++)
13534 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
13536 t1
= gen_rtx_PARALLEL (mode
, v
);
13540 /* Check OP for validity as a PARALLEL RTX vector with elements
13541 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13542 from the perspective of the architecture. See the diagram above
13543 aarch64_simd_vect_par_cnst_half for more details. */
13546 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
13550 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
13553 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
13554 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
13555 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
13558 if (count_op
!= count_ideal
)
13561 for (i
= 0; i
< count_ideal
; i
++)
13563 rtx elt_op
= XVECEXP (op
, 0, i
);
13564 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
13566 if (!CONST_INT_P (elt_op
)
13567 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
13573 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13574 HIGH (exclusive). */
13576 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
13579 HOST_WIDE_INT lane
;
13580 gcc_assert (CONST_INT_P (operand
));
13581 lane
= INTVAL (operand
);
13583 if (lane
< low
|| lane
>= high
)
13586 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
13588 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
13592 /* Peform endian correction on lane number N, which indexes a vector
13593 of mode MODE, and return the result as an SImode rtx. */
13596 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
13598 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
13601 /* Return TRUE if OP is a valid vector addressing mode. */
13604 aarch64_simd_mem_operand_p (rtx op
)
13606 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
13607 || REG_P (XEXP (op
, 0)));
13610 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13613 aarch64_sve_ld1r_operand_p (rtx op
)
13615 struct aarch64_address_info addr
;
13619 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
13620 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
13621 && addr
.type
== ADDRESS_REG_IMM
13622 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
13625 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13626 The conditions for STR are the same. */
13628 aarch64_sve_ldr_operand_p (rtx op
)
13630 struct aarch64_address_info addr
;
13633 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
13634 false, ADDR_QUERY_ANY
)
13635 && addr
.type
== ADDRESS_REG_IMM
);
13638 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13639 We need to be able to access the individual pieces, so the range
13640 is different from LD[234] and ST[234]. */
13642 aarch64_sve_struct_memory_operand_p (rtx op
)
13647 machine_mode mode
= GET_MODE (op
);
13648 struct aarch64_address_info addr
;
13649 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
13651 || addr
.type
!= ADDRESS_REG_IMM
)
13654 poly_int64 first
= addr
.const_offset
;
13655 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
13656 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
13657 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
13660 /* Emit a register copy from operand to operand, taking care not to
13661 early-clobber source registers in the process.
13663 COUNT is the number of components into which the copy needs to be
13666 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
13667 unsigned int count
)
13670 int rdest
= REGNO (operands
[0]);
13671 int rsrc
= REGNO (operands
[1]);
13673 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
13675 for (i
= 0; i
< count
; i
++)
13676 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
13677 gen_rtx_REG (mode
, rsrc
+ i
));
13679 for (i
= 0; i
< count
; i
++)
13680 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
13681 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
13684 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13685 one of VSTRUCT modes: OI, CI, or XI. */
13687 aarch64_simd_attr_length_rglist (machine_mode mode
)
13689 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13690 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
13693 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13694 alignment of a vector to 128 bits. SVE predicates have an alignment of
13696 static HOST_WIDE_INT
13697 aarch64_simd_vector_alignment (const_tree type
)
13699 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13700 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13701 be set for non-predicate vectors of booleans. Modes are the most
13702 direct way we have of identifying real SVE predicate types. */
13703 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
13704 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
13705 return MIN (align
, 128);
13708 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13709 static HOST_WIDE_INT
13710 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
13712 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
13714 /* If the length of the vector is fixed, try to align to that length,
13715 otherwise don't try to align at all. */
13716 HOST_WIDE_INT result
;
13717 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
13718 result
= TYPE_ALIGN (TREE_TYPE (type
));
13721 return TYPE_ALIGN (type
);
13724 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13726 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
13731 /* For fixed-length vectors, check that the vectorizer will aim for
13732 full-vector alignment. This isn't true for generic GCC vectors
13733 that are wider than the ABI maximum of 128 bits. */
13734 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
13735 && (wi::to_widest (TYPE_SIZE (type
))
13736 != aarch64_vectorize_preferred_vector_alignment (type
)))
13739 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13743 /* Return true if the vector misalignment factor is supported by the
13746 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
13747 const_tree type
, int misalignment
,
13750 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
13752 /* Return if movmisalign pattern is not supported for this mode. */
13753 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
13756 /* Misalignment factor is unknown at compile time. */
13757 if (misalignment
== -1)
13760 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
13764 /* If VALS is a vector constant that can be loaded into a register
13765 using DUP, generate instructions to do so and return an RTX to
13766 assign to the register. Otherwise return NULL_RTX. */
13768 aarch64_simd_dup_constant (rtx vals
)
13770 machine_mode mode
= GET_MODE (vals
);
13771 machine_mode inner_mode
= GET_MODE_INNER (mode
);
13774 if (!const_vec_duplicate_p (vals
, &x
))
13777 /* We can load this constant by using DUP and a constant in a
13778 single ARM register. This will be cheaper than a vector
13780 x
= copy_to_mode_reg (inner_mode
, x
);
13781 return gen_vec_duplicate (mode
, x
);
13785 /* Generate code to load VALS, which is a PARALLEL containing only
13786 constants (for vec_init) or CONST_VECTOR, efficiently into a
13787 register. Returns an RTX to copy into the register, or NULL_RTX
13788 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13790 aarch64_simd_make_constant (rtx vals
)
13792 machine_mode mode
= GET_MODE (vals
);
13794 rtx const_vec
= NULL_RTX
;
13798 if (GET_CODE (vals
) == CONST_VECTOR
)
13800 else if (GET_CODE (vals
) == PARALLEL
)
13802 /* A CONST_VECTOR must contain only CONST_INTs and
13803 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13804 Only store valid constants in a CONST_VECTOR. */
13805 int n_elts
= XVECLEN (vals
, 0);
13806 for (i
= 0; i
< n_elts
; ++i
)
13808 rtx x
= XVECEXP (vals
, 0, i
);
13809 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13812 if (n_const
== n_elts
)
13813 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
13816 gcc_unreachable ();
13818 if (const_vec
!= NULL_RTX
13819 && aarch64_simd_valid_immediate (const_vec
, NULL
))
13820 /* Load using MOVI/MVNI. */
13822 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
13823 /* Loaded using DUP. */
13825 else if (const_vec
!= NULL_RTX
)
13826 /* Load from constant pool. We can not take advantage of single-cycle
13827 LD1 because we need a PC-relative addressing mode. */
13830 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13831 We can not construct an initializer. */
13835 /* Expand a vector initialisation sequence, such that TARGET is
13836 initialised to contain VALS. */
13839 aarch64_expand_vector_init (rtx target
, rtx vals
)
13841 machine_mode mode
= GET_MODE (target
);
13842 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
13843 /* The number of vector elements. */
13844 int n_elts
= XVECLEN (vals
, 0);
13845 /* The number of vector elements which are not constant. */
13847 rtx any_const
= NULL_RTX
;
13848 /* The first element of vals. */
13849 rtx v0
= XVECEXP (vals
, 0, 0);
13850 bool all_same
= true;
13852 /* Count the number of variable elements to initialise. */
13853 for (int i
= 0; i
< n_elts
; ++i
)
13855 rtx x
= XVECEXP (vals
, 0, i
);
13856 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
13861 all_same
&= rtx_equal_p (x
, v0
);
13864 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13865 how best to handle this. */
13868 rtx constant
= aarch64_simd_make_constant (vals
);
13869 if (constant
!= NULL_RTX
)
13871 emit_move_insn (target
, constant
);
13876 /* Splat a single non-constant element if we can. */
13879 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
13880 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13884 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
13885 gcc_assert (icode
!= CODE_FOR_nothing
);
13887 /* If there are only variable elements, try to optimize
13888 the insertion using dup for the most common element
13889 followed by insertions. */
13891 /* The algorithm will fill matches[*][0] with the earliest matching element,
13892 and matches[X][1] with the count of duplicate elements (if X is the
13893 earliest element which has duplicates). */
13895 if (n_var
== n_elts
&& n_elts
<= 16)
13897 int matches
[16][2] = {0};
13898 for (int i
= 0; i
< n_elts
; i
++)
13900 for (int j
= 0; j
<= i
; j
++)
13902 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
13910 int maxelement
= 0;
13912 for (int i
= 0; i
< n_elts
; i
++)
13913 if (matches
[i
][1] > maxv
)
13916 maxv
= matches
[i
][1];
13919 /* Create a duplicate of the most common element, unless all elements
13920 are equally useless to us, in which case just immediately set the
13921 vector register using the first element. */
13925 /* For vectors of two 64-bit elements, we can do even better. */
13927 && (inner_mode
== E_DImode
13928 || inner_mode
== E_DFmode
))
13931 rtx x0
= XVECEXP (vals
, 0, 0);
13932 rtx x1
= XVECEXP (vals
, 0, 1);
13933 /* Combine can pick up this case, but handling it directly
13934 here leaves clearer RTL.
13936 This is load_pair_lanes<mode>, and also gives us a clean-up
13937 for store_pair_lanes<mode>. */
13938 if (memory_operand (x0
, inner_mode
)
13939 && memory_operand (x1
, inner_mode
)
13940 && !STRICT_ALIGNMENT
13941 && rtx_equal_p (XEXP (x1
, 0),
13942 plus_constant (Pmode
,
13944 GET_MODE_SIZE (inner_mode
))))
13947 if (inner_mode
== DFmode
)
13948 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
13950 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
13955 /* The subreg-move sequence below will move into lane zero of the
13956 vector register. For big-endian we want that position to hold
13957 the last element of VALS. */
13958 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
13959 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
13960 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
13964 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
13965 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13968 /* Insert the rest. */
13969 for (int i
= 0; i
< n_elts
; i
++)
13971 rtx x
= XVECEXP (vals
, 0, i
);
13972 if (matches
[i
][0] == maxelement
)
13974 x
= copy_to_mode_reg (inner_mode
, x
);
13975 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
13980 /* Initialise a vector which is part-variable. We want to first try
13981 to build those lanes which are constant in the most efficient way we
13983 if (n_var
!= n_elts
)
13985 rtx copy
= copy_rtx (vals
);
13987 /* Load constant part of vector. We really don't care what goes into the
13988 parts we will overwrite, but we're more likely to be able to load the
13989 constant efficiently if it has fewer, larger, repeating parts
13990 (see aarch64_simd_valid_immediate). */
13991 for (int i
= 0; i
< n_elts
; i
++)
13993 rtx x
= XVECEXP (vals
, 0, i
);
13994 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13996 rtx subst
= any_const
;
13997 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
13999 /* Look in the copied vector, as more elements are const. */
14000 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
14001 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
14007 XVECEXP (copy
, 0, i
) = subst
;
14009 aarch64_expand_vector_init (target
, copy
);
14012 /* Insert the variable lanes directly. */
14013 for (int i
= 0; i
< n_elts
; i
++)
14015 rtx x
= XVECEXP (vals
, 0, i
);
14016 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14018 x
= copy_to_mode_reg (inner_mode
, x
);
14019 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
14023 static unsigned HOST_WIDE_INT
14024 aarch64_shift_truncation_mask (machine_mode mode
)
14026 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
14028 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
14031 /* Select a format to encode pointers in exception handling data. */
14033 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
14036 switch (aarch64_cmodel
)
14038 case AARCH64_CMODEL_TINY
:
14039 case AARCH64_CMODEL_TINY_PIC
:
14040 case AARCH64_CMODEL_SMALL
:
14041 case AARCH64_CMODEL_SMALL_PIC
:
14042 case AARCH64_CMODEL_SMALL_SPIC
:
14043 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14045 type
= DW_EH_PE_sdata4
;
14048 /* No assumptions here. 8-byte relocs required. */
14049 type
= DW_EH_PE_sdata8
;
14052 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
14055 /* The last .arch and .tune assembly strings that we printed. */
14056 static std::string aarch64_last_printed_arch_string
;
14057 static std::string aarch64_last_printed_tune_string
;
14059 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14060 by the function fndecl. */
14063 aarch64_declare_function_name (FILE *stream
, const char* name
,
14066 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14068 struct cl_target_option
*targ_options
;
14070 targ_options
= TREE_TARGET_OPTION (target_parts
);
14072 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
14073 gcc_assert (targ_options
);
14075 const struct processor
*this_arch
14076 = aarch64_get_arch (targ_options
->x_explicit_arch
);
14078 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
14079 std::string extension
14080 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
14082 /* Only update the assembler .arch string if it is distinct from the last
14083 such string we printed. */
14084 std::string to_print
= this_arch
->name
+ extension
;
14085 if (to_print
!= aarch64_last_printed_arch_string
)
14087 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
14088 aarch64_last_printed_arch_string
= to_print
;
14091 /* Print the cpu name we're tuning for in the comments, might be
14092 useful to readers of the generated asm. Do it only when it changes
14093 from function to function and verbose assembly is requested. */
14094 const struct processor
*this_tune
14095 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
14097 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
14099 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
14101 aarch64_last_printed_tune_string
= this_tune
->name
;
14104 /* Don't forget the type directive for ELF. */
14105 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
14106 ASM_OUTPUT_LABEL (stream
, name
);
14109 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14112 aarch64_start_file (void)
14114 struct cl_target_option
*default_options
14115 = TREE_TARGET_OPTION (target_option_default_node
);
14117 const struct processor
*default_arch
14118 = aarch64_get_arch (default_options
->x_explicit_arch
);
14119 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
14120 std::string extension
14121 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
14122 default_arch
->flags
);
14124 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
14125 aarch64_last_printed_tune_string
= "";
14126 asm_fprintf (asm_out_file
, "\t.arch %s\n",
14127 aarch64_last_printed_arch_string
.c_str ());
14129 default_file_start ();
14132 /* Emit load exclusive. */
14135 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
14136 rtx mem
, rtx model_rtx
)
14138 rtx (*gen
) (rtx
, rtx
, rtx
);
14142 case E_QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
14143 case E_HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
14144 case E_SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
14145 case E_DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
14147 gcc_unreachable ();
14150 emit_insn (gen (rval
, mem
, model_rtx
));
14153 /* Emit store exclusive. */
14156 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
14157 rtx rval
, rtx mem
, rtx model_rtx
)
14159 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14163 case E_QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
14164 case E_HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
14165 case E_SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
14166 case E_DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
14168 gcc_unreachable ();
14171 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
14174 /* Mark the previous jump instruction as unlikely. */
14177 aarch64_emit_unlikely_jump (rtx insn
)
14179 rtx_insn
*jump
= emit_jump_insn (insn
);
14180 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
14183 /* Expand a compare and swap pattern. */
14186 aarch64_expand_compare_and_swap (rtx operands
[])
14188 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
14189 machine_mode mode
, cmp_mode
;
14190 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
14193 const gen_cas_fn split_cas
[] =
14195 gen_aarch64_compare_and_swapqi
,
14196 gen_aarch64_compare_and_swaphi
,
14197 gen_aarch64_compare_and_swapsi
,
14198 gen_aarch64_compare_and_swapdi
14200 const gen_cas_fn atomic_cas
[] =
14202 gen_aarch64_compare_and_swapqi_lse
,
14203 gen_aarch64_compare_and_swaphi_lse
,
14204 gen_aarch64_compare_and_swapsi_lse
,
14205 gen_aarch64_compare_and_swapdi_lse
14208 bval
= operands
[0];
14209 rval
= operands
[1];
14211 oldval
= operands
[3];
14212 newval
= operands
[4];
14213 is_weak
= operands
[5];
14214 mod_s
= operands
[6];
14215 mod_f
= operands
[7];
14216 mode
= GET_MODE (mem
);
14219 /* Normally the succ memory model must be stronger than fail, but in the
14220 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14221 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14223 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
14224 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
14225 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
14231 /* For short modes, we're going to perform the comparison in SImode,
14232 so do the zero-extension now. */
14234 rval
= gen_reg_rtx (SImode
);
14235 oldval
= convert_modes (SImode
, mode
, oldval
, true);
14236 /* Fall through. */
14240 /* Force the value into a register if needed. */
14241 if (!aarch64_plus_operand (oldval
, mode
))
14242 oldval
= force_reg (cmp_mode
, oldval
);
14246 gcc_unreachable ();
14251 case E_QImode
: idx
= 0; break;
14252 case E_HImode
: idx
= 1; break;
14253 case E_SImode
: idx
= 2; break;
14254 case E_DImode
: idx
= 3; break;
14256 gcc_unreachable ();
14259 gen
= atomic_cas
[idx
];
14261 gen
= split_cas
[idx
];
14263 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
14265 if (mode
== QImode
|| mode
== HImode
)
14266 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
14268 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14269 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
14270 emit_insn (gen_rtx_SET (bval
, x
));
14273 /* Test whether the target supports using a atomic load-operate instruction.
14274 CODE is the operation and AFTER is TRUE if the data in memory after the
14275 operation should be returned and FALSE if the data before the operation
14276 should be returned. Returns FALSE if the operation isn't supported by the
14280 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
14299 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14300 sequence implementing an atomic operation. */
14303 aarch64_emit_post_barrier (enum memmodel model
)
14305 const enum memmodel base_model
= memmodel_base (model
);
14307 if (is_mm_sync (model
)
14308 && (base_model
== MEMMODEL_ACQUIRE
14309 || base_model
== MEMMODEL_ACQ_REL
14310 || base_model
== MEMMODEL_SEQ_CST
))
14312 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
14316 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14317 for the data in memory. EXPECTED is the value expected to be in memory.
14318 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14319 is the memory ordering to use. */
14322 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
14323 rtx expected
, rtx desired
,
14326 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14329 mode
= GET_MODE (mem
);
14333 case E_QImode
: gen
= gen_aarch64_atomic_casqi
; break;
14334 case E_HImode
: gen
= gen_aarch64_atomic_cashi
; break;
14335 case E_SImode
: gen
= gen_aarch64_atomic_cassi
; break;
14336 case E_DImode
: gen
= gen_aarch64_atomic_casdi
; break;
14338 gcc_unreachable ();
14341 /* Move the expected value into the CAS destination register. */
14342 emit_insn (gen_rtx_SET (rval
, expected
));
14344 /* Emit the CAS. */
14345 emit_insn (gen (rval
, mem
, desired
, model
));
14347 /* Compare the expected value with the value loaded by the CAS, to establish
14348 whether the swap was made. */
14349 aarch64_gen_compare_reg (EQ
, rval
, expected
);
14352 /* Split a compare and swap pattern. */
14355 aarch64_split_compare_and_swap (rtx operands
[])
14357 rtx rval
, mem
, oldval
, newval
, scratch
;
14360 rtx_code_label
*label1
, *label2
;
14362 enum memmodel model
;
14365 rval
= operands
[0];
14367 oldval
= operands
[2];
14368 newval
= operands
[3];
14369 is_weak
= (operands
[4] != const0_rtx
);
14370 model_rtx
= operands
[5];
14371 scratch
= operands
[7];
14372 mode
= GET_MODE (mem
);
14373 model
= memmodel_from_int (INTVAL (model_rtx
));
14375 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14378 LD[A]XR rval, [mem]
14380 ST[L]XR scratch, newval, [mem]
14381 CBNZ scratch, .label1
14384 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
14389 label1
= gen_label_rtx ();
14390 emit_label (label1
);
14392 label2
= gen_label_rtx ();
14394 /* The initial load can be relaxed for a __sync operation since a final
14395 barrier will be emitted to stop code hoisting. */
14396 if (is_mm_sync (model
))
14397 aarch64_emit_load_exclusive (mode
, rval
, mem
,
14398 GEN_INT (MEMMODEL_RELAXED
));
14400 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
14404 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
14405 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14406 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14407 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14411 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
14412 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14413 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14414 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14415 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14418 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
14422 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
14423 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14424 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
14425 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14429 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14430 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
14431 emit_insn (gen_rtx_SET (cond
, x
));
14434 emit_label (label2
);
14435 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14436 to set the condition flags. If this is not used it will be removed by
14440 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14441 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
14442 emit_insn (gen_rtx_SET (cond
, x
));
14444 /* Emit any final barrier needed for a __sync operation. */
14445 if (is_mm_sync (model
))
14446 aarch64_emit_post_barrier (model
);
14449 /* Emit a BIC instruction. */
14452 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
14454 rtx shift_rtx
= GEN_INT (shift
);
14455 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14459 case E_SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
14460 case E_DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
14462 gcc_unreachable ();
14465 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
14468 /* Emit an atomic swap. */
14471 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
14472 rtx mem
, rtx model
)
14474 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14478 case E_QImode
: gen
= gen_aarch64_atomic_swpqi
; break;
14479 case E_HImode
: gen
= gen_aarch64_atomic_swphi
; break;
14480 case E_SImode
: gen
= gen_aarch64_atomic_swpsi
; break;
14481 case E_DImode
: gen
= gen_aarch64_atomic_swpdi
; break;
14483 gcc_unreachable ();
14486 emit_insn (gen (dst
, mem
, value
, model
));
/* Operations supported by aarch64_emit_atomic_load_op.  */

enum aarch64_atomic_load_op_code
{
  AARCH64_LDOP_PLUS,	/* A + B.  */
  AARCH64_LDOP_XOR,	/* A ^ B.  */
  AARCH64_LDOP_OR,	/* A | B.  */
  AARCH64_LDOP_BIC	/* A & ~B.  */
};
14499 /* Emit an atomic load-operate. */
14502 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
14503 machine_mode mode
, rtx dst
, rtx src
,
14504 rtx mem
, rtx model
)
14506 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
14507 const aarch64_atomic_load_op_fn plus
[] =
14509 gen_aarch64_atomic_loadaddqi
,
14510 gen_aarch64_atomic_loadaddhi
,
14511 gen_aarch64_atomic_loadaddsi
,
14512 gen_aarch64_atomic_loadadddi
14514 const aarch64_atomic_load_op_fn eor
[] =
14516 gen_aarch64_atomic_loadeorqi
,
14517 gen_aarch64_atomic_loadeorhi
,
14518 gen_aarch64_atomic_loadeorsi
,
14519 gen_aarch64_atomic_loadeordi
14521 const aarch64_atomic_load_op_fn ior
[] =
14523 gen_aarch64_atomic_loadsetqi
,
14524 gen_aarch64_atomic_loadsethi
,
14525 gen_aarch64_atomic_loadsetsi
,
14526 gen_aarch64_atomic_loadsetdi
14528 const aarch64_atomic_load_op_fn bic
[] =
14530 gen_aarch64_atomic_loadclrqi
,
14531 gen_aarch64_atomic_loadclrhi
,
14532 gen_aarch64_atomic_loadclrsi
,
14533 gen_aarch64_atomic_loadclrdi
14535 aarch64_atomic_load_op_fn gen
;
14540 case E_QImode
: idx
= 0; break;
14541 case E_HImode
: idx
= 1; break;
14542 case E_SImode
: idx
= 2; break;
14543 case E_DImode
: idx
= 3; break;
14545 gcc_unreachable ();
14550 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
14551 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
14552 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
14553 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
14555 gcc_unreachable ();
14558 emit_insn (gen (dst
, mem
, src
, model
));
14561 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14562 location to store the data read from memory. OUT_RESULT is the location to
14563 store the result of the operation. MEM is the memory location to read and
14564 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14565 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14569 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
14570 rtx mem
, rtx value
, rtx model_rtx
)
14572 machine_mode mode
= GET_MODE (mem
);
14573 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14574 const bool short_mode
= (mode
< SImode
);
14575 aarch64_atomic_load_op_code ldop_code
;
14580 out_data
= gen_lowpart (mode
, out_data
);
14583 out_result
= gen_lowpart (mode
, out_result
);
14585 /* Make sure the value is in a register, putting it into a destination
14586 register if it needs to be manipulated. */
14587 if (!register_operand (value
, mode
)
14588 || code
== AND
|| code
== MINUS
)
14590 src
= out_result
? out_result
: out_data
;
14591 emit_move_insn (src
, gen_lowpart (mode
, value
));
14595 gcc_assert (register_operand (src
, mode
));
14597 /* Preprocess the data for the operation as necessary. If the operation is
14598 a SET then emit a swap instruction and finish. */
14602 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
14606 /* Negate the value and treat it as a PLUS. */
14610 /* Resize the value if necessary. */
14612 src
= gen_lowpart (wmode
, src
);
14614 neg_src
= gen_rtx_NEG (wmode
, src
);
14615 emit_insn (gen_rtx_SET (src
, neg_src
));
14618 src
= gen_lowpart (mode
, src
);
14620 /* Fall-through. */
14622 ldop_code
= AARCH64_LDOP_PLUS
;
14626 ldop_code
= AARCH64_LDOP_OR
;
14630 ldop_code
= AARCH64_LDOP_XOR
;
14637 /* Resize the value if necessary. */
14639 src
= gen_lowpart (wmode
, src
);
14641 not_src
= gen_rtx_NOT (wmode
, src
);
14642 emit_insn (gen_rtx_SET (src
, not_src
));
14645 src
= gen_lowpart (mode
, src
);
14647 ldop_code
= AARCH64_LDOP_BIC
;
14651 /* The operation can't be done with atomic instructions. */
14652 gcc_unreachable ();
14655 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
14657 /* If necessary, calculate the data in memory after the update by redoing the
14658 operation from values in registers. */
14664 src
= gen_lowpart (wmode
, src
);
14665 out_data
= gen_lowpart (wmode
, out_data
);
14666 out_result
= gen_lowpart (wmode
, out_result
);
14675 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
14678 x
= gen_rtx_IOR (wmode
, out_data
, src
);
14681 x
= gen_rtx_XOR (wmode
, out_data
, src
);
14684 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
14687 gcc_unreachable ();
14690 emit_set_insn (out_result
, x
);
14695 /* Split an atomic operation. */
14698 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
14699 rtx value
, rtx model_rtx
, rtx cond
)
14701 machine_mode mode
= GET_MODE (mem
);
14702 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14703 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
14704 const bool is_sync
= is_mm_sync (model
);
14705 rtx_code_label
*label
;
14708 /* Split the atomic operation into a sequence. */
14709 label
= gen_label_rtx ();
14710 emit_label (label
);
14713 new_out
= gen_lowpart (wmode
, new_out
);
14715 old_out
= gen_lowpart (wmode
, old_out
);
14718 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
14720 /* The initial load can be relaxed for a __sync operation since a final
14721 barrier will be emitted to stop code hoisting. */
14723 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
14724 GEN_INT (MEMMODEL_RELAXED
));
14726 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
14735 x
= gen_rtx_AND (wmode
, old_out
, value
);
14736 emit_insn (gen_rtx_SET (new_out
, x
));
14737 x
= gen_rtx_NOT (wmode
, new_out
);
14738 emit_insn (gen_rtx_SET (new_out
, x
));
14742 if (CONST_INT_P (value
))
14744 value
= GEN_INT (-INTVAL (value
));
14747 /* Fall through. */
14750 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
14751 emit_insn (gen_rtx_SET (new_out
, x
));
14755 aarch64_emit_store_exclusive (mode
, cond
, mem
,
14756 gen_lowpart (mode
, new_out
), model_rtx
);
14758 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14759 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14760 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
14761 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14763 /* Emit any final barrier needed for a __sync operation. */
14765 aarch64_emit_post_barrier (model
);
14769 aarch64_init_libfuncs (void)
14771 /* Half-precision float operations. The compiler handles all operations
14772 with NULL libfuncs by converting to SFmode. */
14775 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
14776 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
14779 set_optab_libfunc (add_optab
, HFmode
, NULL
);
14780 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
14781 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
14782 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
14783 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
14786 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
14787 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
14788 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
14789 set_optab_libfunc (le_optab
, HFmode
, NULL
);
14790 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
14791 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
14792 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
14795 /* Target hook for c_mode_for_suffix. */
14796 static machine_mode
14797 aarch64_c_mode_for_suffix (char suffix
)
14805 /* We can only represent floating point constants which will fit in
14806 "quarter-precision" values. These values are characterised by
14807 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
14810 (-1)^s * (n/16) * 2^r
14813 's' is the sign bit.
14814 'n' is an integer in the range 16 <= n <= 31.
14815 'r' is an integer in the range -3 <= r <= 4. */
14817 /* Return true iff X can be represented by a quarter-precision
14818 floating point immediate operand X. Note, we cannot represent 0.0. */
14820 aarch64_float_const_representable_p (rtx x
)
14822 /* This represents our current view of how many bits
14823 make up the mantissa. */
14824 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
14826 unsigned HOST_WIDE_INT mantissa
, mask
;
14827 REAL_VALUE_TYPE r
, m
;
14830 if (!CONST_DOUBLE_P (x
))
14833 /* We don't support HFmode constants yet. */
14834 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
14837 r
= *CONST_DOUBLE_REAL_VALUE (x
);
14839 /* We cannot represent infinities, NaNs or +/-zero. We won't
14840 know if we have +zero until we analyse the mantissa, but we
14841 can reject the other invalid values. */
14842 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
14843 || REAL_VALUE_MINUS_ZERO (r
))
14846 /* Extract exponent. */
14847 r
= real_value_abs (&r
);
14848 exponent
= REAL_EXP (&r
);
14850 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14851 highest (sign) bit, with a fixed binary point at bit point_pos.
14852 m1 holds the low part of the mantissa, m2 the high part.
14853 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14854 bits for the mantissa, this can fail (low bits will be lost). */
14855 real_ldexp (&m
, &r
, point_pos
- exponent
);
14856 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
14858 /* If the low part of the mantissa has bits set we cannot represent
14860 if (w
.ulow () != 0)
14862 /* We have rejected the lower HOST_WIDE_INT, so update our
14863 understanding of how many bits lie in the mantissa and
14864 look only at the high HOST_WIDE_INT. */
14865 mantissa
= w
.elt (1);
14866 point_pos
-= HOST_BITS_PER_WIDE_INT
;
14868 /* We can only represent values with a mantissa of the form 1.xxxx. */
14869 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
14870 if ((mantissa
& mask
) != 0)
14873 /* Having filtered unrepresentable values, we may now remove all
14874 but the highest 5 bits. */
14875 mantissa
>>= point_pos
- 5;
14877 /* We cannot represent the value 0.0, so reject it. This is handled
14882 /* Then, as bit 4 is always set, we can mask it off, leaving
14883 the mantissa in the range [0, 15]. */
14884 mantissa
&= ~(1 << 4);
14885 gcc_assert (mantissa
<= 15);
14887 /* GCC internally does not use IEEE754-like encoding (where normalized
14888 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14889 Our mantissa values are shifted 4 places to the left relative to
14890 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14891 by 5 places to correct for GCC's representation. */
14892 exponent
= 5 - exponent
;
14894 return (exponent
>= 0 && exponent
<= 7);
14897 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14898 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14899 output MOVI/MVNI, ORR or BIC immediate. */
14901 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
14902 enum simd_immediate_check which
)
14905 static char templ
[40];
14906 const char *mnemonic
;
14907 const char *shift_op
;
14908 unsigned int lane_count
= 0;
14911 struct simd_immediate_info info
;
14913 /* This will return true to show const_vector is legal for use as either
14914 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14915 It will also update INFO to show how the immediate should be generated.
14916 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14917 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
14918 gcc_assert (is_valid
);
14920 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14921 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
14923 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14925 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
14926 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14927 move immediate path. */
14928 if (aarch64_float_const_zero_rtx_p (info
.value
))
14929 info
.value
= GEN_INT (0);
14932 const unsigned int buf_size
= 20;
14933 char float_buf
[buf_size
] = {'\0'};
14934 real_to_decimal_for_mode (float_buf
,
14935 CONST_DOUBLE_REAL_VALUE (info
.value
),
14936 buf_size
, buf_size
, 1, info
.elt_mode
);
14938 if (lane_count
== 1)
14939 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
14941 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
14942 lane_count
, element_char
, float_buf
);
14947 gcc_assert (CONST_INT_P (info
.value
));
14949 if (which
== AARCH64_CHECK_MOV
)
14951 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
14952 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
14953 if (lane_count
== 1)
14954 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
14955 mnemonic
, UINTVAL (info
.value
));
14956 else if (info
.shift
)
14957 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14958 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
14959 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
14961 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14962 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
14963 element_char
, UINTVAL (info
.value
));
14967 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14968 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
14970 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14971 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
14972 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
14974 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14975 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
14976 element_char
, UINTVAL (info
.value
));
14982 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
14985 /* If a floating point number was passed and we desire to use it in an
14986 integer mode do the conversion to integer. */
14987 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
14989 unsigned HOST_WIDE_INT ival
;
14990 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
14991 gcc_unreachable ();
14992 immediate
= gen_int_mode (ival
, mode
);
14995 machine_mode vmode
;
14996 /* use a 64 bit mode for everything except for DI/DF mode, where we use
14997 a 128 bit vector mode. */
14998 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
15000 vmode
= aarch64_simd_container_mode (mode
, width
);
15001 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
15002 return aarch64_output_simd_mov_immediate (v_op
, width
);
15005 /* Return the output string to use for moving immediate CONST_VECTOR
15006 into an SVE register. */
15009 aarch64_output_sve_mov_immediate (rtx const_vector
)
15011 static char templ
[40];
15012 struct simd_immediate_info info
;
15015 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
15016 gcc_assert (is_valid
);
15018 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
15022 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
15023 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
15024 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
15028 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
15030 if (aarch64_float_const_zero_rtx_p (info
.value
))
15031 info
.value
= GEN_INT (0);
15034 const int buf_size
= 20;
15035 char float_buf
[buf_size
] = {};
15036 real_to_decimal_for_mode (float_buf
,
15037 CONST_DOUBLE_REAL_VALUE (info
.value
),
15038 buf_size
, buf_size
, 1, info
.elt_mode
);
15040 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
15041 element_char
, float_buf
);
15046 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
15047 element_char
, INTVAL (info
.value
));
15051 /* Return the asm format for a PTRUE instruction whose destination has
15052 mode MODE. SUFFIX is the element size suffix. */
15055 aarch64_output_ptrue (machine_mode mode
, char suffix
)
15057 unsigned int nunits
;
15058 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
15059 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
15060 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
15062 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
15066 /* Split operands into moves from op[1] + op[2] into op[0]. */
15069 aarch64_split_combinev16qi (rtx operands
[3])
15071 unsigned int dest
= REGNO (operands
[0]);
15072 unsigned int src1
= REGNO (operands
[1]);
15073 unsigned int src2
= REGNO (operands
[2]);
15074 machine_mode halfmode
= GET_MODE (operands
[1]);
15075 unsigned int halfregs
= REG_NREGS (operands
[1]);
15076 rtx destlo
, desthi
;
15078 gcc_assert (halfmode
== V16QImode
);
15080 if (src1
== dest
&& src2
== dest
+ halfregs
)
15082 /* No-op move. Can't split to nothing; emit something. */
15083 emit_note (NOTE_INSN_DELETED
);
15087 /* Preserve register attributes for variable tracking. */
15088 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
15089 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
15090 GET_MODE_SIZE (halfmode
));
15092 /* Special case of reversed high/low parts. */
15093 if (reg_overlap_mentioned_p (operands
[2], destlo
)
15094 && reg_overlap_mentioned_p (operands
[1], desthi
))
15096 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15097 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
15098 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15100 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
15102 /* Try to avoid unnecessary moves if part of the result
15103 is in the right place already. */
15105 emit_move_insn (destlo
, operands
[1]);
15106 if (src2
!= dest
+ halfregs
)
15107 emit_move_insn (desthi
, operands
[2]);
15111 if (src2
!= dest
+ halfregs
)
15112 emit_move_insn (desthi
, operands
[2]);
15114 emit_move_insn (destlo
, operands
[1]);
15118 /* vec_perm support. */
15120 struct expand_vec_perm_d
15122 rtx target
, op0
, op1
;
15123 vec_perm_indices perm
;
15124 machine_mode vmode
;
15125 unsigned int vec_flags
;
15130 /* Generate a variable permutation. */
15133 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15135 machine_mode vmode
= GET_MODE (target
);
15136 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15138 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
15139 gcc_checking_assert (GET_MODE (op0
) == vmode
);
15140 gcc_checking_assert (GET_MODE (op1
) == vmode
);
15141 gcc_checking_assert (GET_MODE (sel
) == vmode
);
15142 gcc_checking_assert (TARGET_SIMD
);
15146 if (vmode
== V8QImode
)
15148 /* Expand the argument to a V16QI mode by duplicating it. */
15149 rtx pair
= gen_reg_rtx (V16QImode
);
15150 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
15151 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15155 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
15162 if (vmode
== V8QImode
)
15164 pair
= gen_reg_rtx (V16QImode
);
15165 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
15166 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15170 pair
= gen_reg_rtx (OImode
);
15171 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
15172 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
15177 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15178 NELT is the number of elements in the vector. */
15181 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
15184 machine_mode vmode
= GET_MODE (target
);
15185 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15188 /* The TBL instruction does not use a modulo index, so we must take care
15189 of that ourselves. */
15190 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
15191 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
15192 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
15194 /* For big-endian, we also need to reverse the index within the vector
15195 (but not which vector). */
15196 if (BYTES_BIG_ENDIAN
)
15198 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15200 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
15201 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
15202 NULL
, 0, OPTAB_LIB_WIDEN
);
15204 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
15207 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15210 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
15212 emit_insn (gen_rtx_SET (target
,
15213 gen_rtx_UNSPEC (GET_MODE (target
),
15214 gen_rtvec (2, op0
, op1
), code
)));
15217 /* Expand an SVE vec_perm with the given operands. */
15220 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15222 machine_mode data_mode
= GET_MODE (target
);
15223 machine_mode sel_mode
= GET_MODE (sel
);
15224 /* Enforced by the pattern condition. */
15225 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
15227 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15228 size of the two value vectors, i.e. the upper bits of the indices
15229 are effectively ignored. SVE TBL instead produces 0 for any
15230 out-of-range indices, so we need to modulo all the vec_perm indices
15231 to ensure they are all in range. */
15232 rtx sel_reg
= force_reg (sel_mode
, sel
);
15234 /* Check if the sel only references the first values vector. */
15235 if (GET_CODE (sel
) == CONST_VECTOR
15236 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
15238 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
15242 /* Check if the two values vectors are the same. */
15243 if (rtx_equal_p (op0
, op1
))
15245 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
15246 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15247 NULL
, 0, OPTAB_DIRECT
);
15248 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
15252 /* Run TBL on for each value vector and combine the results. */
15254 rtx res0
= gen_reg_rtx (data_mode
);
15255 rtx res1
= gen_reg_rtx (data_mode
);
15256 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
15257 if (GET_CODE (sel
) != CONST_VECTOR
15258 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
15260 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
15262 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15263 NULL
, 0, OPTAB_DIRECT
);
15265 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
15266 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
15267 NULL
, 0, OPTAB_DIRECT
);
15268 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
15269 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
15270 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
15272 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
15275 /* Recognize patterns suitable for the TRN instructions. */
15277 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
15280 poly_uint64 nelt
= d
->perm
.length ();
15281 rtx out
, in0
, in1
, x
;
15282 machine_mode vmode
= d
->vmode
;
15284 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15287 /* Note that these are little-endian tests.
15288 We correct for big-endian later. */
15289 if (!d
->perm
[0].is_constant (&odd
)
15290 || (odd
!= 0 && odd
!= 1)
15291 || !d
->perm
.series_p (0, 2, odd
, 2)
15292 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
15301 /* We don't need a big-endian lane correction for SVE; see the comment
15302 at the head of aarch64-sve.md for details. */
15303 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15305 x
= in0
, in0
= in1
, in1
= x
;
15310 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15311 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
15315 /* Recognize patterns suitable for the UZP instructions. */
15317 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
15320 rtx out
, in0
, in1
, x
;
15321 machine_mode vmode
= d
->vmode
;
15323 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15326 /* Note that these are little-endian tests.
15327 We correct for big-endian later. */
15328 if (!d
->perm
[0].is_constant (&odd
)
15329 || (odd
!= 0 && odd
!= 1)
15330 || !d
->perm
.series_p (0, 1, odd
, 2))
15339 /* We don't need a big-endian lane correction for SVE; see the comment
15340 at the head of aarch64-sve.md for details. */
15341 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15343 x
= in0
, in0
= in1
, in1
= x
;
15348 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15349 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
15353 /* Recognize patterns suitable for the ZIP instructions. */
15355 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
15358 poly_uint64 nelt
= d
->perm
.length ();
15359 rtx out
, in0
, in1
, x
;
15360 machine_mode vmode
= d
->vmode
;
15362 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15365 /* Note that these are little-endian tests.
15366 We correct for big-endian later. */
15367 poly_uint64 first
= d
->perm
[0];
15368 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
15369 || !d
->perm
.series_p (0, 2, first
, 1)
15370 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
15372 high
= maybe_ne (first
, 0U);
15380 /* We don't need a big-endian lane correction for SVE; see the comment
15381 at the head of aarch64-sve.md for details. */
15382 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15384 x
= in0
, in0
= in1
, in1
= x
;
15389 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15390 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
15394 /* Recognize patterns for the EXT insn. */
15397 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
15399 HOST_WIDE_INT location
;
15402 /* The first element always refers to the first vector.
15403 Check if the extracted indices are increasing by one. */
15404 if (d
->vec_flags
== VEC_SVE_PRED
15405 || !d
->perm
[0].is_constant (&location
)
15406 || !d
->perm
.series_p (0, 1, location
, 1))
15413 /* The case where (location == 0) is a no-op for both big- and little-endian,
15414 and is removed by the mid-end at optimization levels -O1 and higher.
15416 We don't need a big-endian lane correction for SVE; see the comment
15417 at the head of aarch64-sve.md for details. */
15418 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
15420 /* After setup, we want the high elements of the first vector (stored
15421 at the LSB end of the register), and the low elements of the second
15422 vector (stored at the MSB end of the register). So swap. */
15423 std::swap (d
->op0
, d
->op1
);
15424 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15425 to_constant () is safe since this is restricted to Advanced SIMD
15427 location
= d
->perm
.length ().to_constant () - location
;
15430 offset
= GEN_INT (location
);
15431 emit_set_insn (d
->target
,
15432 gen_rtx_UNSPEC (d
->vmode
,
15433 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
15438 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15439 within each 64-bit, 32-bit or 16-bit granule. */
15442 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
15444 HOST_WIDE_INT diff
;
15445 unsigned int i
, size
, unspec
;
15446 machine_mode pred_mode
;
15448 if (d
->vec_flags
== VEC_SVE_PRED
15449 || !d
->one_vector_p
15450 || !d
->perm
[0].is_constant (&diff
))
15453 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
15456 unspec
= UNSPEC_REV64
;
15457 pred_mode
= VNx2BImode
;
15459 else if (size
== 4)
15461 unspec
= UNSPEC_REV32
;
15462 pred_mode
= VNx4BImode
;
15464 else if (size
== 2)
15466 unspec
= UNSPEC_REV16
;
15467 pred_mode
= VNx8BImode
;
15472 unsigned int step
= diff
+ 1;
15473 for (i
= 0; i
< step
; ++i
)
15474 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
15481 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
15482 if (d
->vec_flags
== VEC_SVE_DATA
)
15484 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15485 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
15486 UNSPEC_MERGE_PTRUE
);
15488 emit_set_insn (d
->target
, src
);
15492 /* Recognize patterns for the REV insn, which reverses elements within
15496 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
15498 poly_uint64 nelt
= d
->perm
.length ();
15500 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
15503 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
15510 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
15511 emit_set_insn (d
->target
, src
);
15516 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
15518 rtx out
= d
->target
;
15521 machine_mode vmode
= d
->vmode
;
15524 if (d
->vec_flags
== VEC_SVE_PRED
15525 || d
->perm
.encoding ().encoded_nelts () != 1
15526 || !d
->perm
[0].is_constant (&elt
))
15529 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
15536 /* The generic preparation in aarch64_expand_vec_perm_const_1
15537 swaps the operand order and the permute indices if it finds
15538 d->perm[0] to be in the second operand. Thus, we can always
15539 use d->op0 and need not do any extra arithmetic to get the
15540 correct lane number. */
15542 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
15544 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
15545 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
15546 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
15551 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
15553 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
15554 machine_mode vmode
= d
->vmode
;
15556 /* Make sure that the indices are constant. */
15557 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
15558 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
15559 if (!d
->perm
[i
].is_constant ())
15565 /* Generic code will try constant permutation twice. Once with the
15566 original mode and again with the elements lowered to QImode.
15567 So wait and don't do the selector expansion ourselves. */
15568 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
15571 /* to_constant is safe since this routine is specific to Advanced SIMD
15573 unsigned int nelt
= d
->perm
.length ().to_constant ();
15574 for (unsigned int i
= 0; i
< nelt
; ++i
)
15575 /* If big-endian and two vectors we end up with a weird mixed-endian
15576 mode on NEON. Reverse the index within each word but not the word
15577 itself. to_constant is safe because we checked is_constant above. */
15578 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
15579 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
15580 : d
->perm
[i
].to_constant ());
15582 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
15583 sel
= force_reg (vmode
, sel
);
15585 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
15589 /* Try to implement D using an SVE TBL instruction. */
15592 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
15594 unsigned HOST_WIDE_INT nelt
;
15596 /* Permuting two variable-length vectors could overflow the
15598 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
15604 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
15605 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
15606 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
15611 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
15613 /* The pattern matching functions above are written to look for a small
15614 number to begin the sequence (0, 1, N/2). If we begin with an index
15615 from the second operand, we can swap the operands. */
15616 poly_int64 nelt
= d
->perm
.length ();
15617 if (known_ge (d
->perm
[0], nelt
))
15619 d
->perm
.rotate_inputs (1);
15620 std::swap (d
->op0
, d
->op1
);
15623 if ((d
->vec_flags
== VEC_ADVSIMD
15624 || d
->vec_flags
== VEC_SVE_DATA
15625 || d
->vec_flags
== VEC_SVE_PRED
)
15626 && known_gt (nelt
, 1))
15628 if (aarch64_evpc_rev_local (d
))
15630 else if (aarch64_evpc_rev_global (d
))
15632 else if (aarch64_evpc_ext (d
))
15634 else if (aarch64_evpc_dup (d
))
15636 else if (aarch64_evpc_zip (d
))
15638 else if (aarch64_evpc_uzp (d
))
15640 else if (aarch64_evpc_trn (d
))
15642 if (d
->vec_flags
== VEC_SVE_DATA
)
15643 return aarch64_evpc_sve_tbl (d
);
15644 else if (d
->vec_flags
== VEC_SVE_DATA
)
15645 return aarch64_evpc_tbl (d
);
15650 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15653 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
15654 rtx op1
, const vec_perm_indices
&sel
)
15656 struct expand_vec_perm_d d
;
15658 /* Check whether the mask can be applied to a single vector. */
15659 if (op0
&& rtx_equal_p (op0
, op1
))
15660 d
.one_vector_p
= true;
15661 else if (sel
.all_from_input_p (0))
15663 d
.one_vector_p
= true;
15666 else if (sel
.all_from_input_p (1))
15668 d
.one_vector_p
= true;
15672 d
.one_vector_p
= false;
15674 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
15675 sel
.nelts_per_input ());
15677 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
15681 d
.testing_p
= !target
;
15684 return aarch64_expand_vec_perm_const_1 (&d
);
15686 rtx_insn
*last
= get_last_insn ();
15687 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
15688 gcc_assert (last
== get_last_insn ());
15693 /* Generate a byte permute mask for a register of mode MODE,
15694 which has NUNITS units. */
15697 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
15699 /* We have to reverse each vector because we dont have
15700 a permuted load that can reverse-load according to ABI rules. */
15702 rtvec v
= rtvec_alloc (16);
15704 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
15706 gcc_assert (BYTES_BIG_ENDIAN
);
15707 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
15709 for (i
= 0; i
< nunits
; i
++)
15710 for (j
= 0; j
< usize
; j
++)
15711 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
15712 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
15713 return force_reg (V16QImode
, mask
);
15716 /* Return true if X is a valid second operand for the SVE instruction
15717 that implements integer comparison OP_CODE. */
15720 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
15722 if (register_operand (x
, VOIDmode
))
15731 return aarch64_sve_cmp_immediate_p (x
, false);
15738 return aarch64_sve_cmp_immediate_p (x
, true);
15740 gcc_unreachable ();
15744 /* Use predicated SVE instructions to implement the equivalent of:
15748 given that PTRUE is an all-true predicate of the appropriate mode. */
15751 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
15753 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15754 gen_rtvec (2, ptrue
, op
),
15755 UNSPEC_MERGE_PTRUE
);
15756 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
15757 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15760 /* Likewise, but also clobber the condition codes. */
15763 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
15765 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15766 gen_rtvec (2, ptrue
, op
),
15767 UNSPEC_MERGE_PTRUE
);
15768 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
15769 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15772 /* Return the UNSPEC_COND_* code for comparison CODE. */
15774 static unsigned int
15775 aarch64_unspec_cond_code (rtx_code code
)
15780 return UNSPEC_COND_NE
;
15782 return UNSPEC_COND_EQ
;
15784 return UNSPEC_COND_LT
;
15786 return UNSPEC_COND_GT
;
15788 return UNSPEC_COND_LE
;
15790 return UNSPEC_COND_GE
;
15792 gcc_unreachable ();
15798 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15800 where <X> is the operation associated with comparison CODE. This form
15801 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15802 semantics, such as when PRED might not be all-true and when comparing
15803 inactive lanes could have side effects. */
15806 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
15807 rtx pred
, rtx op0
, rtx op1
)
15809 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
15810 gen_rtvec (3, pred
, op0
, op1
),
15811 aarch64_unspec_cond_code (code
));
15812 emit_set_insn (target
, unspec
);
15815 /* Expand an SVE integer comparison using the SVE equivalent of:
15817 (set TARGET (CODE OP0 OP1)). */
15820 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
15822 machine_mode pred_mode
= GET_MODE (target
);
15823 machine_mode data_mode
= GET_MODE (op0
);
15825 if (!aarch64_sve_cmp_operand_p (code
, op1
))
15826 op1
= force_reg (data_mode
, op1
);
15828 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15829 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15830 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
15833 /* Emit the SVE equivalent of:
15835 (set TMP1 (CODE1 OP0 OP1))
15836 (set TMP2 (CODE2 OP0 OP1))
15837 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15839 PTRUE is an all-true predicate with the same mode as TARGET. */
15842 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
15843 rtx ptrue
, rtx op0
, rtx op1
)
15845 machine_mode pred_mode
= GET_MODE (ptrue
);
15846 rtx tmp1
= gen_reg_rtx (pred_mode
);
15847 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
15848 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
15849 rtx tmp2
= gen_reg_rtx (pred_mode
);
15850 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
15851 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
15852 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
15855 /* Emit the SVE equivalent of:
15857 (set TMP (CODE OP0 OP1))
15858 (set TARGET (not TMP))
15860 PTRUE is an all-true predicate with the same mode as TARGET. */
15863 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
15866 machine_mode pred_mode
= GET_MODE (ptrue
);
15867 rtx tmp
= gen_reg_rtx (pred_mode
);
15868 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
15869 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
15870 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15873 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15875 (set TARGET (CODE OP0 OP1))
15877 If CAN_INVERT_P is true, the caller can also handle inverted results;
15878 return true if the result is in fact inverted. */
15881 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
15882 rtx op0
, rtx op1
, bool can_invert_p
)
15884 machine_mode pred_mode
= GET_MODE (target
);
15885 machine_mode data_mode
= GET_MODE (op0
);
15887 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15891 /* UNORDERED has no immediate form. */
15892 op1
= force_reg (data_mode
, op1
);
15901 /* There is native support for the comparison. */
15902 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15903 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15908 /* This is a trapping operation (LT or GT). */
15909 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
15913 if (!flag_trapping_math
)
15915 /* This would trap for signaling NaNs. */
15916 op1
= force_reg (data_mode
, op1
);
15917 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
15925 if (flag_trapping_math
)
15927 /* Work out which elements are ordered. */
15928 rtx ordered
= gen_reg_rtx (pred_mode
);
15929 op1
= force_reg (data_mode
, op1
);
15930 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
15932 /* Test the opposite condition for the ordered elements,
15933 then invert the result. */
15937 code
= reverse_condition_maybe_unordered (code
);
15940 aarch64_emit_sve_predicated_cond (target
, code
,
15941 ordered
, op0
, op1
);
15944 rtx tmp
= gen_reg_rtx (pred_mode
);
15945 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
15946 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15952 /* ORDERED has no immediate form. */
15953 op1
= force_reg (data_mode
, op1
);
15957 gcc_unreachable ();
15960 /* There is native support for the inverse comparison. */
15961 code
= reverse_condition_maybe_unordered (code
);
15964 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15965 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15968 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
15972 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15973 of the data being selected and CMP_MODE is the mode of the values being
15977 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
15980 machine_mode pred_mode
15981 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
15982 GET_MODE_SIZE (cmp_mode
)).require ();
15983 rtx pred
= gen_reg_rtx (pred_mode
);
15984 if (FLOAT_MODE_P (cmp_mode
))
15986 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
15987 ops
[4], ops
[5], true))
15988 std::swap (ops
[1], ops
[2]);
15991 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
15993 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
15994 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
15997 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15998 true. However due to issues with register allocation it is preferable
15999 to avoid tieing integer scalar and FP scalar modes. Executing integer
16000 operations in general registers is better than treating them as scalar
16001 vector operations. This reduces latency and avoids redundant int<->FP
16002 moves. So tie modes if they are either the same class, or vector modes
16003 with other vector modes, vector structs or any scalar mode. */
16006 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
16008 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
16011 /* We specifically want to allow elements of "structure" modes to
16012 be tieable to the structure. This more general condition allows
16013 other rarer situations too. The reason we don't extend this to
16014 predicate modes is that there are no predicate structure modes
16015 nor any specific instructions for extracting part of a predicate
16017 if (aarch64_vector_data_mode_p (mode1
)
16018 && aarch64_vector_data_mode_p (mode2
))
16021 /* Also allow any scalar modes with vectors. */
16022 if (aarch64_vector_mode_supported_p (mode1
)
16023 || aarch64_vector_mode_supported_p (mode2
))
16029 /* Return a new RTX holding the result of moving POINTER forward by
16033 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
16035 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
16037 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
16041 /* Return a new RTX holding the result of moving POINTER forward by the
16042 size of the mode it points to. */
16045 aarch64_progress_pointer (rtx pointer
)
16047 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
16050 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16054 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
16057 rtx reg
= gen_reg_rtx (mode
);
16059 /* "Cast" the pointers to the correct mode. */
16060 *src
= adjust_address (*src
, mode
, 0);
16061 *dst
= adjust_address (*dst
, mode
, 0);
16062 /* Emit the memcpy. */
16063 emit_move_insn (reg
, *src
);
16064 emit_move_insn (*dst
, reg
);
16065 /* Move the pointers forward. */
16066 *src
= aarch64_progress_pointer (*src
);
16067 *dst
= aarch64_progress_pointer (*dst
);
16070 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16071 we succeed, otherwise return false. */
16074 aarch64_expand_movmem (rtx
*operands
)
16077 rtx dst
= operands
[0];
16078 rtx src
= operands
[1];
16080 bool speed_p
= !optimize_function_for_size_p (cfun
);
16082 /* When optimizing for size, give a better estimate of the length of a
16083 memcpy call, but use the default otherwise. */
16084 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
16086 /* We can't do anything smart if the amount to copy is not constant. */
16087 if (!CONST_INT_P (operands
[2]))
16090 n
= UINTVAL (operands
[2]);
16092 /* Try to keep the number of instructions low. For cases below 16 bytes we
16093 need to make at most two moves. For cases above 16 bytes it will be one
16094 move for each 16 byte chunk, then at most two additional moves. */
16095 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
16098 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
16099 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
16101 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
16102 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
16104 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16110 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
16115 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
16120 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16121 4-byte chunk, partially overlapping with the previously copied chunk. */
16124 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16130 src
= aarch64_move_pointer (src
, move
);
16131 dst
= aarch64_move_pointer (dst
, move
);
16132 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16137 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16138 them, then (if applicable) an 8-byte chunk. */
16143 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
16148 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
16153 /* Finish the final bytes of the copy. We can always do this in one
16154 instruction. We either copy the exact amount we need, or partially
16155 overlap with the previous chunk we copied and copy 8-bytes. */
16159 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
16161 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
16163 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16168 src
= aarch64_move_pointer (src
, -1);
16169 dst
= aarch64_move_pointer (dst
, -1);
16170 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16176 src
= aarch64_move_pointer (src
, move
);
16177 dst
= aarch64_move_pointer (dst
, move
);
16178 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
16185 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16186 SImode stores. Handle the case when the constant has identical
16187 bottom and top halves. This is beneficial when the two stores can be
16188 merged into an STP and we avoid synthesising potentially expensive
16189 immediates twice. Return true if such a split is possible. */
16192 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
16194 rtx lo
= gen_lowpart (SImode
, src
);
16195 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
16197 bool size_p
= optimize_function_for_size_p (cfun
);
16199 if (!rtx_equal_p (lo
, hi
))
16202 unsigned int orig_cost
16203 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
16204 unsigned int lo_cost
16205 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
16207 /* We want to transform:
16209 MOVK x1, 0x140, lsl 16
16210 MOVK x1, 0xc0da, lsl 32
16211 MOVK x1, 0x140, lsl 48
16215 MOVK w1, 0x140, lsl 16
16217 So we want to perform this only when we save two instructions
16218 or more. When optimizing for size, however, accept any code size
16220 if (size_p
&& orig_cost
<= lo_cost
)
16224 && (orig_cost
<= lo_cost
+ 1))
16227 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
16228 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
16231 rtx tmp_reg
= gen_reg_rtx (SImode
);
16232 aarch64_expand_mov_immediate (tmp_reg
, lo
);
16233 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
16234 /* Don't emit an explicit store pair as this may not be always profitable.
16235 Let the sched-fusion logic decide whether to merge them. */
16236 emit_move_insn (mem_lo
, tmp_reg
);
16237 emit_move_insn (mem_hi
, tmp_reg
);
16242 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16244 static unsigned HOST_WIDE_INT
16245 aarch64_asan_shadow_offset (void)
16247 return (HOST_WIDE_INT_1
<< 36);
16251 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
16252 int code
, tree treeop0
, tree treeop1
)
16254 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16256 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16258 struct expand_operand ops
[4];
16261 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16263 op_mode
= GET_MODE (op0
);
16264 if (op_mode
== VOIDmode
)
16265 op_mode
= GET_MODE (op1
);
16273 icode
= CODE_FOR_cmpsi
;
16278 icode
= CODE_FOR_cmpdi
;
16283 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16284 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
16289 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16290 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
16298 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
16299 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
16305 *prep_seq
= get_insns ();
16308 create_fixed_operand (&ops
[0], op0
);
16309 create_fixed_operand (&ops
[1], op1
);
16312 if (!maybe_expand_insn (icode
, 2, ops
))
16317 *gen_seq
= get_insns ();
16320 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
16321 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
16325 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
16326 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
16328 rtx op0
, op1
, target
;
16329 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16330 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16332 struct expand_operand ops
[6];
16335 push_to_sequence (*prep_seq
);
16336 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16338 op_mode
= GET_MODE (op0
);
16339 if (op_mode
== VOIDmode
)
16340 op_mode
= GET_MODE (op1
);
16348 icode
= CODE_FOR_ccmpsi
;
16353 icode
= CODE_FOR_ccmpdi
;
16358 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16359 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
16364 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16365 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
16373 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
16374 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
16380 *prep_seq
= get_insns ();
16383 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
16384 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
16386 if (bit_code
!= AND
)
16388 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
16389 GET_MODE (XEXP (prev
, 0))),
16390 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
16391 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
16394 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
16395 create_fixed_operand (&ops
[1], target
);
16396 create_fixed_operand (&ops
[2], op0
);
16397 create_fixed_operand (&ops
[3], op1
);
16398 create_fixed_operand (&ops
[4], prev
);
16399 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
16401 push_to_sequence (*gen_seq
);
16402 if (!maybe_expand_insn (icode
, 6, ops
))
16408 *gen_seq
= get_insns ();
16411 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
16414 #undef TARGET_GEN_CCMP_FIRST
16415 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16417 #undef TARGET_GEN_CCMP_NEXT
16418 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16420 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16421 instruction fusion of some sort. */
16424 aarch64_macro_fusion_p (void)
16426 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
16430 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16431 should be kept together during scheduling. */
16434 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
16437 rtx prev_set
= single_set (prev
);
16438 rtx curr_set
= single_set (curr
);
16439 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16440 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
16442 if (!aarch64_macro_fusion_p ())
16445 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
16447 /* We are trying to match:
16448 prev (mov) == (set (reg r0) (const_int imm16))
16449 curr (movk) == (set (zero_extract (reg r0)
16452 (const_int imm16_1)) */
16454 set_dest
= SET_DEST (curr_set
);
16456 if (GET_CODE (set_dest
) == ZERO_EXTRACT
16457 && CONST_INT_P (SET_SRC (curr_set
))
16458 && CONST_INT_P (SET_SRC (prev_set
))
16459 && CONST_INT_P (XEXP (set_dest
, 2))
16460 && INTVAL (XEXP (set_dest
, 2)) == 16
16461 && REG_P (XEXP (set_dest
, 0))
16462 && REG_P (SET_DEST (prev_set
))
16463 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
16469 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
16472 /* We're trying to match:
16473 prev (adrp) == (set (reg r1)
16474 (high (symbol_ref ("SYM"))))
16475 curr (add) == (set (reg r0)
16477 (symbol_ref ("SYM"))))
16478 Note that r0 need not necessarily be the same as r1, especially
16479 during pre-regalloc scheduling. */
16481 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16482 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16484 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
16485 && REG_P (XEXP (SET_SRC (curr_set
), 0))
16486 && REGNO (XEXP (SET_SRC (curr_set
), 0))
16487 == REGNO (SET_DEST (prev_set
))
16488 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
16489 XEXP (SET_SRC (curr_set
), 1)))
16494 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
16497 /* We're trying to match:
16498 prev (movk) == (set (zero_extract (reg r0)
16501 (const_int imm16_1))
16502 curr (movk) == (set (zero_extract (reg r0)
16505 (const_int imm16_2)) */
16507 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
16508 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
16509 && REG_P (XEXP (SET_DEST (prev_set
), 0))
16510 && REG_P (XEXP (SET_DEST (curr_set
), 0))
16511 && REGNO (XEXP (SET_DEST (prev_set
), 0))
16512 == REGNO (XEXP (SET_DEST (curr_set
), 0))
16513 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
16514 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
16515 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
16516 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
16517 && CONST_INT_P (SET_SRC (prev_set
))
16518 && CONST_INT_P (SET_SRC (curr_set
)))
16522 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
16524 /* We're trying to match:
16525 prev (adrp) == (set (reg r0)
16526 (high (symbol_ref ("SYM"))))
16527 curr (ldr) == (set (reg r1)
16528 (mem (lo_sum (reg r0)
16529 (symbol_ref ("SYM")))))
16531 curr (ldr) == (set (reg r1)
16534 (symbol_ref ("SYM")))))) */
16535 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16536 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16538 rtx curr_src
= SET_SRC (curr_set
);
16540 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
16541 curr_src
= XEXP (curr_src
, 0);
16543 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
16544 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
16545 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
16546 == REGNO (SET_DEST (prev_set
))
16547 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
16548 XEXP (SET_SRC (prev_set
), 0)))
16553 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
16554 && aarch_crypto_can_dual_issue (prev
, curr
))
16557 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
16558 && any_condjump_p (curr
))
16560 enum attr_type prev_type
= get_attr_type (prev
);
16562 unsigned int condreg1
, condreg2
;
16564 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
16565 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
16567 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
16569 && modified_in_p (cc_reg_1
, prev
))
16571 /* FIXME: this misses some which are considered simple arithmetic
16572 instructions for ThunderX. Simple shifts are missed here. */
16573 if (prev_type
== TYPE_ALUS_SREG
16574 || prev_type
== TYPE_ALUS_IMM
16575 || prev_type
== TYPE_LOGICS_REG
16576 || prev_type
== TYPE_LOGICS_IMM
)
16583 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
16584 && any_condjump_p (curr
))
16586 /* We're trying to match:
16587 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16588 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16590 (label_ref ("SYM"))
16592 if (SET_DEST (curr_set
) == (pc_rtx
)
16593 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
16594 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
16595 && REG_P (SET_DEST (prev_set
))
16596 && REGNO (SET_DEST (prev_set
))
16597 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
16599 /* Fuse ALU operations followed by conditional branch instruction. */
16600 switch (get_attr_type (prev
))
16603 case TYPE_ALU_SREG
:
16606 case TYPE_ADCS_REG
:
16607 case TYPE_ADCS_IMM
:
16608 case TYPE_LOGIC_REG
:
16609 case TYPE_LOGIC_IMM
:
16613 case TYPE_SHIFT_REG
:
16614 case TYPE_SHIFT_IMM
:
16629 /* Return true iff the instruction fusion described by OP is enabled. */
16632 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
16634 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
16637 /* If MEM is in the form of [base+offset], extract the two parts
16638 of address and set to BASE and OFFSET, otherwise return false
16639 after clearing BASE and OFFSET. */
16642 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
16646 gcc_assert (MEM_P (mem
));
16648 addr
= XEXP (mem
, 0);
16653 *offset
= const0_rtx
;
16657 if (GET_CODE (addr
) == PLUS
16658 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
16660 *base
= XEXP (addr
, 0);
16661 *offset
= XEXP (addr
, 1);
16666 *offset
= NULL_RTX
;
16671 /* Types for scheduling fusion. */
16672 enum sched_fusion_type
16674 SCHED_FUSION_NONE
= 0,
16675 SCHED_FUSION_LD_SIGN_EXTEND
,
16676 SCHED_FUSION_LD_ZERO_EXTEND
,
16682 /* If INSN is a load or store of address in the form of [base+offset],
16683 extract the two parts and set to BASE and OFFSET. Return scheduling
16684 fusion type this INSN is. */
16686 static enum sched_fusion_type
16687 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
16690 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
16692 gcc_assert (INSN_P (insn
));
16693 x
= PATTERN (insn
);
16694 if (GET_CODE (x
) != SET
)
16695 return SCHED_FUSION_NONE
;
16698 dest
= SET_DEST (x
);
16700 machine_mode dest_mode
= GET_MODE (dest
);
16702 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
16703 return SCHED_FUSION_NONE
;
16705 if (GET_CODE (src
) == SIGN_EXTEND
)
16707 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
16708 src
= XEXP (src
, 0);
16709 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16710 return SCHED_FUSION_NONE
;
16712 else if (GET_CODE (src
) == ZERO_EXTEND
)
16714 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
16715 src
= XEXP (src
, 0);
16716 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16717 return SCHED_FUSION_NONE
;
16720 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
16721 extract_base_offset_in_addr (src
, base
, offset
);
16722 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
16724 fusion
= SCHED_FUSION_ST
;
16725 extract_base_offset_in_addr (dest
, base
, offset
);
16728 return SCHED_FUSION_NONE
;
16730 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
16731 fusion
= SCHED_FUSION_NONE
;
16736 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16738 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16739 and PRI are only calculated for these instructions. For other instruction,
16740 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16741 type instruction fusion can be added by returning different priorities.
16743 It's important that irrelevant instructions get the largest FUSION_PRI. */
16746 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
16747 int *fusion_pri
, int *pri
)
16751 enum sched_fusion_type fusion
;
16753 gcc_assert (INSN_P (insn
));
16756 fusion
= fusion_load_store (insn
, &base
, &offset
);
16757 if (fusion
== SCHED_FUSION_NONE
)
16764 /* Set FUSION_PRI according to fusion type and base register. */
16765 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
16767 /* Calculate PRI. */
16770 /* INSN with smaller offset goes first. */
16771 off_val
= (int)(INTVAL (offset
));
16773 tmp
-= (off_val
& 0xfffff);
16775 tmp
+= ((- off_val
) & 0xfffff);
16781 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16782 Adjust priority of sha1h instructions so they are scheduled before
16783 other SHA1 instructions. */
16786 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
16788 rtx x
= PATTERN (insn
);
16790 if (GET_CODE (x
) == SET
)
16794 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
16795 return priority
+ 10;
16801 /* Given OPERANDS of consecutive load/store, check if we can merge
16802 them into ldp/stp. LOAD is true if they are load instructions.
16803 MODE is the mode of memory operands. */
16806 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
16809 HOST_WIDE_INT offval_1
, offval_2
, msize
;
16810 enum reg_class rclass_1
, rclass_2
;
16811 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
16815 mem_1
= operands
[1];
16816 mem_2
= operands
[3];
16817 reg_1
= operands
[0];
16818 reg_2
= operands
[2];
16819 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
16820 if (REGNO (reg_1
) == REGNO (reg_2
))
16825 mem_1
= operands
[0];
16826 mem_2
= operands
[2];
16827 reg_1
= operands
[1];
16828 reg_2
= operands
[3];
16831 /* The mems cannot be volatile. */
16832 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
16835 /* If we have SImode and slow unaligned ldp,
16836 check the alignment to be at least 8 byte. */
16838 && (aarch64_tune_params
.extra_tuning_flags
16839 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16841 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
16844 /* Check if the addresses are in the form of [base+offset]. */
16845 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16846 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16848 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16849 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16852 /* Check if the bases are same. */
16853 if (!rtx_equal_p (base_1
, base_2
))
16856 /* The operands must be of the same size. */
16857 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
16858 GET_MODE_SIZE (GET_MODE (mem_2
))));
16860 offval_1
= INTVAL (offset_1
);
16861 offval_2
= INTVAL (offset_2
);
16862 /* We should only be trying this for fixed-sized modes. There is no
16863 SVE LDP/STP instruction. */
16864 msize
= GET_MODE_SIZE (mode
).to_constant ();
16865 /* Check if the offsets are consecutive. */
16866 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
16869 /* Check if the addresses are clobbered by load. */
16872 if (reg_mentioned_p (reg_1
, mem_1
))
16875 /* In increasing order, the last load can clobber the address. */
16876 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
16880 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
16881 rclass_1
= FP_REGS
;
16883 rclass_1
= GENERAL_REGS
;
16885 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
16886 rclass_2
= FP_REGS
;
16888 rclass_2
= GENERAL_REGS
;
16890 /* Check if the registers are of same class. */
16891 if (rclass_1
!= rclass_2
)
16897 /* Given OPERANDS of consecutive load/store, check if we can merge
16898 them into ldp/stp by adjusting the offset. LOAD is true if they
16899 are load instructions. MODE is the mode of memory operands.
16901 Given below consecutive stores:
16903 str w1, [xb, 0x100]
16904 str w1, [xb, 0x104]
16905 str w1, [xb, 0x108]
16906 str w1, [xb, 0x10c]
16908 Though the offsets are out of the range supported by stp, we can
16909 still pair them after adjusting the offset, like:
16911 add scratch, xb, 0x100
16912 stp w1, w1, [scratch]
16913 stp w1, w1, [scratch, 0x8]
16915 The peephole patterns detecting this opportunity should guarantee
16916 the scratch register is avaliable. */
16919 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
16922 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
16923 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
16924 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
16925 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
16929 reg_1
= operands
[0];
16930 mem_1
= operands
[1];
16931 reg_2
= operands
[2];
16932 mem_2
= operands
[3];
16933 reg_3
= operands
[4];
16934 mem_3
= operands
[5];
16935 reg_4
= operands
[6];
16936 mem_4
= operands
[7];
16937 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
16938 && REG_P (reg_3
) && REG_P (reg_4
));
16939 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
16944 mem_1
= operands
[0];
16945 reg_1
= operands
[1];
16946 mem_2
= operands
[2];
16947 reg_2
= operands
[3];
16948 mem_3
= operands
[4];
16949 reg_3
= operands
[5];
16950 mem_4
= operands
[6];
16951 reg_4
= operands
[7];
16953 /* Skip if memory operand is by itslef valid for ldp/stp. */
16954 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
16957 /* The mems cannot be volatile. */
16958 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
16959 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
16962 /* Check if the addresses are in the form of [base+offset]. */
16963 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16964 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16966 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16967 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16969 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
16970 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
16972 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
16973 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
16976 /* Check if the bases are same. */
16977 if (!rtx_equal_p (base_1
, base_2
)
16978 || !rtx_equal_p (base_2
, base_3
)
16979 || !rtx_equal_p (base_3
, base_4
))
16982 offval_1
= INTVAL (offset_1
);
16983 offval_2
= INTVAL (offset_2
);
16984 offval_3
= INTVAL (offset_3
);
16985 offval_4
= INTVAL (offset_4
);
16986 msize
= GET_MODE_SIZE (mode
);
16987 /* Check if the offsets are consecutive. */
16988 if ((offval_1
!= (offval_2
+ msize
)
16989 || offval_1
!= (offval_3
+ msize
* 2)
16990 || offval_1
!= (offval_4
+ msize
* 3))
16991 && (offval_4
!= (offval_3
+ msize
)
16992 || offval_4
!= (offval_2
+ msize
* 2)
16993 || offval_4
!= (offval_1
+ msize
* 3)))
16996 /* Check if the addresses are clobbered by load. */
16999 if (reg_mentioned_p (reg_1
, mem_1
)
17000 || reg_mentioned_p (reg_2
, mem_2
)
17001 || reg_mentioned_p (reg_3
, mem_3
))
17004 /* In increasing order, the last load can clobber the address. */
17005 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
17009 /* If we have SImode and slow unaligned ldp,
17010 check the alignment to be at least 8 byte. */
17012 && (aarch64_tune_params
.extra_tuning_flags
17013 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
17015 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
17018 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
17019 rclass_1
= FP_REGS
;
17021 rclass_1
= GENERAL_REGS
;
17023 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
17024 rclass_2
= FP_REGS
;
17026 rclass_2
= GENERAL_REGS
;
17028 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
17029 rclass_3
= FP_REGS
;
17031 rclass_3
= GENERAL_REGS
;
17033 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
17034 rclass_4
= FP_REGS
;
17036 rclass_4
= GENERAL_REGS
;
17038 /* Check if the registers are of same class. */
17039 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
17045 /* Given OPERANDS of consecutive load/store, this function pairs them
17046 into ldp/stp after adjusting the offset. It depends on the fact
17047 that addresses of load/store instructions are in increasing order.
17048 MODE is the mode of memory operands. CODE is the rtl operator
17049 which should be applied to all memory operands, it's SIGN_EXTEND,
17050 ZERO_EXTEND or UNKNOWN. */
17053 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
17054 scalar_mode mode
, RTX_CODE code
)
17056 rtx base
, offset
, t1
, t2
;
17057 rtx mem_1
, mem_2
, mem_3
, mem_4
;
17058 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
17062 mem_1
= operands
[1];
17063 mem_2
= operands
[3];
17064 mem_3
= operands
[5];
17065 mem_4
= operands
[7];
17069 mem_1
= operands
[0];
17070 mem_2
= operands
[2];
17071 mem_3
= operands
[4];
17072 mem_4
= operands
[6];
17073 gcc_assert (code
== UNKNOWN
);
17076 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
17077 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
17079 /* Adjust offset thus it can fit in ldp/stp instruction. */
17080 msize
= GET_MODE_SIZE (mode
);
17081 stp_off_limit
= msize
* 0x40;
17082 off_val
= INTVAL (offset
);
17083 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
17084 new_off
= abs_off
% stp_off_limit
;
17085 adj_off
= abs_off
- new_off
;
17087 /* Further adjust to make sure all offsets are OK. */
17088 if ((new_off
+ msize
* 2) >= stp_off_limit
)
17090 adj_off
+= stp_off_limit
;
17091 new_off
-= stp_off_limit
;
17094 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17095 if (adj_off
>= 0x1000)
17100 adj_off
= -adj_off
;
17101 new_off
= -new_off
;
17104 /* Create new memory references. */
17105 mem_1
= change_address (mem_1
, VOIDmode
,
17106 plus_constant (DImode
, operands
[8], new_off
));
17108 /* Check if the adjusted address is OK for ldp/stp. */
17109 if (!aarch64_mem_pair_operand (mem_1
, mode
))
17112 msize
= GET_MODE_SIZE (mode
);
17113 mem_2
= change_address (mem_2
, VOIDmode
,
17114 plus_constant (DImode
,
17117 mem_3
= change_address (mem_3
, VOIDmode
,
17118 plus_constant (DImode
,
17120 new_off
+ msize
* 2));
17121 mem_4
= change_address (mem_4
, VOIDmode
,
17122 plus_constant (DImode
,
17124 new_off
+ msize
* 3));
17126 if (code
== ZERO_EXTEND
)
17128 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
17129 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
17130 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
17131 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
17133 else if (code
== SIGN_EXTEND
)
17135 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
17136 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
17137 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
17138 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
17143 operands
[1] = mem_1
;
17144 operands
[3] = mem_2
;
17145 operands
[5] = mem_3
;
17146 operands
[7] = mem_4
;
17150 operands
[0] = mem_1
;
17151 operands
[2] = mem_2
;
17152 operands
[4] = mem_3
;
17153 operands
[6] = mem_4
;
17156 /* Emit adjusting instruction. */
17157 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, adj_off
)));
17158 /* Emit ldp/stp instructions. */
17159 t1
= gen_rtx_SET (operands
[0], operands
[1]);
17160 t2
= gen_rtx_SET (operands
[2], operands
[3]);
17161 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
17162 t1
= gen_rtx_SET (operands
[4], operands
[5]);
17163 t2
= gen_rtx_SET (operands
[6], operands
[7]);
17164 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
17168 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17169 it isn't worth branching around empty masked ops (including masked
17173 aarch64_empty_mask_is_expensive (unsigned)
17178 /* Return 1 if pseudo register should be created and used to hold
17179 GOT address for PIC code. */
17182 aarch64_use_pseudo_pic_reg (void)
17184 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
17187 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17190 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
17192 switch (XINT (x
, 1))
17194 case UNSPEC_GOTSMALLPIC
:
17195 case UNSPEC_GOTSMALLPIC28K
:
17196 case UNSPEC_GOTTINYPIC
:
17202 return default_unspec_may_trap_p (x
, flags
);
17206 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17207 return the log2 of that value. Otherwise return -1. */
17210 aarch64_fpconst_pow_of_2 (rtx x
)
17212 const REAL_VALUE_TYPE
*r
;
17214 if (!CONST_DOUBLE_P (x
))
17217 r
= CONST_DOUBLE_REAL_VALUE (x
);
17219 if (REAL_VALUE_NEGATIVE (*r
)
17220 || REAL_VALUE_ISNAN (*r
)
17221 || REAL_VALUE_ISINF (*r
)
17222 || !real_isinteger (r
, DFmode
))
17225 return exact_log2 (real_to_integer (r
));
17228 /* If X is a vector of equal CONST_DOUBLE values and that value is
17229 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17232 aarch64_vec_fpconst_pow_of_2 (rtx x
)
17235 if (GET_CODE (x
) != CONST_VECTOR
17236 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
17239 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
17242 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
17246 for (int i
= 1; i
< nelts
; i
++)
17247 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
17253 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17256 __fp16 always promotes through this hook.
17257 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17258 through the generic excess precision logic rather than here. */
17261 aarch64_promoted_type (const_tree t
)
17263 if (SCALAR_FLOAT_TYPE_P (t
)
17264 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
17265 return float_type_node
;
17270 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17273 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
17274 optimization_type opt_type
)
17279 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
17286 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17288 static unsigned int
17289 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
17292 /* Polynomial invariant 1 == (VG / 2) - 1. */
17293 gcc_assert (i
== 1);
17296 return AARCH64_DWARF_VG
;
17299 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17300 if MODE is HFmode, and punt to the generic implementation otherwise. */
17303 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
17305 return (mode
== HFmode
17307 : default_libgcc_floating_mode_supported_p (mode
));
17310 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17311 if MODE is HFmode, and punt to the generic implementation otherwise. */
17314 aarch64_scalar_mode_supported_p (scalar_mode mode
)
17316 return (mode
== HFmode
17318 : default_scalar_mode_supported_p (mode
));
17321 /* Set the value of FLT_EVAL_METHOD.
17322 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17324 0: evaluate all operations and constants, whose semantic type has at
17325 most the range and precision of type float, to the range and
17326 precision of float; evaluate all other operations and constants to
17327 the range and precision of the semantic type;
17329 N, where _FloatN is a supported interchange floating type
17330 evaluate all operations and constants, whose semantic type has at
17331 most the range and precision of _FloatN type, to the range and
17332 precision of the _FloatN type; evaluate all other operations and
17333 constants to the range and precision of the semantic type;
17335 If we have the ARMv8.2-A extensions then we support _Float16 in native
17336 precision, so we should set this to 16. Otherwise, we support the type,
17337 but want to evaluate expressions in float precision, so set this to
17340 static enum flt_eval_method
17341 aarch64_excess_precision (enum excess_precision_type type
)
17345 case EXCESS_PRECISION_TYPE_FAST
:
17346 case EXCESS_PRECISION_TYPE_STANDARD
:
17347 /* We can calculate either in 16-bit range and precision or
17348 32-bit range and precision. Make that decision based on whether
17349 we have native support for the ARMv8.2-A 16-bit floating-point
17350 instructions or not. */
17351 return (TARGET_FP_F16INST
17352 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17353 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
17354 case EXCESS_PRECISION_TYPE_IMPLICIT
:
17355 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
17357 gcc_unreachable ();
17359 return FLT_EVAL_METHOD_UNPREDICTABLE
;
17362 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17363 scheduled for speculative execution. Reject the long-running division
17364 and square-root instructions. */
17367 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
17369 switch (get_attr_type (insn
))
17377 case TYPE_NEON_FP_SQRT_S
:
17378 case TYPE_NEON_FP_SQRT_D
:
17379 case TYPE_NEON_FP_SQRT_S_Q
:
17380 case TYPE_NEON_FP_SQRT_D_Q
:
17381 case TYPE_NEON_FP_DIV_S
:
17382 case TYPE_NEON_FP_DIV_D
:
17383 case TYPE_NEON_FP_DIV_S_Q
:
17384 case TYPE_NEON_FP_DIV_D_Q
:
17391 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17394 aarch64_compute_pressure_classes (reg_class
*classes
)
17397 classes
[i
++] = GENERAL_REGS
;
17398 classes
[i
++] = FP_REGS
;
17399 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17400 registers need to go in PR_LO_REGS at some point during their
17401 lifetime. Splitting it into two halves has the effect of making
17402 all predicates count against PR_LO_REGS, so that we try whenever
17403 possible to restrict the number of live predicates to 8. This
17404 greatly reduces the amount of spilling in certain loops. */
17405 classes
[i
++] = PR_LO_REGS
;
17406 classes
[i
++] = PR_HI_REGS
;
17410 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17413 aarch64_can_change_mode_class (machine_mode from
,
17414 machine_mode to
, reg_class_t
)
17416 if (BYTES_BIG_ENDIAN
)
17418 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
17419 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
17421 /* Don't allow changes between SVE data modes and non-SVE modes.
17422 See the comment at the head of aarch64-sve.md for details. */
17423 if (from_sve_p
!= to_sve_p
)
17426 /* Don't allow changes in element size: lane 0 of the new vector
17427 would not then be lane 0 of the old vector. See the comment
17428 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17431 In the worst case, this forces a register to be spilled in
17432 one mode and reloaded in the other, which handles the
17433 endianness correctly. */
17434 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
17440 /* Implement TARGET_EARLY_REMAT_MODES. */
17443 aarch64_select_early_remat_modes (sbitmap modes
)
17445 /* SVE values are not normally live across a call, so it should be
17446 worth doing early rematerialization even in VL-specific mode. */
17447 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
17449 machine_mode mode
= (machine_mode
) i
;
17450 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
17451 if (vec_flags
& VEC_ANY_SVE
)
17452 bitmap_set_bit (modes
, i
);
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef  TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef  TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS  aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef  TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
17930 struct gcc_target targetm
= TARGET_INITIALIZER
;
17932 #include "gt-aarch64.h"