/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2018 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The value of each element if all elements are the same, or the
     first value if the constant is a series.  */
  rtx value;

  /* The value of the step if the constant is a series, null otherwise.  */
  rtx step;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  /* The kind of shift modifier to use, and the number of bits to shift.
     This is (LSL, 0) if no shift is needed.  */
  modifier_type modifier;
  unsigned int shift;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
    modifier (LSL), shift (0)
{}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
    step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
{}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to VALUE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
  : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
    modifier (LSL), shift (0)
{}
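/* Illustrative note (an informal example, not taken from the upstream
   sources): a V8HImode constant whose elements are all 0x1200 could be
   described as simd_immediate_info (HImode, 0x12, simd_immediate_info::MOV,
   simd_immediate_info::LSL, 8), corresponding to an instruction of the
   form "movi v0.8h, #0x12, lsl #8".  */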
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode
aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  3, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  5, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3 /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  5, /* vec_int_stmt_cost  */
  6, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  8, /* vec_align_load_cost  */
  8, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,		/* l1_cache_size  */
  -1,		/* l1_cache_line_size  */
  -1,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  -1,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  512,		/* l2_cache_size  */
  false,	/* prefetch_dynamic_strides */
  2048,		/* minimum_stride */
  3		/* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,		/* l1_cache_size  */
  128,		/* l1_cache_line_size  */
  16*1024,	/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  3		/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,		/* l1_cache_size  */
  128,		/* l1_cache_line_size  */
  -1,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  256,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  64,		/* l1_cache_size  */
  64,		/* l1_cache_line_size  */
  512,		/* l2_cache_size  */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level  */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "8",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "4",	/* function_align.  */
  "4",	/* jump_align.  */
  "4",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};
static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &tsv110_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
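/* Illustrative note: because the codes above are laid out in inverse pairs,
   flipping the low bit inverts a condition; for example
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE (0 ^ 1) and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT.  */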
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
    rtx_code_label * tmp_label = gen_label_rtx ();
    char label_buf[256];
    char buffer[128];
    ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
				 CODE_LABEL_NUMBER (tmp_label));
    const char *label_ptr = targetm.strip_name_encoding (label_buf);
    rtx dest_label = operands[pos_label];
    operands[pos_label] = tmp_label;

    snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
    output_asm_insn (buffer, operands);

    snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
    operands[pos_label] = dest_label;
    output_asm_insn (buffer, operands);
    return "";
}
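/* Illustrative note on the sequence emitted above (an informal example):
   for a conditional branch whose target lies outside the +/-1MiB range of
   the short form, the caller supplies the *inverted* branch in
   BRANCH_FORMAT.  The output then looks like

	<inverted short branch>  .Ltmp	// skip around the long branch
	b	<real target>		// unconditional B reaches +/-128MiB
     .Ltmp:

   where .Ltmp is the internal label generated here.  */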
/* Report an error when floating-point or vector types are used but the
   FP/SIMD register file is not available.  MODE is the offending mode.  */
static void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    {
      if (FLOAT_MODE_P (mode))
	error ("%qs is incompatible with the use of floating-point types",
	       "-mgeneral-regs-only");
      else
	error ("%qs is incompatible with the use of vector types",
	       "-mgeneral-regs-only");
    }
  else
    {
      if (FLOAT_MODE_P (mode))
	error ("%qs feature modifier is incompatible with the use of"
	       " floating-point types", "+nofp");
      else
	error ("%qs feature modifier is incompatible with the use of"
	       " vector types", "+nofp");
    }
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
/* Implement TARGET_MIN_DIVISIONS_FOR_RECIP_MUL.  */
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Avoid reassociating floating point addition so we emit more FMAs.  */
  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
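/* For example, with generic_tunings above (int_reassoc_width == 2,
   fp_reassoc_width == 4), a long chain of integer additions is split into
   two independent chains and non-addition FP chains may be split four ways,
   while FP additions are left alone so they can still fuse into FMAs.  */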
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
   if (GP_REGNUM_P (regno))
     return AARCH64_DWARF_R0 + regno - R0_REGNUM;
   else if (regno == SP_REGNUM)
     return AARCH64_DWARF_SP;
   else if (FP_REGNUM_P (regno))
     return AARCH64_DWARF_V0 + regno - V0_REGNUM;
   else if (PR_REGNUM_P (regno))
     return AARCH64_DWARF_P0 + regno - P0_REGNUM;
   else if (regno == VG_REGNUM)
     return AARCH64_DWARF_VG;

   /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
      equivalent DWARF register.  */
   return DWARF_FRAME_REGISTERS;
}
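/* Illustrative mapping, assuming the usual AAPCS64 DWARF numbering
   (AARCH64_DWARF_R0 == 0, AARCH64_DWARF_SP == 31, AARCH64_DWARF_VG == 46,
   AARCH64_DWARF_P0 == 48, AARCH64_DWARF_V0 == 64): x0-x30 map to 0-30,
   sp to 31, p0-p15 to 48-63 and v0-v31 to 64-95.  */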
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  scalar_mode inner = GET_MODE_INNER (mode);
  if (VECTOR_MODE_P (mode)
      && (inner == QImode
	  || inner == HImode
	  || inner == HFmode
	  || inner == SImode
	  || inner == SFmode
	  || inner == DImode
	  || inner == DFmode))
    {
      if (TARGET_SVE)
	{
	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
	    return VEC_SVE_DATA;
	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
	    return VEC_SVE_DATA | VEC_STRUCT;
	}

      /* This includes V1DF but not V1DI (which doesn't exist).  */
      if (TARGET_SIMD
	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
	return VEC_ADVSIMD;
    }

  return 0;
}
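/* Worked examples of the classification above: V4SImode (a 128-bit
   Advanced SIMD vector) yields VEC_ADVSIMD, OImode (a pair of such vectors)
   yields VEC_ADVSIMD | VEC_STRUCT, VNx4SImode (one SVE vector of 32-bit
   elements) yields VEC_SVE_DATA, and VNx4BImode yields VEC_SVE_PRED.
   Modes not supported by the current target classify as 0.  */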
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
      && IN_RANGE (nelems, 2, 4))
    return mode_for_vector (GET_MODE_INNER (mode),
			    GET_MODE_NUNITS (mode) * nelems);

  return opt_machine_mode ();
}
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */
opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */

static opt_machine_mode
aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
{
  if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
    {
      unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
      machine_mode pred_mode;
      if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
	return pred_mode;
    }

  return default_get_mask_mode (nunits, nbytes);
}
/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      if (aarch64_sve_data_mode_p (mode))
	return exact_div (GET_MODE_SIZE (mode),
			  BYTES_PER_SVE_VECTOR).to_constant ();
      return CEIL (lowest_size, UNITS_PER_VREG);
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return 1;
    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return PR_REGNUM_P (regno);

  if (PR_REGNUM_P (regno))
    return false;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
    return true;

  if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
{
  return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
}
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}
/* Return true if I's bits are consecutive ones from the MSB.  */
bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
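/* For example, i == 0xffffffffffff0000 gives -i == 0x10000, whose
   exact_log2 is 16, so the predicate holds; i == 0xff00ff00 is rejected
   because -i is not a power of two.  */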
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}
/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}
/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}
/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
  return cc_reg;
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}
/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  if (GET_CODE (addr) == CONST)
    {
      poly_int64 addend;
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm

   Initial Exec:
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]

   Local Exec:
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/
static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;
:
1717 machine_mode mode
= GET_MODE (dest
);
1718 rtx gp_rtx
= pic_offset_table_rtx
;
1722 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1723 here before rtl expand. Tree IVOPT will generate rtl pattern to
1724 decide rtx costs, in which case pic_offset_table_rtx is not
1725 initialized. For that case no need to generate the first adrp
1726 instruction as the final cost for global variable access is
1730 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1731 using the page base as GOT base, the first page may be wasted,
1732 in the worst scenario, there is only 28K space for GOT).
1734 The generate instruction sequence for accessing global variable
1737 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1739 Only one instruction needed. But we must initialize
1740 pic_offset_table_rtx properly. We generate initialize insn for
1741 every global access, and allow CSE to remove all redundant.
1743 The final instruction sequences will look like the following
1744 for multiply global variables access.
1746 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1748 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1749 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1750 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1753 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1754 crtl
->uses_pic_offset_table
= 1;
1755 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1757 if (mode
!= GET_MODE (gp_rtx
))
1758 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1762 if (mode
== ptr_mode
)
1765 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1767 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1769 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1773 gcc_assert (mode
== Pmode
);
1775 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1776 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1779 /* The operand is expected to be MEM. Whenever the related insn
1780 pattern changed, above code which calculate mem should be
1782 gcc_assert (GET_CODE (mem
) == MEM
);
1783 MEM_READONLY_P (mem
) = 1;
1784 MEM_NOTRAP_P (mem
) = 1;
1789 case SYMBOL_SMALL_GOT_4G
:
1791 /* In ILP32, the mode of dest can be either SImode or DImode,
1792 while the got entry is always of SImode size. The mode of
1793 dest depends on how dest is used: if dest is assigned to a
1794 pointer (e.g. in the memory), it has SImode; it may have
1795 DImode if dest is dereferenced to access the memeory.
1796 This is why we have to handle three different ldr_got_small
1797 patterns here (two patterns for ILP32). */
1802 machine_mode mode
= GET_MODE (dest
);
1804 if (can_create_pseudo_p ())
1805 tmp_reg
= gen_reg_rtx (mode
);
1807 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1808 if (mode
== ptr_mode
)
1811 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1813 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1815 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1819 gcc_assert (mode
== Pmode
);
1821 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1822 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1825 gcc_assert (GET_CODE (mem
) == MEM
);
1826 MEM_READONLY_P (mem
) = 1;
1827 MEM_NOTRAP_P (mem
) = 1;
1832 case SYMBOL_SMALL_TLSGD
:
1835 machine_mode mode
= GET_MODE (dest
);
1836 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1840 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1842 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1843 insns
= get_insns ();
1846 RTL_CONST_CALL_P (insns
) = 1;
1847 emit_libcall_block (insns
, dest
, result
, imm
);
1851 case SYMBOL_SMALL_TLSDESC
:
1853 machine_mode mode
= GET_MODE (dest
);
1854 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1857 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1859 /* In ILP32, the got entry is always of SImode size. Unlike
1860 small GOT, the dest is fixed at reg 0. */
1862 emit_insn (gen_tlsdesc_small_si (imm
));
1864 emit_insn (gen_tlsdesc_small_di (imm
));
1865 tp
= aarch64_load_tp (NULL
);
1868 tp
= gen_lowpart (mode
, tp
);
1870 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1872 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1876 case SYMBOL_SMALL_TLSIE
:
1878 /* In ILP32, the mode of dest can be either SImode or DImode,
1879 while the got entry is always of SImode size. The mode of
1880 dest depends on how dest is used: if dest is assigned to a
1881 pointer (e.g. in the memory), it has SImode; it may have
1882 DImode if dest is dereferenced to access the memeory.
1883 This is why we have to handle three different tlsie_small
1884 patterns here (two patterns for ILP32). */
1885 machine_mode mode
= GET_MODE (dest
);
1886 rtx tmp_reg
= gen_reg_rtx (mode
);
1887 rtx tp
= aarch64_load_tp (NULL
);
1889 if (mode
== ptr_mode
)
1892 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1895 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1896 tp
= gen_lowpart (mode
, tp
);
1901 gcc_assert (mode
== Pmode
);
1902 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1905 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1907 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1911 case SYMBOL_TLSLE12
:
1912 case SYMBOL_TLSLE24
:
1913 case SYMBOL_TLSLE32
:
1914 case SYMBOL_TLSLE48
:
1916 machine_mode mode
= GET_MODE (dest
);
1917 rtx tp
= aarch64_load_tp (NULL
);
1920 tp
= gen_lowpart (mode
, tp
);
1924 case SYMBOL_TLSLE12
:
1925 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1928 case SYMBOL_TLSLE24
:
1929 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1932 case SYMBOL_TLSLE32
:
1933 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1935 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1938 case SYMBOL_TLSLE48
:
1939 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1941 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1949 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1953 case SYMBOL_TINY_GOT
:
1954 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1957 case SYMBOL_TINY_TLSIE
:
1959 machine_mode mode
= GET_MODE (dest
);
1960 rtx tp
= aarch64_load_tp (NULL
);
1962 if (mode
== ptr_mode
)
1965 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1968 tp
= gen_lowpart (mode
, tp
);
1969 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1974 gcc_assert (mode
== Pmode
);
1975 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1979 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}
/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

/* Return true if a 128-bit move needs to be split into two 64-bit moves.  */
bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
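/* Illustrative note: aarch64_split_128bit_move_p returns false (no split
   needed) only when both source and destination are FP registers, since a
   single 128-bit register move can then be used.  A TImode copy between
   general registers, or one whose source is a constant or memory, is split
   into two 64-bit moves by aarch64_split_128bit_move above.  */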
/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
}
/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}
/* Return true if X (of mode XMODE) is equal to Y (of mode YMODE)
   zero-extended to XMODE.  */
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
/* Return a copy of VALUE in a register.  Use a fresh pseudo when possible,
   otherwise fall back to the scratch register X.  */
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

static bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  FACTOR is the number of quadwords.
   NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
   If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
			prefix, suffix, operands, factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
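
/* Worked examples (illustrative only; they follow from the code above):

     aarch64_output_sve_cnt_immediate ("cnt", "%x0", 16, 0)  -> "cntb\t%x0"
     aarch64_output_sve_cnt_immediate ("cnt", "%x0", 32, 0)  -> "cntb\t%x0, all, mul #2"
     aarch64_output_sve_cnt_immediate ("inc", "%x0", 2, 2)   -> "incd\t%x0"  */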
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands,
					   value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}
/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}
/* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
   and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  /* Use INC or DEC if possible.  */
  if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
    {
      if (aarch64_sve_cnt_immediate_p (offset_value))
	return aarch64_output_sve_cnt_immediate ("inc", "%x0",
						 offset_value.coeffs[1], 0);
      if (aarch64_sve_cnt_immediate_p (-offset_value))
	return aarch64_output_sve_cnt_immediate ("dec", "%x0",
						 -offset_value.coeffs[1], 0);
    }

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
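
/* Worked examples (illustrative only; DEST and BASE stand for arbitrary
   GP registers):

     OFFSET = (16, 16), DEST == BASE   -> "incb\t%x0"
     OFFSET = (16, 16), DEST != BASE   -> "addvl\t%x0, %x1, #1"
     OFFSET = (2, 2),   DEST != BASE   -> "addpl\t%x0, %x1, #1"
     OFFSET = (-2, -2), DEST == BASE   -> "decd\t%x0"  */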
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */

static bool
aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
				 unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}
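
/* Worked example (illustrative only): for a VNx8HI constant in which every
   element is the poly_int64 (8, 8), NELTS_PER_VQ is 128 / 16 == 8 and the
   factor 8 is 1 * NELTS_PER_VQ, so the constant is accepted with
   *FACTOR_OUT == 8.  A duplicate of (4, 4) in the same mode is rejected,
   since 4 is not a multiple of NELTS_PER_VQ.  */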
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  */

bool
aarch64_sve_inc_dec_immediate_p (rtx x)
{
  return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
}
/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */

char *
aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
{
  int factor;
  unsigned int nelts_per_vq;
  if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    gcc_unreachable ();
  if (factor < 0)
    return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
					     nelts_per_vq);
  else
    return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
					     nelts_per_vq);
}
static int
aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
				scalar_int_mode mode)
{
  int i;
  unsigned HOST_WIDE_INT val, val2, mask;
  int one_match, zero_match;
  int num_insns;

  val = INTVAL (imm);

  if (aarch64_move_imm (val, mode))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, imm));
      return 1;
    }

  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
     (with XXXX non-zero). In that case check to see if the move can be done in
     a smaller mode.  */
  val2 = val & 0xffffffff;
  if (mode == DImode
      && aarch64_move_imm (val2, SImode)
      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));

      /* Check if we have to emit a second instruction by checking to see
	 if any of the upper 32 bits of the original DI mode value is set.  */
      if (val == val2)
	return 1;

      i = (val >> 48) ? 48 : 32;

      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));

      return 2;
    }

  if ((val >> 32) == 0 || mode == SImode)
    {
      if (generate)
	{
	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
	  if (mode == SImode)
	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	  else
	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	}
      return 2;
    }

  /* Remaining cases are all for DImode.  */

  mask = 0xffff;
  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);

  if (zero_match != 2 && one_match != 2)
    {
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
	 For a 64-bit bitmask try whether changing 16 bits to all ones or
	 zeroes creates a valid bitmask.  To check any repeated bitmask,
	 try using 16 bits from the other 32-bit half of val.  */

      for (i = 0; i < 64; i += 16, mask <<= 16)
	{
	  val2 = val & ~mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val | mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val2 & ~mask;
	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	}
      if (i != 64)
	{
	  if (generate)
	    {
	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
					 GEN_INT ((val >> i) & 0xffff)));
	    }
	  return 2;
	}
    }

  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set bits,
     otherwise skip zero bits.  */

  num_insns = 1;
  mask = 0xffff;
  val2 = one_match > zero_match ? ~val : val;
  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;

  if (generate)
    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
					   ? (val | ~(mask << i))
					   : (val & (mask << i)))));
  for (i += 16; i < 64; i += 16)
    {
      if ((val2 & (mask << i)) == 0)
	continue;
      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));
      num_insns ++;
    }

  return num_insns;
}
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
bool
aarch64_mov128_immediate (rtx imm)
{
  if (GET_CODE (imm) == CONST_INT)
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}
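
/* Worked example (illustrative only): a TImode constant whose high half is
   0x0001000000000000 and whose low half is 0x1234 is "simple enough",
   because aarch64_internal_mov_immediate counts one instruction for each
   half, giving a total of 2 <= 4.  Constants that would need more than
   four instructions in total are rejected.  */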
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
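
/* For example, an offset of 0x123456 needs no temporary (it is handled by
   at most two 12-bit immediate additions), whereas an offset of 0x1234567
   needs one temporary to hold the move-immediate.  */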
/* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
   a non-polynomial OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
   the immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */

static void
aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
		      rtx src, HOST_WIDE_INT offset, rtx temp1,
		      bool frame_related_p, bool emit_move_imm)
{
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));

  HOST_WIDE_INT moffset = abs_hwi (offset);
  rtx_insn *insn;

  if (!moffset)
    {
      if (!rtx_equal_p (dest, src))
	{
	  insn = emit_insn (gen_rtx_SET (dest, src));
	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
	}
      return;
    }

  /* Single instruction adjustment.  */
  if (aarch64_uimm12_shift (moffset))
    {
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
     and either:

     a) the offset cannot be loaded by a 16-bit move or
     b) there is no spare register into which we can move it.  */
  if (moffset < 0x1000000
      && ((!temp1 && !can_create_pseudo_p ())
	  || !aarch64_move_imm (moffset, mode)))
    {
      HOST_WIDE_INT low_off = moffset & 0xfff;

      low_off = offset < 0 ? -low_off : low_off;
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit a move immediate if required and an addition/subtraction.  */
  if (emit_move_imm)
    {
      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
      temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
    }
  insn = emit_insn (offset < 0
		    ? gen_sub3_insn (dest, src, temp1)
		    : gen_add3_insn (dest, src, temp1));
  if (frame_related_p)
    {
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      rtx adj = plus_constant (mode, src, offset);
      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
    }
}
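
/* Illustrative instruction sequences for the cases above (DEST, SRC and
   TEMP stand for arbitrary GP registers; the exact MOV/MOVK split is
   chosen by the move expander):

     OFFSET = 0x1000     ->  add  DEST, SRC, #0x1000
     OFFSET = 0x123456   ->  add  DEST, SRC, #0x456
			     add  DEST, DEST, #0x123000
     OFFSET = 0x1234567  ->  mov  TEMP, #0x4567
			     movk TEMP, #0x123, lsl #16
			     add  DEST, SRC, TEMP  */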
/* Return the number of temporary registers that aarch64_add_offset
   would need to move OFFSET into a register or add OFFSET to a register;
   ADD_P is true if we want the latter rather than the former.  */

static unsigned int
aarch64_offset_temporaries (bool add_p, poly_int64 offset)
{
  /* This follows the same structure as aarch64_add_offset.  */
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
    return 0;

  unsigned int count = 0;
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
  poly_int64 poly_offset (factor, factor);
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    /* Need one register for the ADDVL/ADDPL result.  */
    count += 1;
  else if (factor != 0)
    {
      factor = abs (factor);
      if (factor > 16 * (factor & -factor))
	/* Need one register for the CNT result and one for the multiplication
	   factor.  If necessary, the second temporary can be reused for the
	   constant part of the offset.  */
	return 2;
      /* Need one register for the CNT result (which might then
	 be shifted).  */
      count += 1;
    }
  return count + aarch64_add_offset_1_temporaries (constant);
}
/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */

int
aarch64_add_offset_temporaries (rtx x)
{
  poly_int64 offset;
  if (!poly_int_rtx_p (x, &offset))
    return -1;
  return aarch64_offset_temporaries (true, offset);
}
2621 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2622 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2623 be set and CFA adjustments added to the generated instructions.
2625 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2626 temporary if register allocation is already complete. This temporary
2627 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2628 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2629 false to avoid emitting the immediate again.
2631 TEMP2, if nonnull, is a second temporary register that doesn't
2632 overlap either DEST or REG.
2634 Since this function may be used to adjust the stack pointer, we must
2635 ensure that it cannot cause transient stack deallocation (for example
2636 by first incrementing SP and then decrementing when adjusting by a
2637 large immediate). */
2640 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2641 poly_int64 offset
, rtx temp1
, rtx temp2
,
2642 bool frame_related_p
, bool emit_move_imm
= true)
2644 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2645 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2646 gcc_assert (temp1
== NULL_RTX
2648 || !reg_overlap_mentioned_p (temp1
, dest
));
2649 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2651 /* Try using ADDVL or ADDPL to add the whole value. */
2652 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2654 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2655 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2656 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2660 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2661 SVE vector register, over and above the minimum size of 128 bits.
2662 This is equivalent to half the value returned by CNTD with a
2663 vector shape of ALL. */
2664 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2665 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2667 /* Try using ADDVL or ADDPL to add the VG-based part. */
2668 poly_int64
poly_offset (factor
, factor
);
2669 if (src
!= const0_rtx
2670 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2672 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2673 if (frame_related_p
)
2675 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2676 RTX_FRAME_RELATED_P (insn
) = true;
2681 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2682 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2687 /* Otherwise use a CNT-based sequence. */
2688 else if (factor
!= 0)
2690 /* Use a subtraction if we have a negative factor. */
2691 rtx_code code
= PLUS
;
2698 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2699 into the multiplication. */
2703 /* Use a right shift by 1. */
2707 HOST_WIDE_INT low_bit
= factor
& -factor
;
2708 if (factor
<= 16 * low_bit
)
2710 if (factor
> 16 * 8)
2712 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2713 the value with the minimum multiplier and shift it into
2715 int extra_shift
= exact_log2 (low_bit
);
2716 shift
+= extra_shift
;
2717 factor
>>= extra_shift
;
2719 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2723 /* Use CNTD, then multiply it by FACTOR. */
2724 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2725 val
= aarch64_force_temporary (mode
, temp1
, val
);
2727 /* Go back to using a negative multiplication factor if we have
2728 no register from which to subtract. */
2729 if (code
== MINUS
&& src
== const0_rtx
)
2734 rtx coeff1
= gen_int_mode (factor
, mode
);
2735 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2736 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2741 /* Multiply by 1 << SHIFT. */
2742 val
= aarch64_force_temporary (mode
, temp1
, val
);
2743 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
2745 else if (shift
== -1)
2748 val
= aarch64_force_temporary (mode
, temp1
, val
);
2749 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
2752 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2753 if (src
!= const0_rtx
)
2755 val
= aarch64_force_temporary (mode
, temp1
, val
);
2756 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
2758 else if (code
== MINUS
)
2760 val
= aarch64_force_temporary (mode
, temp1
, val
);
2761 val
= gen_rtx_NEG (mode
, val
);
2764 if (constant
== 0 || frame_related_p
)
2766 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
2767 if (frame_related_p
)
2769 RTX_FRAME_RELATED_P (insn
) = true;
2770 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2771 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
2780 src
= aarch64_force_temporary (mode
, temp1
, val
);
2785 emit_move_imm
= true;
2788 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
2789 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
			  rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
		      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
		      temp1, temp2, true, emit_move_imm);
}
/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
		      temp1, temp2, frame_related_p);
}
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
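
/* For example, the SVE constant {0, 1, 2, 3, ...} reaches this function as
   (vec_series 0 1); both operands are in [-16, 15], so they stay as
   immediates and the resulting insn is expected to be matched as a single
   INDEX instruction (INDEX Zn.<T>, #0, #1).  */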
/* Try to duplicate SRC into SVE register DEST, given that SRC is an
   integer of mode INT_MODE.  Return true on success.  */

static bool
aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
				      rtx src)
{
  /* If the constant is smaller than 128 bits, we can do the move
     using a vector of SRC_MODEs.  */
  if (src_mode != TImode)
    {
      poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
				     GET_MODE_SIZE (src_mode));
      machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
      emit_move_insn (gen_lowpart (dup_mode, dest),
		      gen_const_vec_duplicate (dup_mode, src));
      return true;
    }

  /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
  src = force_const_mem (src_mode, src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1r_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
  emit_insn (gen_rtx_SET (dest, src));
  return true;
}
2882 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2883 isn't a simple duplicate or series. */
2886 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
2888 machine_mode mode
= GET_MODE (src
);
2889 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
2890 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
2891 gcc_assert (npatterns
> 1);
2893 if (nelts_per_pattern
== 1)
2895 /* The constant is a repeating seqeuence of at least two elements,
2896 where the repeating elements occupy no more than 128 bits.
2897 Get an integer representation of the replicated value. */
2898 scalar_int_mode int_mode
;
2899 if (BYTES_BIG_ENDIAN
)
2900 /* For now, always use LD1RQ to load the value on big-endian
2901 targets, since the handling of smaller integers includes a
2902 subreg that is semantically an element reverse. */
2906 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
2907 gcc_assert (int_bits
<= 128);
2908 int_mode
= int_mode_for_size (int_bits
, 0).require ();
2910 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
2912 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
2916 /* Expand each pattern individually. */
2917 rtx_vector_builder builder
;
2918 auto_vec
<rtx
, 16> vectors (npatterns
);
2919 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2921 builder
.new_vector (mode
, 1, nelts_per_pattern
);
2922 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
2923 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
2924 vectors
.quick_push (force_reg (mode
, builder
.build ()));
2927 /* Use permutes to interleave the separate vectors. */
2928 while (npatterns
> 1)
2931 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2933 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
2934 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
2935 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
2939 gcc_assert (vectors
[0] == dest
);
2942 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2943 is a pattern that can be used to set DEST to a replicated scalar
2947 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
2948 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
2950 machine_mode mode
= GET_MODE (dest
);
2952 /* Check on what type of symbol it is. */
2953 scalar_int_mode int_mode
;
2954 if ((GET_CODE (imm
) == SYMBOL_REF
2955 || GET_CODE (imm
) == LABEL_REF
2956 || GET_CODE (imm
) == CONST
2957 || GET_CODE (imm
) == CONST_POLY_INT
)
2958 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
2962 HOST_WIDE_INT const_offset
;
2963 enum aarch64_symbol_type sty
;
2965 /* If we have (const (plus symbol offset)), separate out the offset
2966 before we start classifying the symbol. */
2967 rtx base
= strip_offset (imm
, &offset
);
2969 /* We must always add an offset involving VL separately, rather than
2970 folding it into the relocation. */
2971 if (!offset
.is_constant (&const_offset
))
2973 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
2974 emit_insn (gen_rtx_SET (dest
, imm
));
2977 /* Do arithmetic on 32-bit values if the result is smaller
2979 if (partial_subreg_p (int_mode
, SImode
))
2981 /* It is invalid to do symbol calculations in modes
2982 narrower than SImode. */
2983 gcc_assert (base
== const0_rtx
);
2984 dest
= gen_lowpart (SImode
, dest
);
2987 if (base
!= const0_rtx
)
2989 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2990 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2991 NULL_RTX
, NULL_RTX
, false);
2994 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2995 dest
, NULL_RTX
, false);
3000 sty
= aarch64_classify_symbol (base
, const_offset
);
3003 case SYMBOL_FORCE_TO_MEM
:
3004 if (const_offset
!= 0
3005 && targetm
.cannot_force_const_mem (int_mode
, imm
))
3007 gcc_assert (can_create_pseudo_p ());
3008 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3009 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3010 NULL_RTX
, NULL_RTX
, false);
3014 mem
= force_const_mem (ptr_mode
, imm
);
3017 /* If we aren't generating PC relative literals, then
3018 we need to expand the literal pool access carefully.
3019 This is something that needs to be done in a number
3020 of places, so could well live as a separate function. */
3021 if (!aarch64_pcrelative_literal_loads
)
3023 gcc_assert (can_create_pseudo_p ());
3024 base
= gen_reg_rtx (ptr_mode
);
3025 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
3026 if (ptr_mode
!= Pmode
)
3027 base
= convert_memory_address (Pmode
, base
);
3028 mem
= gen_rtx_MEM (ptr_mode
, base
);
3031 if (int_mode
!= ptr_mode
)
3032 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
3034 emit_insn (gen_rtx_SET (dest
, mem
));
3038 case SYMBOL_SMALL_TLSGD
:
3039 case SYMBOL_SMALL_TLSDESC
:
3040 case SYMBOL_SMALL_TLSIE
:
3041 case SYMBOL_SMALL_GOT_28K
:
3042 case SYMBOL_SMALL_GOT_4G
:
3043 case SYMBOL_TINY_GOT
:
3044 case SYMBOL_TINY_TLSIE
:
3045 if (const_offset
!= 0)
3047 gcc_assert(can_create_pseudo_p ());
3048 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3049 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3050 NULL_RTX
, NULL_RTX
, false);
3055 case SYMBOL_SMALL_ABSOLUTE
:
3056 case SYMBOL_TINY_ABSOLUTE
:
3057 case SYMBOL_TLSLE12
:
3058 case SYMBOL_TLSLE24
:
3059 case SYMBOL_TLSLE32
:
3060 case SYMBOL_TLSLE48
:
3061 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3069 if (!CONST_INT_P (imm
))
3071 rtx base
, step
, value
;
3072 if (GET_CODE (imm
) == HIGH
3073 || aarch64_simd_valid_immediate (imm
, NULL
))
3074 emit_insn (gen_rtx_SET (dest
, imm
));
3075 else if (const_vec_series_p (imm
, &base
, &step
))
3076 aarch64_expand_vec_series (dest
, base
, step
);
3077 else if (const_vec_duplicate_p (imm
, &value
))
3079 /* If the constant is out of range of an SVE vector move,
3080 load it from memory if we can, otherwise move it into
3081 a register and use a DUP. */
3082 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3083 rtx op
= force_const_mem (inner_mode
, value
);
3085 op
= force_reg (inner_mode
, value
);
3086 else if (!aarch64_sve_ld1r_operand_p (op
))
3088 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3089 op
= replace_equiv_address (op
, addr
);
3091 emit_insn (gen_vec_duplicate (dest
, op
));
3093 else if (GET_CODE (imm
) == CONST_VECTOR
3094 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3095 aarch64_expand_sve_const_vector (dest
, imm
);
3098 rtx mem
= force_const_mem (mode
, imm
);
3100 emit_move_insn (dest
, mem
);
3106 aarch64_internal_mov_immediate (dest
, imm
, true,
3107 as_a
<scalar_int_mode
> (mode
));
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
						gen_rtvec (2, pred, src),
						UNSPEC_MERGE_PTRUE)));
}
3121 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3122 operand is in memory. In this case we need to use the predicated LD1
3123 and ST1 instead of LDR and STR, both for correctness on big-endian
3124 targets and because LD1 and ST1 support a wider range of addressing modes.
3125 PRED_MODE is the mode of the predicate.
3127 See the comment at the head of aarch64-sve.md for details about the
3128 big-endian handling. */
3131 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3133 machine_mode mode
= GET_MODE (dest
);
3134 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3135 if (!register_operand (src
, mode
)
3136 && !register_operand (dest
, mode
))
3138 rtx tmp
= gen_reg_rtx (mode
);
3140 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3142 emit_move_insn (tmp
, src
);
3145 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3148 /* Called only on big-endian targets. See whether an SVE vector move
3149 from SRC to DEST is effectively a REV[BHW] instruction, because at
3150 least one operand is a subreg of an SVE vector that has wider or
3151 narrower elements. Return true and emit the instruction if so.
3155 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3157 represents a VIEW_CONVERT between the following vectors, viewed
3160 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3161 R1: { [0], [1], [2], [3], ... }
3163 The high part of lane X in R2 should therefore correspond to lane X*2
3164 of R1, but the register representations are:
3167 R2: ...... [1].high [1].low [0].high [0].low
3168 R1: ...... [3] [2] [1] [0]
3170 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3171 We therefore need a reverse operation to swap the high and low values
3174 This is purely an optimization. Without it we would spill the
3175 subreg operand to the stack in one mode and reload it in the
3176 other mode, which has the same effect as the REV. */
3179 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3181 gcc_assert (BYTES_BIG_ENDIAN
);
3182 if (GET_CODE (dest
) == SUBREG
)
3183 dest
= SUBREG_REG (dest
);
3184 if (GET_CODE (src
) == SUBREG
)
3185 src
= SUBREG_REG (src
);
3187 /* The optimization handles two single SVE REGs with different element
3191 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3192 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3193 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3194 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3197 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3198 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3199 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3201 emit_insn (gen_rtx_SET (dest
, unspec
));
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

static rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
3220 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3224 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3226 /* Decide which REV operation we need. The mode with narrower elements
3227 determines the mode of the operands and the mode with the wider
3228 elements determines the reverse width. */
3229 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3230 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3231 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3232 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3233 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3235 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3236 unsigned int unspec
;
3237 if (wider_bytes
== 8)
3238 unspec
= UNSPEC_REV64
;
3239 else if (wider_bytes
== 4)
3240 unspec
= UNSPEC_REV32
;
3241 else if (wider_bytes
== 2)
3242 unspec
= UNSPEC_REV16
;
3245 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3249 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3250 UNSPEC_MERGE_PTRUE))
3252 with the appropriate modes. */
3253 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3254 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3255 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3256 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3257 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3258 UNSPEC_MERGE_PTRUE
);
3259 emit_insn (gen_rtx_SET (dest
, src
));
static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
				 tree exp ATTRIBUTE_UNUSED)
{
  /* Currently, always true.  */
  return true;
}
3270 /* Implement TARGET_PASS_BY_REFERENCE. */
3273 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3276 bool named ATTRIBUTE_UNUSED
)
3279 machine_mode dummymode
;
3282 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3283 if (mode
== BLKmode
&& type
)
3284 size
= int_size_in_bytes (type
);
3286 /* No frontends can create types with variable-sized modes, so we
3287 shouldn't be asked to pass or return them. */
3288 size
= GET_MODE_SIZE (mode
).to_constant ();
3290 /* Aggregates are passed by reference based on their size. */
3291 if (type
&& AGGREGATE_TYPE_P (type
))
3293 size
= int_size_in_bytes (type
);
3296 /* Variable sized arguments are always returned by reference. */
3300 /* Can this be a candidate to be passed in fp/simd register(s)? */
3301 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3306 /* Arguments which are variable sized or larger than 2 registers are
3307 passed by reference unless they are a homogenous floating point
3309 return size
> 2 * UNITS_PER_WORD
;
3312 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3314 aarch64_return_in_msb (const_tree valtype
)
3316 machine_mode dummy_mode
;
3319 /* Never happens in little-endian mode. */
3320 if (!BYTES_BIG_ENDIAN
)
3323 /* Only composite types smaller than or equal to 16 bytes can
3324 be potentially returned in registers. */
3325 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3326 || int_size_in_bytes (valtype
) <= 0
3327 || int_size_in_bytes (valtype
) > 16)
3330 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3331 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3332 is always passed/returned in the least significant bits of fp/simd
3334 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3335 &dummy_mode
, &dummy_int
, NULL
))
3341 /* Implement TARGET_FUNCTION_VALUE.
3342 Define how to find the value returned by a function. */
3345 aarch64_function_value (const_tree type
, const_tree func
,
3346 bool outgoing ATTRIBUTE_UNUSED
)
3351 machine_mode ag_mode
;
3353 mode
= TYPE_MODE (type
);
3354 if (INTEGRAL_TYPE_P (type
))
3355 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3357 if (aarch64_return_in_msb (type
))
3359 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3361 if (size
% UNITS_PER_WORD
!= 0)
3363 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3364 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3368 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3369 &ag_mode
, &count
, NULL
))
3371 if (!aarch64_composite_type_p (type
, mode
))
3373 gcc_assert (count
== 1 && mode
== ag_mode
);
3374 return gen_rtx_REG (mode
, V0_REGNUM
);
3381 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3382 for (i
= 0; i
< count
; i
++)
3384 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3385 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3386 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3387 XVECEXP (par
, 0, i
) = tmp
;
3393 return gen_rtx_REG (mode
, R0_REGNUM
);
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
3417 /* Implement TARGET_RETURN_IN_MEMORY.
3419 If the type T of the result of a function is such that
3421 would require that arg be passed as a value in a register (or set of
3422 registers) according to the parameter passing rules, then the result
3423 is returned in the same registers as would be used for such an
3427 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3430 machine_mode ag_mode
;
3433 if (!AGGREGATE_TYPE_P (type
)
3434 && TREE_CODE (type
) != COMPLEX_TYPE
3435 && TREE_CODE (type
) != VECTOR_TYPE
)
3436 /* Simple scalar types always returned in registers. */
3439 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3446 /* Types larger than 2 registers returned in memory. */
3447 size
= int_size_in_bytes (type
);
3448 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
			       const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode,
						  type,
						  &pcum->aapcs_vfp_rmode,
						  nregs,
						  NULL);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
   This is a helper function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type)
{
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;
  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      alignment = std::max (alignment, DECL_ALIGN (field));

  return alignment;
}
3493 /* Layout a function argument according to the AAPCS64 rules. The rule
3494 numbers refer to the rule numbers in the AAPCS64. */
3497 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3499 bool named ATTRIBUTE_UNUSED
)
3501 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3502 int ncrn
, nvrn
, nregs
;
3503 bool allocate_ncrn
, allocate_nvrn
;
3506 /* We need to do this once per argument. */
3507 if (pcum
->aapcs_arg_processed
)
3510 pcum
->aapcs_arg_processed
= true;
3512 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3514 size
= int_size_in_bytes (type
);
3516 /* No frontends can create types with variable-sized modes, so we
3517 shouldn't be asked to pass or return them. */
3518 size
= GET_MODE_SIZE (mode
).to_constant ();
3519 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3521 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3522 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3527 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3528 The following code thus handles passing by SIMD/FP registers first. */
3530 nvrn
= pcum
->aapcs_nvrn
;
3532 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3533 and homogenous short-vector aggregates (HVA). */
3537 aarch64_err_no_fpadvsimd (mode
);
3539 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3541 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3542 if (!aarch64_composite_type_p (type
, mode
))
3544 gcc_assert (nregs
== 1);
3545 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3551 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3552 for (i
= 0; i
< nregs
; i
++)
3554 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3555 V0_REGNUM
+ nvrn
+ i
);
3556 rtx offset
= gen_int_mode
3557 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3558 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3559 XVECEXP (par
, 0, i
) = tmp
;
3561 pcum
->aapcs_reg
= par
;
3567 /* C.3 NSRN is set to 8. */
3568 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3573 ncrn
= pcum
->aapcs_ncrn
;
3574 nregs
= size
/ UNITS_PER_WORD
;
3576 /* C6 - C9. though the sign and zero extension semantics are
3577 handled elsewhere. This is the case where the argument fits
3578 entirely general registers. */
3579 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3582 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3584 /* C.8 if the argument has an alignment of 16 then the NGRN is
3585 rounded up to the next even number. */
3588 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3589 comparison is there because for > 16 * BITS_PER_UNIT
3590 alignment nregs should be > 2 and therefore it should be
3591 passed by reference rather than value. */
3592 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3595 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3598 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3599 A reg is still generated for it, but the caller should be smart
3600 enough not to use it. */
3601 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3602 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3608 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3609 for (i
= 0; i
< nregs
; i
++)
3611 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3612 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3613 GEN_INT (i
* UNITS_PER_WORD
));
3614 XVECEXP (par
, 0, i
) = tmp
;
3616 pcum
->aapcs_reg
= par
;
3619 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3624 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3626 /* The argument is passed on stack; record the needed number of words for
3627 this argument and align the total size if necessary. */
3629 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3631 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3632 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3633 16 / UNITS_PER_WORD
);
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
		      const_tree type, bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);

  if (mode == VOIDmode)
    return NULL_RTX;

  aarch64_layout_arg (pcum_v, mode, type, named);
  return pcum->aapcs_reg;
}
3654 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3655 const_tree fntype ATTRIBUTE_UNUSED
,
3656 rtx libname ATTRIBUTE_UNUSED
,
3657 const_tree fndecl ATTRIBUTE_UNUSED
,
3658 unsigned n_named ATTRIBUTE_UNUSED
)
3660 pcum
->aapcs_ncrn
= 0;
3661 pcum
->aapcs_nvrn
= 0;
3662 pcum
->aapcs_nextncrn
= 0;
3663 pcum
->aapcs_nextnvrn
= 0;
3664 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3665 pcum
->aapcs_reg
= NULL_RTX
;
3666 pcum
->aapcs_arg_processed
= false;
3667 pcum
->aapcs_stack_words
= 0;
3668 pcum
->aapcs_stack_size
= 0;
3671 && fndecl
&& TREE_PUBLIC (fndecl
)
3672 && fntype
&& fntype
!= error_mark_node
)
3674 const_tree type
= TREE_TYPE (fntype
);
3675 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3676 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3677 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3678 &mode
, &nregs
, NULL
))
3679 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
3685 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3690 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3691 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3693 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3694 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3695 != (pcum
->aapcs_stack_words
!= 0));
3696 pcum
->aapcs_arg_processed
= false;
3697 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3698 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3699 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3700 pcum
->aapcs_stack_words
= 0;
3701 pcum
->aapcs_reg
= NULL_RTX
;
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);
  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
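
/* For example, assuming the usual AArch64 values PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128: a 32-bit int argument (alignment 32) is given
   64 bits, while an aggregate containing a 128-bit vector (alignment 128)
   is given the full 128 bits.  */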
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
	 || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
			bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
	size = int_size_in_bytes (type);
      else
	/* No frontends can create types with variable-sized modes, so we
	   shouldn't be asked to pass or return them.  */
	size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
	return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif

/* The pair of scratch registers used for stack probing.  */
#define PROBE_STACK_FIRST_REG  9
#define PROBE_STACK_SECOND_REG 10
3833 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3834 inclusive. These are offsets from the current stack pointer. */
3837 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
3840 if (!poly_size
.is_constant (&size
))
3842 sorry ("stack probes for SVE frames");
3846 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
3848 /* See the same assertion on PROBE_INTERVAL above. */
3849 gcc_assert ((first
% ARITH_FACTOR
) == 0);
3851 /* See if we have a constant small number of probes to generate. If so,
3852 that's the easy case. */
3853 if (size
<= PROBE_INTERVAL
)
3855 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
3857 emit_set_insn (reg1
,
3858 plus_constant (Pmode
,
3859 stack_pointer_rtx
, -(first
+ base
)));
3860 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
3863 /* The run-time loop is made up of 8 insns in the generic case while the
3864 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3865 else if (size
<= 4 * PROBE_INTERVAL
)
3867 HOST_WIDE_INT i
, rem
;
3869 emit_set_insn (reg1
,
3870 plus_constant (Pmode
,
3872 -(first
+ PROBE_INTERVAL
)));
3873 emit_stack_probe (reg1
);
3875 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3876 it exceeds SIZE. If only two probes are needed, this will not
3877 generate any code. Then probe at FIRST + SIZE. */
3878 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
3880 emit_set_insn (reg1
,
3881 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
3882 emit_stack_probe (reg1
);
3885 rem
= size
- (i
- PROBE_INTERVAL
);
3888 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3890 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
3891 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
3894 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
3897 /* Otherwise, do the same as above, but in a loop. Note that we must be
3898 extra careful with variables wrapping around because we might be at
3899 the very top (or the very bottom) of the address space and we have
3900 to be able to handle this case properly; in particular, we use an
3901 equality test for the loop condition. */
3904 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
3906 /* Step 1: round SIZE to the previous multiple of the interval. */
3908 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
3911 /* Step 2: compute initial and final value of the loop counter. */
3913 /* TEST_ADDR = SP + FIRST. */
3914 emit_set_insn (reg1
,
3915 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
3917 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3918 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
3919 if (! aarch64_uimm12_shift (adjustment
))
3921 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
3923 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
3926 emit_set_insn (reg2
,
3927 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
3933 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3936 while (TEST_ADDR != LAST_ADDR)
3938 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3939 until it is equal to ROUNDED_SIZE. */
3941 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
3944 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3945 that SIZE is equal to ROUNDED_SIZE. */
3947 if (size
!= rounded_size
)
3949 HOST_WIDE_INT rem
= size
- rounded_size
;
3953 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3955 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
3956 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
3959 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
3963 /* Make sure nothing is scheduled before we are done. */
3964 emit_insn (gen_blockage ());
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  xops[1] = GEN_INT (PROBE_INTERVAL);
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at TEST_ADDR.  */
  output_asm_insn ("str\txzr, [%0]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
/* Determine whether a frame chain needs to be generated.  */
static bool
aarch64_needs_frame_chain (void)
{
  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  if (frame_pointer_needed || crtl->calls_eh_return)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
4021 /* Mark the registers that need to be saved by the callee and calculate
4022 the size of the callee-saved registers area and frame record (both FP
4023 and LR may be omitted). */
4025 aarch64_layout_frame (void)
4027 HOST_WIDE_INT offset
= 0;
4028 int regno
, last_fp_reg
= INVALID_REGNUM
;
4030 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4032 #define SLOT_NOT_REQUIRED (-2)
4033 #define SLOT_REQUIRED (-1)
4035 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
4036 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
4038 /* First mark all the registers that really need to be saved... */
4039 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4040 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4042 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4043 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4045 /* ... that includes the eh data registers (if needed)... */
4046 if (crtl
->calls_eh_return
)
4047 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
4048 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
4051 /* ... and any callee saved register that dataflow says is live. */
4052 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4053 if (df_regs_ever_live_p (regno
)
4054 && (regno
== R30_REGNUM
4055 || !call_used_regs
[regno
]))
4056 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4058 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4059 if (df_regs_ever_live_p (regno
)
4060 && !call_used_regs
[regno
])
4062 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4063 last_fp_reg
= regno
;
4066 if (cfun
->machine
->frame
.emit_frame_chain
)
4068 /* FP and LR are placed in the linkage record. */
4069 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
4070 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
4071 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
4072 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
4073 offset
= 2 * UNITS_PER_WORD
;
4076 /* Now assign stack slots for them. */
4077 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4078 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4080 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4081 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4082 cfun
->machine
->frame
.wb_candidate1
= regno
;
4083 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
4084 cfun
->machine
->frame
.wb_candidate2
= regno
;
4085 offset
+= UNITS_PER_WORD
;
4088 HOST_WIDE_INT max_int_offset
= offset
;
4089 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4090 bool has_align_gap
= offset
!= max_int_offset
;
4092 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4093 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4095 /* If there is an alignment gap between integer and fp callee-saves,
4096 allocate the last fp register to it if possible. */
4097 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
4099 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
4103 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4104 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4105 cfun
->machine
->frame
.wb_candidate1
= regno
;
4106 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
4107 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
4108 cfun
->machine
->frame
.wb_candidate2
= regno
;
4109 offset
+= UNITS_PER_WORD
;
4112 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4114 cfun
->machine
->frame
.saved_regs_size
= offset
;
4116 HOST_WIDE_INT varargs_and_saved_regs_size
4117 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
4119 cfun
->machine
->frame
.hard_fp_offset
4120 = aligned_upper_bound (varargs_and_saved_regs_size
4121 + get_frame_size (),
4122 STACK_BOUNDARY
/ BITS_PER_UNIT
);
4124 /* Both these values are already aligned. */
4125 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
4126 STACK_BOUNDARY
/ BITS_PER_UNIT
));
4127 cfun
->machine
->frame
.frame_size
4128 = (cfun
->machine
->frame
.hard_fp_offset
4129 + crtl
->outgoing_args_size
);
4131 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
4133 cfun
->machine
->frame
.initial_adjust
= 0;
4134 cfun
->machine
->frame
.final_adjust
= 0;
4135 cfun
->machine
->frame
.callee_adjust
= 0;
4136 cfun
->machine
->frame
.callee_offset
= 0;
4138 HOST_WIDE_INT max_push_offset
= 0;
4139 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
4140 max_push_offset
= 512;
4141 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
4142 max_push_offset
= 256;
4144 HOST_WIDE_INT const_size
, const_fp_offset
;
4145 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
4146 && const_size
< max_push_offset
4147 && known_eq (crtl
->outgoing_args_size
, 0))
4149 /* Simple, small frame with no outgoing arguments:
4150 stp reg1, reg2, [sp, -frame_size]!
4151 stp reg3, reg4, [sp, 16] */
4152 cfun
->machine
->frame
.callee_adjust
= const_size
;
4154 else if (known_lt (crtl
->outgoing_args_size
4155 + cfun
->machine
->frame
.saved_regs_size
, 512)
4156 && !(cfun
->calls_alloca
4157 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
4160 /* Frame with small outgoing arguments:
4161 sub sp, sp, frame_size
4162 stp reg1, reg2, [sp, outgoing_args_size]
4163 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4164 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
4165 cfun
->machine
->frame
.callee_offset
4166 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
4168 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
4169 && const_fp_offset
< max_push_offset
)
4171 /* Frame with large outgoing arguments but a small local area:
4172 stp reg1, reg2, [sp, -hard_fp_offset]!
4173 stp reg3, reg4, [sp, 16]
4174 sub sp, sp, outgoing_args_size */
4175 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
4176 cfun
->machine
->frame
.final_adjust
4177 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
4181 /* Frame with large local area and outgoing arguments using frame pointer:
4182 sub sp, sp, hard_fp_offset
4183 stp x29, x30, [sp, 0]
4185 stp reg3, reg4, [sp, 16]
4186 sub sp, sp, outgoing_args_size */
4187 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
4188 cfun
->machine
->frame
.final_adjust
4189 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
4192 cfun
->machine
->frame
.laid_out
= true;
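/* Worked example (a sketch under assumed register usage, not a dump of real
   output): a function that saves x29, x30 and the callee-saved register x19,
   with no varargs, 8 bytes of locals and no outgoing arguments, ends up with

     reg_offset[R29] = 0, reg_offset[R30] = 8, reg_offset[R19] = 16
     saved_regs_size = 32, hard_fp_offset = 48, frame_size = 48

   which is small enough to hit the "simple, small frame" case above and be
   allocated with a single writeback store pair.  */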
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
4215 /* Push the register number REGNO of mode MODE to the stack with write-back
4216 adjusting the stack by ADJUSTMENT. */
4219 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
4220 HOST_WIDE_INT adjustment
)
4222 rtx base_rtx
= stack_pointer_rtx
;
4225 reg
= gen_rtx_REG (mode
, regno
);
4226 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
4227 plus_constant (Pmode
, base_rtx
, -adjustment
));
4228 mem
= gen_frame_mem (mode
, mem
);
4230 insn
= emit_move_insn (mem
, reg
);
4231 RTX_FRAME_RELATED_P (insn
) = 1;
4234 /* Generate and return an instruction to store the pair of registers
4235 REG and REG2 of mode MODE to location BASE with write-back adjusting
4236 the stack location BASE by ADJUSTMENT. */
4239 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4240 HOST_WIDE_INT adjustment
)
4245 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
4246 GEN_INT (-adjustment
),
4247 GEN_INT (UNITS_PER_WORD
- adjustment
));
4249 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
4250 GEN_INT (-adjustment
),
4251 GEN_INT (UNITS_PER_WORD
- adjustment
));
4257 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4258 stack pointer by ADJUSTMENT. */
4261 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
4264 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4266 if (regno2
== INVALID_REGNUM
)
4267 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
4269 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4270 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4272 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
4274 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
4275 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4276 RTX_FRAME_RELATED_P (insn
) = 1;
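/* For example (illustrative values), aarch64_push_regs (R29_REGNUM,
   R30_REGNUM, 32) emits the equivalent of

     stp	x29, x30, [sp, -32]!

   with the stack adjustment and both stores marked frame-related so the
   unwinder treats the push as a single step.  */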
4279 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4280 adjusting it by ADJUSTMENT afterwards. */
4283 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4284 HOST_WIDE_INT adjustment
)
4289 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4290 GEN_INT (UNITS_PER_WORD
));
4292 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4293 GEN_INT (UNITS_PER_WORD
));
4299 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4300 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4304 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
4307 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4308 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4310 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
4312 if (regno2
== INVALID_REGNUM
)
4314 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
4315 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
4316 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
4320 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4321 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4322 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
			rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
		       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after the frame is laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto the stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
4383 /* Emit code to save the callee-saved registers from register number START
4384 to LIMIT to the stack at the location starting at offset START_OFFSET,
4385 skipping any write-back candidates if SKIP_WB is true. */
4388 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
4389 unsigned start
, unsigned limit
, bool skip_wb
)
4395 for (regno
= aarch64_next_callee_save (start
, limit
);
4397 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4403 && (regno
== cfun
->machine
->frame
.wb_candidate1
4404 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4407 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4410 reg
= gen_rtx_REG (mode
, regno
);
4411 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4412 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4415 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4418 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4419 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4420 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4423 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4426 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4427 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4429 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts are only
	     frame-related if explicitly marked.  */
4436 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4440 insn
= emit_move_insn (mem
, reg
);
4442 RTX_FRAME_RELATED_P (insn
) = 1;
4446 /* Emit code to restore the callee registers of mode MODE from register
4447 number START up to and including LIMIT. Restore from the stack offset
4448 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4449 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4452 aarch64_restore_callee_saves (machine_mode mode
,
4453 poly_int64 start_offset
, unsigned start
,
4454 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
4456 rtx base_rtx
= stack_pointer_rtx
;
4461 for (regno
= aarch64_next_callee_save (start
, limit
);
4463 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4465 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4471 && (regno
== cfun
->machine
->frame
.wb_candidate1
4472 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4475 reg
= gen_rtx_REG (mode
, regno
);
4476 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4477 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4479 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4482 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4483 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4484 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4486 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4489 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4490 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4491 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4493 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4497 emit_move_insn (reg
, mem
);
4498 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of the mode.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of the mode.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of the mode.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of the mode.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of the mode.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
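/* Illustrative checks of the predicates above (values assumed):

     aarch64_offset_7bit_signed_scaled_p (DImode, 504)    -> true  (63 * 8)
     aarch64_offset_7bit_signed_scaled_p (DImode, 512)    -> false (64 * 8)
     aarch64_offset_9bit_signed_unscaled_p (DImode, -256) -> true
     offset_12bit_unsigned_scaled_p (DImode, 32760)       -> true  (4095 * 8)

   These correspond to the LDP/STP, LDUR/STUR and LDR/STR immediate ranges
   respectively.  */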
4568 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4571 aarch64_get_separate_components (void)
4573 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4574 bitmap_clear (components
);
4576 /* The registers we need saved to the frame. */
4577 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4578 if (aarch64_register_saved_on_entry (regno
))
4580 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4581 if (!frame_pointer_needed
)
4582 offset
+= cfun
->machine
->frame
.frame_size
4583 - cfun
->machine
->frame
.hard_fp_offset
;
4584 /* Check that we can access the stack slot of the register with one
4585 direct load with no adjustments needed. */
4586 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
4587 bitmap_set_bit (components
, regno
);
4590 /* Don't mess with the hard frame pointer. */
4591 if (frame_pointer_needed
)
4592 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
4594 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4595 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4596 /* If registers have been chosen to be stored/restored with
4597 writeback don't interfere with them to avoid having to output explicit
4598 stack adjustment instructions. */
4599 if (reg2
!= INVALID_REGNUM
)
4600 bitmap_clear_bit (components
, reg2
);
4601 if (reg1
!= INVALID_REGNUM
)
4602 bitmap_clear_bit (components
, reg1
);
4604 bitmap_clear_bit (components
, LR_REGNUM
);
4605 bitmap_clear_bit (components
, SP_REGNUM
);
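/* As an illustration (assumed register usage): if x19 and d8 both have save
   slots reachable with a single 12-bit scaled offset, the returned bitmap
   contains exactly those two registers; LR, SP, the hard frame pointer and
   any writeback candidates are always cleared so the main prologue and
   epilogue keep handling them.  */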
4610 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4613 aarch64_components_for_bb (basic_block bb
)
4615 bitmap in
= DF_LIVE_IN (bb
);
4616 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
4617 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
4619 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4620 bitmap_clear (components
);
4622 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4623 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4624 if ((!call_used_regs
[regno
])
4625 && (bitmap_bit_p (in
, regno
)
4626 || bitmap_bit_p (gen
, regno
)
4627 || bitmap_bit_p (kill
, regno
)))
4629 unsigned regno2
, offset
, offset2
;
4630 bitmap_set_bit (components
, regno
);
4632 /* If there is a callee-save at an adjacent offset, add it too
4633 to increase the use of LDP/STP. */
4634 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4635 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
4637 if (regno2
<= LAST_SAVED_REGNUM
)
4639 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
4640 if ((offset
& ~8) == (offset2
& ~8))
4641 bitmap_set_bit (components
, regno2
);
4648 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4649 Nothing to do for aarch64. */
4652 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
4656 /* Return the next set bit in BMP from START onwards. Return the total number
4657 of bits in BMP if no set bit is found at or after START. */
4660 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
4662 unsigned int nbits
= SBITMAP_SIZE (bmp
);
4666 gcc_assert (start
< nbits
);
4667 for (unsigned int i
= start
; i
< nbits
; i
++)
4668 if (bitmap_bit_p (bmp
, i
))
4674 /* Do the work for aarch64_emit_prologue_components and
4675 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4676 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4677 for these components or the epilogue sequence. That is, it determines
4678 whether we should emit stores or loads and what kind of CFA notes to attach
4679 to the insns. Otherwise the logic for the two sequences is very
4683 aarch64_process_components (sbitmap components
, bool prologue_p
)
4685 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
4686 ? HARD_FRAME_POINTER_REGNUM
4687 : STACK_POINTER_REGNUM
);
4689 unsigned last_regno
= SBITMAP_SIZE (components
);
4690 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
4691 rtx_insn
*insn
= NULL
;
4693 while (regno
!= last_regno
)
4695 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4696 so DFmode for the vector registers is enough. */
4697 machine_mode mode
= GP_REGNUM_P (regno
) ? E_DImode
: E_DFmode
;
4698 rtx reg
= gen_rtx_REG (mode
, regno
);
4699 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4700 if (!frame_pointer_needed
)
4701 offset
+= cfun
->machine
->frame
.frame_size
4702 - cfun
->machine
->frame
.hard_fp_offset
;
4703 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
4704 rtx mem
= gen_frame_mem (mode
, addr
);
4706 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
4707 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
4708 /* No more registers to handle after REGNO.
4709 Emit a single save/restore and exit. */
4710 if (regno2
== last_regno
)
4712 insn
= emit_insn (set
);
4713 RTX_FRAME_RELATED_P (insn
) = 1;
4715 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
4717 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4721 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
4722 /* The next register is not of the same class or its offset is not
4723 mergeable with the current one into a pair. */
4724 if (!satisfies_constraint_Ump (mem
)
4725 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
4726 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
4727 GET_MODE_SIZE (mode
)))
4729 insn
= emit_insn (set
);
4730 RTX_FRAME_RELATED_P (insn
) = 1;
4732 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
4734 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4740 /* REGNO2 can be saved/restored in a pair with REGNO. */
4741 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4742 if (!frame_pointer_needed
)
4743 offset2
+= cfun
->machine
->frame
.frame_size
4744 - cfun
->machine
->frame
.hard_fp_offset
;
4745 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
4746 rtx mem2
= gen_frame_mem (mode
, addr2
);
4747 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
4748 : gen_rtx_SET (reg2
, mem2
);
4751 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
4753 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4755 RTX_FRAME_RELATED_P (insn
) = 1;
4758 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
4759 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
4763 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4764 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
4767 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
4771 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4774 aarch64_emit_prologue_components (sbitmap components
)
4776 aarch64_process_components (components
, true);
4779 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4782 aarch64_emit_epilogue_components (sbitmap components
)
4784 aarch64_process_components (components
, false);
4787 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4790 aarch64_set_handled_components (sbitmap components
)
4792 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4793 if (bitmap_bit_p (components
, regno
))
4794 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
4797 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4798 is saved at BASE + OFFSET. */
4801 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
4802 rtx base
, poly_int64 offset
)
4804 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
4805 add_reg_note (insn
, REG_CFA_EXPRESSION
,
4806 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
4845 /* Generate the prologue instructions for entry into a function.
4846 Establish the stack frame by decreasing the stack pointer with a
4847 properly calculated size and, if necessary, create a frame record
4848 filled with the values of LR and previous frame pointer. The
4849 current FP is also set up if it is in use. */
4852 aarch64_expand_prologue (void)
4854 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
4855 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
4856 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
4857 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
4858 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
4859 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4860 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4861 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
4864 /* Sign return address for functions. */
4865 if (aarch64_return_address_signing_enabled ())
4867 insn
= emit_insn (gen_pacisp ());
4868 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
4869 RTX_FRAME_RELATED_P (insn
) = 1;
4872 if (flag_stack_usage_info
)
4873 current_function_static_stack_size
= constant_lower_bound (frame_size
);
4875 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
4877 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
4879 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
4880 && maybe_gt (frame_size
, get_stack_check_protect ()))
4881 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4883 - get_stack_check_protect ()));
4885 else if (maybe_gt (frame_size
, 0))
4886 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
4889 rtx ip0_rtx
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
4890 rtx ip1_rtx
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
4892 aarch64_sub_sp (ip0_rtx
, ip1_rtx
, initial_adjust
, true);
4894 if (callee_adjust
!= 0)
4895 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
4897 if (emit_frame_chain
)
4899 poly_int64 reg_offset
= callee_adjust
;
4900 if (callee_adjust
== 0)
4904 reg_offset
= callee_offset
;
4905 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
4907 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
4908 stack_pointer_rtx
, callee_offset
,
4909 ip1_rtx
, ip0_rtx
, frame_pointer_needed
);
4910 if (frame_pointer_needed
&& !frame_size
.is_constant ())
4912 /* Variable-sized frames need to describe the save slot
4913 address using DW_CFA_expression rather than DW_CFA_offset.
4914 This means that, without taking further action, the
4915 locations of the registers that we've already saved would
4916 remain based on the stack pointer even after we redefine
4917 the CFA based on the frame pointer. We therefore need new
4918 DW_CFA_expressions to re-express the save slots with addresses
4919 based on the frame pointer. */
4920 rtx_insn
*insn
= get_last_insn ();
4921 gcc_assert (RTX_FRAME_RELATED_P (insn
));
4923 /* Add an explicit CFA definition if this was previously
4925 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
4927 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
4929 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4930 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
4933 /* Change the save slot expressions for the registers that
4934 we've already saved. */
4935 reg_offset
-= callee_offset
;
4936 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
4937 reg_offset
+ UNITS_PER_WORD
);
4938 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
4941 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
4944 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
4945 callee_adjust
!= 0 || emit_frame_chain
);
4946 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
4947 callee_adjust
!= 0 || emit_frame_chain
);
4948 aarch64_sub_sp (ip1_rtx
, ip0_rtx
, final_adjust
, !frame_pointer_needed
);
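/* Sketch of what this typically expands to for a small constant frame with a
   frame chain (registers and sizes are illustrative assumptions):

     stp	x29, x30, [sp, -32]!
     mov	x29, sp
     str	x19, [sp, 16]

   Larger or variable-sized frames instead use the separate initial_adjust /
   final_adjust steps computed in aarch64_layout_frame.  */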
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee-saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue pass will
   use this to check whether shrink-wrapping is feasible.  */
4958 aarch64_use_return_insn_p (void)
4960 if (!reload_completed
)
4966 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
4969 /* Generate the epilogue instructions for returning from a function.
4970 This is almost exactly the reverse of the prolog sequence, except
4971 that we need to insert barriers to avoid scheduling loads that read
4972 from a deallocated stack, and we optimize the unwind records by
4973 emitting them all together if possible. */
4975 aarch64_expand_epilogue (bool for_sibcall
)
4977 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
4978 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
4979 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
4980 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
4981 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4982 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4985 /* A stack clash protection prologue may not have left IP0_REGNUM or
4986 IP1_REGNUM in a usable state. The same is true for allocations
4987 with an SVE component, since we then need both temporary registers
4988 for each allocation. */
4989 bool can_inherit_p
= (initial_adjust
.is_constant ()
4990 && final_adjust
.is_constant ()
4991 && !flag_stack_clash_protection
);
4993 /* We need to add memory barrier to prevent read from deallocated stack. */
4995 = maybe_ne (get_frame_size ()
4996 + cfun
->machine
->frame
.saved_varargs_size
, 0);
4998 /* Emit a barrier to prevent loads from a deallocated stack. */
4999 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
5000 || cfun
->calls_alloca
5001 || crtl
->calls_eh_return
)
5003 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
5004 need_barrier_p
= false;
5007 /* Restore the stack pointer from the frame pointer if it may not
5008 be the same as the stack pointer. */
5009 rtx ip0_rtx
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
5010 rtx ip1_rtx
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
5011 if (frame_pointer_needed
5012 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
5013 /* If writeback is used when restoring callee-saves, the CFA
5014 is restored on the instruction doing the writeback. */
5015 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
5016 hard_frame_pointer_rtx
, -callee_offset
,
5017 ip1_rtx
, ip0_rtx
, callee_adjust
== 0);
5019 aarch64_add_sp (ip1_rtx
, ip0_rtx
, final_adjust
,
5020 !can_inherit_p
|| df_regs_ever_live_p (IP1_REGNUM
));
5022 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
5023 callee_adjust
!= 0, &cfi_ops
);
5024 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
5025 callee_adjust
!= 0, &cfi_ops
);
5028 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
5030 if (callee_adjust
!= 0)
5031 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
5033 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
5035 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5036 insn
= get_last_insn ();
5037 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
5038 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
5039 RTX_FRAME_RELATED_P (insn
) = 1;
5043 aarch64_add_sp (ip0_rtx
, ip1_rtx
, initial_adjust
,
5044 !can_inherit_p
|| df_regs_ever_live_p (IP0_REGNUM
));
5048 /* Emit delayed restores and reset the CFA to be SP. */
5049 insn
= get_last_insn ();
5050 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
5051 REG_NOTES (insn
) = cfi_ops
;
5052 RTX_FRAME_RELATED_P (insn
) = 1;
  /* We prefer to emit the combined return/authenticate instruction RETAA;
     however, there are three cases in which we must instead emit an explicit
     authentication instruction.
5059 1) Sibcalls don't return in a normal way, so if we're about to call one
5060 we must authenticate.
5062 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5063 generating code for !TARGET_ARMV8_3 we can't use it and must
5064 explicitly authenticate.
5066 3) On an eh_return path we make extra stack adjustments to update the
5067 canonical frame address to be the exception handler's CFA. We want
5068 to authenticate using the CFA of the function which calls eh_return.
5070 if (aarch64_return_address_signing_enabled ()
5071 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
5073 insn
= emit_insn (gen_autisp ());
5074 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5075 RTX_FRAME_RELATED_P (insn
) = 1;
5078 /* Stack adjustment for exception handler. */
5079 if (crtl
->calls_eh_return
)
5081 /* We need to unwind the stack by the offset computed by
5082 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5083 to be SP; letting the CFA move during this adjustment
5084 is just as correct as retaining the CFA from the body
5085 of the function. Therefore, do nothing special. */
5086 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
5089 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
5091 emit_jump_insn (ret_rtx
);
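/* Sketch of the matching epilogue for the small constant frame case
   (registers are illustrative assumptions):

     ldr	x19, [sp, 16]
     ldp	x29, x30, [sp], 32
     ret

   When return address signing is enabled and RETAA is available, the final
   authenticate and return are combined into a single "retaa".  */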
5094 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5095 normally or return to a previous frame after unwinding.
5097 An EH return uses a single shared return sequence. The epilogue is
5098 exactly like a normal epilogue except that it has an extra input
5099 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5100 that must be applied after the frame has been destroyed. An extra label
5101 is inserted before the epilogue which initializes this register to zero,
5102 and this is the entry point for a normal return.
5104 An actual EH return updates the return address, initializes the stack
5105 adjustment and jumps directly into the epilogue (bypassing the zeroing
5106 of the adjustment). Since the return address is typically saved on the
5107 stack when a function makes a call, the saved LR must be updated outside
5110 This poses problems as the store is generated well before the epilogue,
5111 so the offset of LR is not known yet. Also optimizations will remove the
5112 store as it appears dead, even after the epilogue is generated (as the
5113 base or offset for loading LR is different in many cases).
5115 To avoid these problems this implementation forces the frame pointer
5116 in eh_return functions so that the location of LR is fixed and known early.
5117 It also marks the store volatile, so no optimization is permitted to
5118 remove the store. */
5120 aarch64_eh_return_handler_rtx (void)
5122 rtx tmp
= gen_frame_mem (Pmode
,
5123 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
5125 /* Mark the store volatile, so no optimization is permitted to remove it. */
5126 MEM_VOLATILE_P (tmp
) = true;
5130 /* Output code to add DELTA to the first argument, and then jump
5131 to FUNCTION. Used for C++ multiple inheritance. */
5133 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
5134 HOST_WIDE_INT delta
,
5135 HOST_WIDE_INT vcall_offset
,
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
5142 int this_regno
= R0_REGNUM
;
5143 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
5146 reload_completed
= 1;
5147 emit_note (NOTE_INSN_PROLOGUE_END
);
5149 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
5150 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
5151 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
5153 if (vcall_offset
== 0)
5154 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
5157 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
5162 if (delta
>= -256 && delta
< 256)
5163 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
5164 plus_constant (Pmode
, this_rtx
, delta
));
5166 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
5167 temp1
, temp0
, false);
5170 if (Pmode
== ptr_mode
)
5171 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
5173 aarch64_emit_move (temp0
,
5174 gen_rtx_ZERO_EXTEND (Pmode
,
5175 gen_rtx_MEM (ptr_mode
, addr
)));
5177 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
5178 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
5181 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
5183 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
5186 if (Pmode
== ptr_mode
)
5187 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
5189 aarch64_emit_move (temp1
,
5190 gen_rtx_SIGN_EXTEND (Pmode
,
5191 gen_rtx_MEM (ptr_mode
, addr
)));
5193 emit_insn (gen_add2_insn (this_rtx
, temp1
));
5196 /* Generate a tail call to the target function. */
5197 if (!TREE_USED (function
))
5199 assemble_external (function
);
5200 TREE_USED (function
) = 1;
5202 funexp
= XEXP (DECL_RTL (function
), 0);
5203 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
5204 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
5205 SIBLING_CALL_P (insn
) = 1;
5207 insn
= get_insns ();
5208 shorten_branches (insn
);
5209 final_start_function (insn
, file
, 1);
5210 final (insn
, file
, 1);
5211 final_end_function ();
5213 /* Stop pretending to be a post-reload pass. */
5214 reload_completed
= 0;
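/* Sketch of the code such a thunk typically assembles to for a small DELTA
   and zero vcall offset (the values and target symbol are assumptions for
   illustration):

     add	x0, x0, 16		// adjust the this pointer by DELTA
     b	_ZN1D3fooEv			// tail call the target method

   With a nonzero vcall offset the adjustment is instead loaded from the
   vtable via the temporaries x16/x17 before the branch.  */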
5218 aarch64_tls_referenced_p (rtx x
)
5220 if (!TARGET_HAVE_TLS
)
5222 subrtx_iterator::array_type array
;
5223 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
5225 const_rtx x
= *iter
;
5226 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
5228 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5229 TLS offsets, not real symbol references. */
5230 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5231 iter
.skip_subrtxes ();
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
	  );
}
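/* Examples (assumed inputs): 0xfff and 0xfff000 are accepted since the set
   bits fit entirely within one 12-bit field; 0x1001000 is rejected because
   its bits straddle both fields.  */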
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };


/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
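/* Illustrative results (assumed DImode inputs):

     aarch64_bitmask_imm (0x0000ffff0000ffff, DImode) -> true
     aarch64_bitmask_imm (0x5555555555555555, DImode) -> true
     aarch64_bitmask_imm (0x0000000000abcdef, DImode) -> false

   The first two are a single run of ones replicated across the register;
   the last is not.  */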
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
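/* Worked example (an assumed value): 0x0ffff00f is neither a bitmask nor a
   MOV immediate, but

     aarch64_and_split_imm1 (0x0ffff00f) == 0x0fffffff
     aarch64_and_split_imm2 (0x0ffff00f) == 0xfffffffffffff00f

   and each half is itself a valid bitmask immediate, so an AND with this
   constant can be split into two AND instructions whose masks intersect to
   give the original value.  */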
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */

bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return true;
  return aarch64_bitmask_imm (val, int_mode);
}
5401 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
5405 if (GET_CODE (x
) == HIGH
)
5408 /* There's no way to calculate VL-based values using relocations. */
5409 subrtx_iterator::array_type array
;
5410 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
5411 if (GET_CODE (*iter
) == CONST_POLY_INT
)
5414 split_const (x
, &base
, &offset
);
5415 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
5417 if (aarch64_classify_symbol (base
, INTVAL (offset
))
5418 != SYMBOL_FORCE_TO_MEM
)
5421 /* Avoid generating a 64-bit relocation in ILP32; leave
5422 to aarch64_expand_mov_immediate to handle it properly. */
5423 return mode
!= ptr_mode
;
5426 return aarch64_tls_referenced_p (x
);
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and the hard-to-predict indirect jump.
   When optimizing for speed with -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */
5437 aarch64_case_values_threshold (void)
5439 /* Use the specified limit for the number of cases before using jump
5440 tables at higher optimization levels. */
5442 && selected_cpu
->tune
->max_case_values
!= 0)
5443 return selected_cpu
->tune
->max_case_values
;
5445 return optimize_size
? default_case_values_threshold () : 17;
5448 /* Return true if register REGNO is a valid index register.
5449 STRICT_P is true if REG_OK_STRICT is in effect. */
5452 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
5454 if (!HARD_REGISTER_NUM_P (regno
))
5462 regno
= reg_renumber
[regno
];
5464 return GP_REGNUM_P (regno
);
5467 /* Return true if register REGNO is a valid base register for mode MODE.
5468 STRICT_P is true if REG_OK_STRICT is in effect. */
5471 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
5473 if (!HARD_REGISTER_NUM_P (regno
))
5481 regno
= reg_renumber
[regno
];
5484 /* The fake registers will be eliminated to either the stack or
5485 hard frame pointer, both of which are usually valid base registers.
5486 Reload deals with the cases where the eliminated form isn't valid. */
5487 return (GP_REGNUM_P (regno
)
5488 || regno
== SP_REGNUM
5489 || regno
== FRAME_POINTER_REGNUM
5490 || regno
== ARG_POINTER_REGNUM
);
5493 /* Return true if X is a valid base register for mode MODE.
5494 STRICT_P is true if REG_OK_STRICT is in effect. */
5497 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
5500 && GET_CODE (x
) == SUBREG
5501 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
5504 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
5507 /* Return true if address offset is a valid index. If it is, fill in INFO
5508 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5511 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
5512 machine_mode mode
, bool strict_p
)
5514 enum aarch64_address_type type
;
5519 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
5520 && GET_MODE (x
) == Pmode
)
5522 type
= ADDRESS_REG_REG
;
5526 /* (sign_extend:DI (reg:SI)) */
5527 else if ((GET_CODE (x
) == SIGN_EXTEND
5528 || GET_CODE (x
) == ZERO_EXTEND
)
5529 && GET_MODE (x
) == DImode
5530 && GET_MODE (XEXP (x
, 0)) == SImode
)
5532 type
= (GET_CODE (x
) == SIGN_EXTEND
)
5533 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5534 index
= XEXP (x
, 0);
5537 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5538 else if (GET_CODE (x
) == MULT
5539 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5540 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5541 && GET_MODE (XEXP (x
, 0)) == DImode
5542 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5543 && CONST_INT_P (XEXP (x
, 1)))
5545 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5546 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5547 index
= XEXP (XEXP (x
, 0), 0);
5548 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5550 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5551 else if (GET_CODE (x
) == ASHIFT
5552 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5553 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5554 && GET_MODE (XEXP (x
, 0)) == DImode
5555 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5556 && CONST_INT_P (XEXP (x
, 1)))
5558 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5559 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5560 index
= XEXP (XEXP (x
, 0), 0);
5561 shift
= INTVAL (XEXP (x
, 1));
5563 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5564 else if ((GET_CODE (x
) == SIGN_EXTRACT
5565 || GET_CODE (x
) == ZERO_EXTRACT
)
5566 && GET_MODE (x
) == DImode
5567 && GET_CODE (XEXP (x
, 0)) == MULT
5568 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5569 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5571 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5572 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5573 index
= XEXP (XEXP (x
, 0), 0);
5574 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5575 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5576 || INTVAL (XEXP (x
, 2)) != 0)
5579 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5580 (const_int 0xffffffff<<shift)) */
5581 else if (GET_CODE (x
) == AND
5582 && GET_MODE (x
) == DImode
5583 && GET_CODE (XEXP (x
, 0)) == MULT
5584 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5585 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5586 && CONST_INT_P (XEXP (x
, 1)))
5588 type
= ADDRESS_REG_UXTW
;
5589 index
= XEXP (XEXP (x
, 0), 0);
5590 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5591 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5594 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5595 else if ((GET_CODE (x
) == SIGN_EXTRACT
5596 || GET_CODE (x
) == ZERO_EXTRACT
)
5597 && GET_MODE (x
) == DImode
5598 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5599 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5600 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5602 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5603 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5604 index
= XEXP (XEXP (x
, 0), 0);
5605 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5606 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5607 || INTVAL (XEXP (x
, 2)) != 0)
5610 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5611 (const_int 0xffffffff<<shift)) */
5612 else if (GET_CODE (x
) == AND
5613 && GET_MODE (x
) == DImode
5614 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5615 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5616 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5617 && CONST_INT_P (XEXP (x
, 1)))
5619 type
= ADDRESS_REG_UXTW
;
5620 index
= XEXP (XEXP (x
, 0), 0);
5621 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5622 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5625 /* (mult:P (reg:P) (const_int scale)) */
5626 else if (GET_CODE (x
) == MULT
5627 && GET_MODE (x
) == Pmode
5628 && GET_MODE (XEXP (x
, 0)) == Pmode
5629 && CONST_INT_P (XEXP (x
, 1)))
5631 type
= ADDRESS_REG_REG
;
5632 index
= XEXP (x
, 0);
5633 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5635 /* (ashift:P (reg:P) (const_int shift)) */
5636 else if (GET_CODE (x
) == ASHIFT
5637 && GET_MODE (x
) == Pmode
5638 && GET_MODE (XEXP (x
, 0)) == Pmode
5639 && CONST_INT_P (XEXP (x
, 1)))
5641 type
= ADDRESS_REG_REG
;
5642 index
= XEXP (x
, 0);
5643 shift
= INTVAL (XEXP (x
, 1));
5649 && GET_CODE (index
) == SUBREG
5650 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
5651 index
= SUBREG_REG (index
);
5653 if (aarch64_sve_data_mode_p (mode
))
5655 if (type
!= ADDRESS_REG_REG
5656 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
5662 && !(IN_RANGE (shift
, 1, 3)
5663 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
5668 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
5671 info
->offset
= index
;
5672 info
->shift
= shift
;
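/* Examples of index forms the classification above accepts (the RTL is a
   sketch, not taken from a dump):

     (plus:DI (reg:DI x0)
	      (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8)))
	-> ADDRESS_REG_SXTW, shift 3, i.e. [x0, w1, sxtw #3]

     (plus:DI (reg:DI x0) (ashift:DI (reg:DI x1) (const_int 2)))
	-> ADDRESS_REG_REG, shift 2, i.e. [x0, x1, lsl #2]  */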
5679 /* Return true if MODE is one of the modes for which we
5680 support LDP/STP operations. */
5683 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
5685 return mode
== SImode
|| mode
== DImode
5686 || mode
== SFmode
|| mode
== DFmode
5687 || (aarch64_vector_mode_supported_p (mode
)
5688 && (known_eq (GET_MODE_SIZE (mode
), 8)
5689 || (known_eq (GET_MODE_SIZE (mode
), 16)
5690 && (aarch64_tune_params
.extra_tuning_flags
5691 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
5694 /* Return true if REGNO is a virtual pointer register, or an eliminable
5695 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5696 include stack_pointer or hard_frame_pointer. */
5698 virt_or_elim_regno_p (unsigned regno
)
5700 return ((regno
>= FIRST_VIRTUAL_REGISTER
5701 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
5702 || regno
== FRAME_POINTER_REGNUM
5703 || regno
== ARG_POINTER_REGNUM
);
5706 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5707 If it is, fill in INFO appropriately. STRICT_P is true if
5708 REG_OK_STRICT is in effect. */
5711 aarch64_classify_address (struct aarch64_address_info
*info
,
5712 rtx x
, machine_mode mode
, bool strict_p
,
5713 aarch64_addr_query_type type
)
5715 enum rtx_code code
= GET_CODE (x
);
5719 HOST_WIDE_INT const_size
;
5721 /* On BE, we use load/store pair for all large int mode load/stores.
5722 TI/TFmode may also use a load/store pair. */
5723 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5724 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
5725 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
5726 || type
== ADDR_QUERY_LDP_STP_N
5729 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
5731 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
5732 corresponds to the actual size of the memory being loaded/stored and the
5733 mode of the corresponding addressing mode is half of that. */
5734 if (type
== ADDR_QUERY_LDP_STP_N
5735 && known_eq (GET_MODE_SIZE (mode
), 16))
5738 bool allow_reg_index_p
= (!load_store_pair_p
5739 && (known_lt (GET_MODE_SIZE (mode
), 16)
5740 || vec_flags
== VEC_ADVSIMD
5741 || vec_flags
== VEC_SVE_DATA
));
5743 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5744 [Rn, #offset, MUL VL]. */
5745 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
5746 && (code
!= REG
&& code
!= PLUS
))
5749 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5751 if (advsimd_struct_p
5752 && !BYTES_BIG_ENDIAN
5753 && (code
!= POST_INC
&& code
!= REG
))
5756 gcc_checking_assert (GET_MODE (x
) == VOIDmode
5757 || SCALAR_INT_MODE_P (GET_MODE (x
)));
5763 info
->type
= ADDRESS_REG_IMM
;
5765 info
->offset
= const0_rtx
;
5766 info
->const_offset
= 0;
5767 return aarch64_base_register_rtx_p (x
, strict_p
);
5775 && virt_or_elim_regno_p (REGNO (op0
))
5776 && poly_int_rtx_p (op1
, &offset
))
5778 info
->type
= ADDRESS_REG_IMM
;
5781 info
->const_offset
= offset
;
5786 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
5787 && aarch64_base_register_rtx_p (op0
, strict_p
)
5788 && poly_int_rtx_p (op1
, &offset
))
5790 info
->type
= ADDRESS_REG_IMM
;
5793 info
->const_offset
= offset
;
5795 /* TImode and TFmode values are allowed in both pairs of X
5796 registers and individual Q registers. The available
5798 X,X: 7-bit signed scaled offset
5799 Q: 9-bit signed offset
5800 We conservatively require an offset representable in either mode.
5801 When performing the check for pairs of X registers i.e. LDP/STP
5802 pass down DImode since that is the natural size of the LDP/STP
5803 instruction memory accesses. */
5804 if (mode
== TImode
|| mode
== TFmode
)
5805 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
5806 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
5807 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
	  /* A 7-bit offset check because OImode will emit an ldp/stp
	     instruction (only big endian will get here).
	     For ldp/stp instructions, the offset is scaled for the size of a
	     single element of the pair.  */
5814 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
	  /* Three 9/12-bit offset checks because CImode will emit three
	     ldr/str instructions (only big endian will get here).  */
5819 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5820 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
5822 || offset_12bit_unsigned_scaled_p (V16QImode
,
	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
	     instructions (only big endian will get here).  */
5828 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5829 && aarch64_offset_7bit_signed_scaled_p (TImode
,
5832 /* Make "m" use the LD1 offset range for SVE data modes, so
5833 that pre-RTL optimizers like ivopts will work to that
5834 instead of the wider LDR/STR range. */
5835 if (vec_flags
== VEC_SVE_DATA
)
5836 return (type
== ADDR_QUERY_M
5837 ? offset_4bit_signed_scaled_p (mode
, offset
)
5838 : offset_9bit_signed_scaled_p (mode
, offset
));
5840 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
5842 poly_int64 end_offset
= (offset
5843 + GET_MODE_SIZE (mode
)
5844 - BYTES_PER_SVE_VECTOR
);
5845 return (type
== ADDR_QUERY_M
5846 ? offset_4bit_signed_scaled_p (mode
, offset
)
5847 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
5848 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
5852 if (vec_flags
== VEC_SVE_PRED
)
5853 return offset_9bit_signed_scaled_p (mode
, offset
);
5855 if (load_store_pair_p
)
5856 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5857 || known_eq (GET_MODE_SIZE (mode
), 8)
5858 || known_eq (GET_MODE_SIZE (mode
), 16))
5859 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5861 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
5862 || offset_12bit_unsigned_scaled_p (mode
, offset
));
5865 if (allow_reg_index_p
)
5867 /* Look for base + (scaled/extended) index register. */
5868 if (aarch64_base_register_rtx_p (op0
, strict_p
)
5869 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
5874 if (aarch64_base_register_rtx_p (op1
, strict_p
)
5875 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
5888 info
->type
= ADDRESS_REG_WB
;
5889 info
->base
= XEXP (x
, 0);
5890 info
->offset
= NULL_RTX
;
5891 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
5895 info
->type
= ADDRESS_REG_WB
;
5896 info
->base
= XEXP (x
, 0);
5897 if (GET_CODE (XEXP (x
, 1)) == PLUS
5898 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
5899 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
5900 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5902 info
->offset
= XEXP (XEXP (x
, 1), 1);
5903 info
->const_offset
= offset
;
5905 /* TImode and TFmode values are allowed in both pairs of X
5906 registers and individual Q registers. The available
5908 X,X: 7-bit signed scaled offset
5909 Q: 9-bit signed offset
5910 We conservatively require an offset representable in either mode.
5912 if (mode
== TImode
|| mode
== TFmode
)
5913 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
5914 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
5916 if (load_store_pair_p
)
5917 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5918 || known_eq (GET_MODE_SIZE (mode
), 8)
5919 || known_eq (GET_MODE_SIZE (mode
), 16))
5920 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5922 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
5929 /* load literal: pc-relative constant pool entry. Only supported
5930 for SI mode or larger. */
5931 info
->type
= ADDRESS_SYMBOLIC
;
5933 if (!load_store_pair_p
5934 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
5939 split_const (x
, &sym
, &addend
);
5940 return ((GET_CODE (sym
) == LABEL_REF
5941 || (GET_CODE (sym
) == SYMBOL_REF
5942 && CONSTANT_POOL_ADDRESS_P (sym
)
5943 && aarch64_pcrelative_literal_loads
)));
5948 info
->type
= ADDRESS_LO_SUM
;
5949 info
->base
= XEXP (x
, 0);
5950 info
->offset
= XEXP (x
, 1);
5951 if (allow_reg_index_p
5952 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5955 split_const (info
->offset
, &sym
, &offs
);
5956 if (GET_CODE (sym
) == SYMBOL_REF
5957 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
5958 == SYMBOL_SMALL_ABSOLUTE
))
5960 /* The symbol and offset must be aligned to the access size. */
5963 if (CONSTANT_POOL_ADDRESS_P (sym
))
5964 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
5965 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
5967 tree exp
= SYMBOL_REF_DECL (sym
);
5968 align
= TYPE_ALIGN (TREE_TYPE (exp
));
5969 align
= aarch64_constant_alignment (exp
, align
);
5971 else if (SYMBOL_REF_DECL (sym
))
5972 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
5973 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
5974 && SYMBOL_REF_BLOCK (sym
) != NULL
)
5975 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
5977 align
= BITS_PER_UNIT
;
5979 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
5980 if (known_eq (ref_size
, 0))
5981 ref_size
= GET_MODE_SIZE (DImode
);
5983 return (multiple_p (INTVAL (offs
), ref_size
)
5984 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}
bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
6054 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6057 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
6058 poly_int64 orig_offset
,
6062 if (GET_MODE_SIZE (mode
).is_constant (&size
))
6064 HOST_WIDE_INT const_offset
, second_offset
;
6066 /* A general SVE offset is A * VQ + B. Remove the A component from
6067 coefficient 0 in order to get the constant B. */
6068 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
6070 /* Split an out-of-range address displacement into a base and
6071 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6072 range otherwise to increase opportunities for sharing the base
6073 address of different sizes. Unaligned accesses use the signed
6074 9-bit range, TImode/TFmode use the intersection of signed
6075 scaled 7-bit and signed 9-bit offset. */
6076 if (mode
== TImode
|| mode
== TFmode
)
6077 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
6078 else if ((const_offset
& (size
- 1)) != 0)
6079 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
6081 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
6083 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
6086 /* Split the offset into second_offset and the rest. */
6087 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6088 *offset2
= gen_int_mode (second_offset
, Pmode
);
6093 /* Get the mode we should use as the basis of the range. For structure
6094 modes this is the mode of one vector. */
6095 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6096 machine_mode step_mode
6097 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
6099 /* Get the "mul vl" multiplier we'd like to use. */
6100 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
6101 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
6102 if (vec_flags
& VEC_SVE_DATA
)
6103 /* LDR supports a 9-bit range, but the move patterns for
6104 structure modes require all vectors to be in range of the
6105 same base.  The simplest way of accommodating that while still
6106 promoting reuse of anchor points between different modes is
6107 to use an 8-bit range unconditionally. */
6108 vnum
= ((vnum
+ 128) & 255) - 128;
6110 /* Predicates are only handled singly, so we might as well use
6112 vnum
= ((vnum
+ 256) & 511) - 256;
6116 /* Convert the "mul vl" multiplier into a byte offset. */
6117 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
6118 if (known_eq (second_offset
, orig_offset
))
6121 /* Split the offset into second_offset and the rest. */
6122 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6123 *offset2
= gen_int_mode (second_offset
, Pmode
);
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
static bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{
  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (GET_CODE (value) != CONST_DOUBLE
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      scalar_int_mode imode = (mode == HFmode
			       ? SImode
			       : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
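/* Worked example (illustrative sketch, not part of the original comments):
   the DFmode constant 1.0 reinterprets as 0x3ff0000000000000, which a
   single MOVZ with a shifted 16-bit immediate can materialise, roughly

	movz	x0, #0x3ff0, lsl #48
	fmov	d0, x0

   so aarch64_internal_mov_immediate counts one instruction and the
   constant is accepted (1 < 3).  */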
/* Return TRUE if rtx X is immediate constant 0.0 */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return TRUE if rtx X is immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (GET_CODE (x) == CONST_INT
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

  /* Use a 64 bit mode for everything except for DI/DF mode, where we use
     a 128 bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
	 || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  vec = gen_rtvec (2, call, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
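/* For illustration only (a sketch, not taken from the original file):
   a plain call "foo ()" expanded via aarch64_expand_call (NULL_RTX, mem,
   false) produces roughly

	(parallel [(call (mem (symbol_ref "foo")) (const_int 0))
		   (clobber (reg:DI LR_REGNUM))])

   while a sibcall replaces the LR clobber with a return rtx, so no
   link-register save is implied.  */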
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
6318 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
6320 /* All floating point compares return CCFP if it is an equality
6321 comparison, and CCFPE otherwise. */
6322 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
6349 /* Equality comparisons of short modes against zero can be performed
6350 using the TST instruction with the appropriate bitmask. */
6351 if (y
== const0_rtx
&& REG_P (x
)
6352 && (code
== EQ
|| code
== NE
)
6353 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
6356 /* Similarly, comparisons of zero_extends from shorter modes can
6357 be performed using an ANDS with an immediate mask. */
6358 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
6359 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6360 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
6361 && (code
== EQ
|| code
== NE
))
6364 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6366 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
6367 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
6368 || GET_CODE (x
) == NEG
6369 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
6370 && CONST_INT_P (XEXP (x
, 2)))))
6373 /* A compare with a shifted operand. Because of canonicalization,
6374 the comparison will have to be swapped when we emit the assembly
6376 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6377 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
6378 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
6379 || GET_CODE (x
) == LSHIFTRT
6380 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
6383 /* Similarly for a negated operand, but we can only do this for
6385 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6386 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
6387 && (code
== EQ
|| code
== NE
)
6388 && GET_CODE (x
) == NEG
)
6391 /* A test for unsigned overflow. */
6392 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
6394 && GET_CODE (x
) == PLUS
6395 && GET_CODE (y
) == ZERO_EXTEND
)
6398 /* A test for signed overflow. */
6399 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
6401 && GET_CODE (x
) == PLUS
6402 && GET_CODE (y
) == SIGN_EXTEND
)
6405 /* For everything else, return CCmode. */
6410 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
6413 aarch64_get_condition_code (rtx x
)
6415 machine_mode mode
= GET_MODE (XEXP (x
, 0));
6416 enum rtx_code comp_code
= GET_CODE (x
);
6418 if (GET_MODE_CLASS (mode
) != MODE_CC
)
6419 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
6420 return aarch64_get_condition_code_1 (mode
, comp_code
);
6424 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
6432 case GE
: return AARCH64_GE
;
6433 case GT
: return AARCH64_GT
;
6434 case LE
: return AARCH64_LS
;
6435 case LT
: return AARCH64_MI
;
6436 case NE
: return AARCH64_NE
;
6437 case EQ
: return AARCH64_EQ
;
6438 case ORDERED
: return AARCH64_VC
;
6439 case UNORDERED
: return AARCH64_VS
;
6440 case UNLT
: return AARCH64_LT
;
6441 case UNLE
: return AARCH64_LE
;
6442 case UNGT
: return AARCH64_HI
;
6443 case UNGE
: return AARCH64_PL
;
6451 case NE
: return AARCH64_NE
;
6452 case EQ
: return AARCH64_EQ
;
6453 case GE
: return AARCH64_GE
;
6454 case GT
: return AARCH64_GT
;
6455 case LE
: return AARCH64_LE
;
6456 case LT
: return AARCH64_LT
;
6457 case GEU
: return AARCH64_CS
;
6458 case GTU
: return AARCH64_HI
;
6459 case LEU
: return AARCH64_LS
;
6460 case LTU
: return AARCH64_CC
;
6468 case NE
: return AARCH64_NE
;
6469 case EQ
: return AARCH64_EQ
;
6470 case GE
: return AARCH64_LE
;
6471 case GT
: return AARCH64_LT
;
6472 case LE
: return AARCH64_GE
;
6473 case LT
: return AARCH64_GT
;
6474 case GEU
: return AARCH64_LS
;
6475 case GTU
: return AARCH64_CC
;
6476 case LEU
: return AARCH64_CS
;
6477 case LTU
: return AARCH64_HI
;
6485 case NE
: return AARCH64_NE
;
6486 case EQ
: return AARCH64_EQ
;
6487 case GE
: return AARCH64_PL
;
6488 case LT
: return AARCH64_MI
;
6496 case NE
: return AARCH64_NE
;
6497 case EQ
: return AARCH64_EQ
;
6505 case NE
: return AARCH64_CS
;
6506 case EQ
: return AARCH64_CC
;
6514 case NE
: return AARCH64_VS
;
6515 case EQ
: return AARCH64_VC
;
6528 aarch64_const_vec_all_same_in_range_p (rtx x
,
6529 HOST_WIDE_INT minval
,
6530 HOST_WIDE_INT maxval
)
6533 return (const_vec_duplicate_p (x
, &elt
)
6534 && CONST_INT_P (elt
)
6535 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
6539 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
6541 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
6544 /* Return true if VEC is a constant in which every element is in the range
6545 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6548 aarch64_const_vec_all_in_range_p (rtx vec
,
6549 HOST_WIDE_INT minval
,
6550 HOST_WIDE_INT maxval
)
6552 if (GET_CODE (vec
) != CONST_VECTOR
6553 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
6557 if (!CONST_VECTOR_STEPPED_P (vec
))
6558 nunits
= const_vector_encoded_nelts (vec
);
6559 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
6562 for (int i
= 0; i
< nunits
; i
++)
6564 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
6565 if (!CONST_INT_P (vec_elem
)
6566 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)
/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N,	/* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V,	/* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z,	/* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
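/* Illustrative note (not in the original sources): the 'k' operand
   modifier in aarch64_print_operand below simply prints
   aarch64_nzcv_codes[INTVAL (x)].  For example, if the conditional
   compare pattern passes the const_int AARCH64_NE, the modifier emits 4
   (AARCH64_CC_Z), i.e. the #nzcv immediate of the generated ccmp.  */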
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* We only handle the SVE single-bit immediates here.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    return false;

  return true;
}

/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
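/* For example, sizetochar (32) yields 's'; that letter is what turns a
   32-bit element width into the ".s" suffix of register names printed
   below, such as "{z0.s - z1.s}".  */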
6642 /* Print operand X to file F in a target specific manner according to CODE.
6643 The acceptable formatting commands given by CODE are:
6644 'c': An integer or symbol address without a preceding #
6646 'C': Take the duplicated element in a vector constant
6647 and print it in hex.
6648 'D': Take the duplicated element in a vector constant
6649 and print it as an unsigned integer, in decimal.
6650 'e': Print the sign/zero-extend size as a character 8->b,
6652 'p': Prints N such that 2^N == X (X must be power of 2 and
6654 'P': Print the number of non-zero bits in X (a const_int).
6655 'H': Print the higher numbered register of a pair (TImode)
6657 'm': Print a condition (eq, ne, etc).
6658 'M': Same as 'm', but invert condition.
6659 'N': Take the duplicated element in a vector constant
6660 and print the negative of it in decimal.
6661 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6662 'S/T/U/V': Print a FP/SIMD register name for a register list.
6663 The register printed is the FP/SIMD register name
6664 of X + 0/1/2/3 for S/T/U/V.
6665 'R': Print a scalar FP/SIMD register name + 1.
6666 'X': Print bottom 16 bits of integer constant in hex.
6667 'w/x': Print a general register name or the zero register
6669 '0': Print a normal operand, if it's a general register,
6670 then we assume DImode.
6671 'k': Print NZCV for conditional compare instructions.
6672 'A': Output address constant representing the first
6673 argument of X, specifying a relocation offset
6675 'L': Output constant address specified by X
6676 with a relocation offset if appropriate.
6677 'G': Prints address of X, specifying a PC relative
6678 relocation mode if appropriate.
6679 'y': Output address of LDP or STP - this is used for
6680 some LDP/STPs which don't use a PARALLEL in their
6681 pattern (so the mode needs to be adjusted).
6682 'z': Output address of a typical LDP or STP. */
6685 aarch64_print_operand (FILE *f
, rtx x
, int code
)
6691 switch (GET_CODE (x
))
6694 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
6698 output_addr_const (f
, x
);
6702 if (GET_CODE (XEXP (x
, 0)) == PLUS
6703 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
6705 output_addr_const (f
, x
);
6711 output_operand_lossage ("unsupported operand for code '%c'", code
);
6719 if (!CONST_INT_P (x
)
6720 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
6722 output_operand_lossage ("invalid operand for '%%%c'", code
);
6738 output_operand_lossage ("invalid operand for '%%%c'", code
);
6748 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
6750 output_operand_lossage ("invalid operand for '%%%c'", code
);
6754 asm_fprintf (f
, "%d", n
);
6759 if (!CONST_INT_P (x
))
6761 output_operand_lossage ("invalid operand for '%%%c'", code
);
6765 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
6769 if (x
== const0_rtx
)
6771 asm_fprintf (f
, "xzr");
6775 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
6777 output_operand_lossage ("invalid operand for '%%%c'", code
);
6781 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
6788 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6789 if (x
== const_true_rtx
)
6796 if (!COMPARISON_P (x
))
6798 output_operand_lossage ("invalid operand for '%%%c'", code
);
6802 cond_code
= aarch64_get_condition_code (x
);
6803 gcc_assert (cond_code
>= 0);
6805 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
6806 fputs (aarch64_condition_codes
[cond_code
], f
);
6811 if (!const_vec_duplicate_p (x
, &elt
))
6813 output_operand_lossage ("invalid vector constant");
6817 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6818 asm_fprintf (f
, "%wd", -INTVAL (elt
));
6819 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6820 && aarch64_print_vector_float_operand (f
, x
, true))
6824 output_operand_lossage ("invalid vector constant");
6834 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6836 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6839 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
6846 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6848 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6851 asm_fprintf (f
, "%c%d",
6852 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
6853 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
6857 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6859 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6862 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
6866 if (!CONST_INT_P (x
))
6868 output_operand_lossage ("invalid operand for '%%%c'", code
);
6871 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
6876 /* Print a replicated constant in hex. */
6877 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6879 output_operand_lossage ("invalid operand for '%%%c'", code
);
6882 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6883 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6889 /* Print a replicated constant in decimal, treating it as
6891 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6893 output_operand_lossage ("invalid operand for '%%%c'", code
);
6896 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6897 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6904 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
6906 asm_fprintf (f
, "%czr", code
);
6910 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
6912 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
6916 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
6918 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
6927 output_operand_lossage ("missing operand");
6931 switch (GET_CODE (x
))
6934 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
6936 if (REG_NREGS (x
) == 1)
6937 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
6941 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
6942 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
6943 REGNO (x
) - V0_REGNUM
, suffix
,
6944 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
6948 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
6952 output_address (GET_MODE (x
), XEXP (x
, 0));
6957 output_addr_const (asm_out_file
, x
);
6961 asm_fprintf (f
, "%wd", INTVAL (x
));
6965 if (!VECTOR_MODE_P (GET_MODE (x
)))
6967 output_addr_const (asm_out_file
, x
);
6973 if (!const_vec_duplicate_p (x
, &elt
))
6975 output_operand_lossage ("invalid vector constant");
6979 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6980 asm_fprintf (f
, "%wd", INTVAL (elt
));
6981 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6982 && aarch64_print_vector_float_operand (f
, x
, false))
6986 output_operand_lossage ("invalid vector constant");
6992 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6993 be getting CONST_DOUBLEs holding integers. */
6994 gcc_assert (GET_MODE (x
) != VOIDmode
);
6995 if (aarch64_float_const_zero_rtx_p (x
))
7000 else if (aarch64_float_const_representable_p (x
))
7003 char float_buf
[buf_size
] = {'\0'};
7004 real_to_decimal_for_mode (float_buf
,
7005 CONST_DOUBLE_REAL_VALUE (x
),
7008 asm_fprintf (asm_out_file
, "%s", float_buf
);
7012 output_operand_lossage ("invalid constant");
7015 output_operand_lossage ("invalid operand");
7021 if (GET_CODE (x
) == HIGH
)
7024 switch (aarch64_classify_symbolic_expression (x
))
7026 case SYMBOL_SMALL_GOT_4G
:
7027 asm_fprintf (asm_out_file
, ":got:");
7030 case SYMBOL_SMALL_TLSGD
:
7031 asm_fprintf (asm_out_file
, ":tlsgd:");
7034 case SYMBOL_SMALL_TLSDESC
:
7035 asm_fprintf (asm_out_file
, ":tlsdesc:");
7038 case SYMBOL_SMALL_TLSIE
:
7039 asm_fprintf (asm_out_file
, ":gottprel:");
7042 case SYMBOL_TLSLE24
:
7043 asm_fprintf (asm_out_file
, ":tprel:");
7046 case SYMBOL_TINY_GOT
:
7053 output_addr_const (asm_out_file
, x
);
7057 switch (aarch64_classify_symbolic_expression (x
))
7059 case SYMBOL_SMALL_GOT_4G
:
7060 asm_fprintf (asm_out_file
, ":lo12:");
7063 case SYMBOL_SMALL_TLSGD
:
7064 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
7067 case SYMBOL_SMALL_TLSDESC
:
7068 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
7071 case SYMBOL_SMALL_TLSIE
:
7072 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
7075 case SYMBOL_TLSLE12
:
7076 asm_fprintf (asm_out_file
, ":tprel_lo12:");
7079 case SYMBOL_TLSLE24
:
7080 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7083 case SYMBOL_TINY_GOT
:
7084 asm_fprintf (asm_out_file
, ":got:");
7087 case SYMBOL_TINY_TLSIE
:
7088 asm_fprintf (asm_out_file
, ":gottprel:");
7094 output_addr_const (asm_out_file
, x
);
7098 switch (aarch64_classify_symbolic_expression (x
))
7100 case SYMBOL_TLSLE24
:
7101 asm_fprintf (asm_out_file
, ":tprel_hi12:");
7106 output_addr_const (asm_out_file
, x
);
7111 HOST_WIDE_INT cond_code
;
7113 if (!CONST_INT_P (x
))
7115 output_operand_lossage ("invalid operand for '%%%c'", code
);
7119 cond_code
= INTVAL (x
);
7120 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
7121 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
7128 machine_mode mode
= GET_MODE (x
);
7130 if (GET_CODE (x
) != MEM
7131 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
7133 output_operand_lossage ("invalid operand for '%%%c'", code
);
7137 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
7139 ? ADDR_QUERY_LDP_STP_N
7140 : ADDR_QUERY_LDP_STP
))
7141 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7146 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7151 /* Print address 'x' of a memory access with mode 'mode'.
7152 'op' is the context required by aarch64_classify_address. It can either be
7153 MEM for a normal memory access or PARALLEL for LDP/STP. */
7155 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
7156 aarch64_addr_query_type type
)
7158 struct aarch64_address_info addr
;
7161 /* Check all addresses are Pmode - including ILP32. */
7162 if (GET_MODE (x
) != Pmode
)
7163 output_operand_lossage ("invalid address mode");
7165 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
7168 case ADDRESS_REG_IMM
:
7169 if (known_eq (addr
.const_offset
, 0))
7170 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
7171 else if (aarch64_sve_data_mode_p (mode
))
7174 = exact_div (addr
.const_offset
,
7175 BYTES_PER_SVE_VECTOR
).to_constant ();
7176 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7177 reg_names
[REGNO (addr
.base
)], vnum
);
7179 else if (aarch64_sve_pred_mode_p (mode
))
7182 = exact_div (addr
.const_offset
,
7183 BYTES_PER_SVE_PRED
).to_constant ();
7184 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7185 reg_names
[REGNO (addr
.base
)], vnum
);
7188 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
7189 INTVAL (addr
.offset
));
7192 case ADDRESS_REG_REG
:
7193 if (addr
.shift
== 0)
7194 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
7195 reg_names
[REGNO (addr
.offset
)]);
7197 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
7198 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
7201 case ADDRESS_REG_UXTW
:
7202 if (addr
.shift
== 0)
7203 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
7204 REGNO (addr
.offset
) - R0_REGNUM
);
7206 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
7207 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7210 case ADDRESS_REG_SXTW
:
7211 if (addr
.shift
== 0)
7212 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
7213 REGNO (addr
.offset
) - R0_REGNUM
);
7215 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
7216 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7219 case ADDRESS_REG_WB
:
7220 /* Writeback is only supported for fixed-width modes. */
7221 size
= GET_MODE_SIZE (mode
).to_constant ();
7222 switch (GET_CODE (x
))
7225 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
7228 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
7231 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
7234 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
7237 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
7238 INTVAL (addr
.offset
));
7241 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
7242 INTVAL (addr
.offset
));
7249 case ADDRESS_LO_SUM
:
7250 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
7251 output_addr_const (f
, addr
.offset
);
7252 asm_fprintf (f
, "]");
7255 case ADDRESS_SYMBOLIC
:
7256 output_addr_const (f
, x
);
/* Print address 'x' of a memory access with mode 'mode'.  */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
{
  if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
    output_addr_const (f, x);
}
7272 aarch64_label_mentioned_p (rtx x
)
7277 if (GET_CODE (x
) == LABEL_REF
)
7280 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7281 referencing instruction, but they are constant offsets, not
7283 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
7286 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
7287 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
7293 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
7294 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
7297 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
		       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
	return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
	{
	  rtx op0 = XEXP (base, 0);
	  rtx op1 = XEXP (base, 1);

	  /* Force any scaling into a temp for CSE.  */
	  op0 = force_reg (Pmode, op0);
	  op1 = force_reg (Pmode, op1);

	  /* Let the pointer register be in op0.  */
	  if (REG_POINTER (op1))
	    std::swap (op0, op1);

	  /* If the pointer is virtual or frame related, then we know that
	     virtual register instantiation or register elimination is going
	     to apply a second constant.  We want the two constants folded
	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
	  if (virt_or_elim_regno_p (REGNO (op0)))
	    {
	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
				   NULL_RTX, true, OPTAB_DIRECT);
	      return gen_rtx_PLUS (Pmode, base, op1);
	    }

	  /* Otherwise, in order to encourage CSE (and thence loop strength
	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
	  base = expand_binop (Pmode, add_optab, op0, op1,
			       NULL_RTX, true, OPTAB_DIRECT);
	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
	}

      HOST_WIDE_INT size;
      if (GET_MODE_SIZE (mode).is_constant (&size))
	{
	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
							     mode);
	  if (base_offset != 0)
	    {
	      base = plus_constant (Pmode, base, base_offset);
	      base = force_operand (base, NULL_RTX);
	      return plus_constant (Pmode, base, offset - base_offset);
	    }
	}
    }

  return x;
}
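/* Worked example (illustrative): legitimizing (plus (reg x1)
   (const_int 0x3456)) for an SImode access.  The offset is not 4-byte
   aligned (0x3456 & 3 == 2), so aarch64_anchor_offset falls back to the
   signed 9-bit unscaled range and returns (0x3456 + 0x100) & ~0x1ff ==
   0x3400.  The address then becomes (plus (plus x1 0x3400) 0x56), i.e.
   one add to form an anchor that nearby accesses can share, plus a small
   in-range displacement.  */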
7428 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
7431 secondary_reload_info
*sri
)
7433 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7434 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7435 comment at the head of aarch64-sve.md for more details about the
7436 big-endian handling. */
7437 if (BYTES_BIG_ENDIAN
7438 && reg_class_subset_p (rclass
, FP_REGS
)
7439 && !((REG_P (x
) && HARD_REGISTER_P (x
))
7440 || aarch64_simd_valid_immediate (x
, NULL
))
7441 && aarch64_sve_data_mode_p (mode
))
7443 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
7447 /* If we have to disable direct literal pool loads and stores because the
7448 function is too big, then we need a scratch register. */
7449 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
7450 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
7451 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
7452 && !aarch64_pcrelative_literal_loads
)
7454 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
7458 /* Without the TARGET_SIMD instructions we cannot move a Q register
7459 to a Q register directly. We need a scratch. */
7460 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
7461 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
7462 && reg_class_subset_p (rclass
, FP_REGS
))
7464 sri
->icode
= code_for_aarch64_reload_mov (mode
);
7468 /* A TFmode or TImode memory access should be handled via an FP_REGS
7469 because AArch64 has richer addressing modes for LDR/STR instructions
7470 than LDP/STP instructions. */
7471 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
7472 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
7475 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
7476 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;

  return true;
}
poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}

static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
7548 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
7550 rtx fnaddr
, mem
, a_tramp
;
7551 const int tramp_code_sz
= 16;
7553 /* Don't need to copy the trailing D-words, we fill those in below. */
7554 emit_block_move (m_tramp
, assemble_trampoline_template (),
7555 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
7556 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
7557 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
7558 if (GET_MODE (fnaddr
) != ptr_mode
)
7559 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
7560 emit_move_insn (mem
, fnaddr
);
7562 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
7563 emit_move_insn (mem
, chain_value
);
7565 /* XXX We should really define a "clear_cache" pattern and use
7566 gen_clear_cache(). */
7567 a_tramp
= XEXP (m_tramp
, 0);
7568 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
7569 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
7570 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
7574 static unsigned char
7575 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
7577 /* ??? Logically we should only need to provide a value when
7578 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7579 can hold MODE, but at the moment we need to handle all modes.
7580 Just ignore any runtime parts for registers that can't store them. */
7581 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
7585 case TAILCALL_ADDR_REGS
:
7589 case POINTER_AND_FP_REGS
:
7592 if (aarch64_sve_data_mode_p (mode
)
7593 && constant_multiple_p (GET_MODE_SIZE (mode
),
7594 BYTES_PER_SVE_VECTOR
, &nregs
))
7596 return (aarch64_vector_data_mode_p (mode
)
7597 ? CEIL (lowest_size
, UNITS_PER_VREG
)
7598 : CEIL (lowest_size
, UNITS_PER_WORD
));
7615 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
7617 if (regclass
== POINTER_REGS
)
7618 return GENERAL_REGS
;
7620 if (regclass
== STACK_REG
)
7623 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
7629 /* Register elimination can result in a request for
7630 SP+constant->FP_REGS. We cannot support such operations which
7631 use SP as source and an FP_REG as destination, so reject out
7633 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
7635 rtx lhs
= XEXP (x
, 0);
7637 /* Look through a possible SUBREG introduced by ILP32. */
7638 if (GET_CODE (lhs
) == SUBREG
)
7639 lhs
= SUBREG_REG (lhs
);
7641 gcc_assert (REG_P (lhs
));
7642 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
7651 aarch64_asm_output_labelref (FILE* f
, const char *name
)
7653 asm_fprintf (f
, "%U%s", name
);
7657 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
7659 if (priority
== DEFAULT_INIT_PRIORITY
)
7660 default_ctor_section_asm_out_constructor (symbol
, priority
);
7664 /* While priority is known to be in range [0, 65535], so 18 bytes
7665 would be enough, the compiler might not know that. To avoid
7666 -Wformat-truncation false positive, use a larger size. */
7668 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
7669 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7670 switch_to_section (s
);
7671 assemble_align (POINTER_SIZE
);
7672 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7677 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
7679 if (priority
== DEFAULT_INIT_PRIORITY
)
7680 default_dtor_section_asm_out_destructor (symbol
, priority
);
7684 /* While priority is known to be in range [0, 65535], so 18 bytes
7685 would be enough, the compiler might not know that. To avoid
7686 -Wformat-truncation false positive, use a larger size. */
7688 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
7689 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7690 switch_to_section (s
);
7691 assemble_align (POINTER_SIZE
);
7692 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7697 aarch64_output_casesi (rtx
*operands
)
7701 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
7703 static const char *const patterns
[4][2] =
7706 "ldrb\t%w3, [%0,%w1,uxtw]",
7707 "add\t%3, %4, %w3, sxtb #2"
7710 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7711 "add\t%3, %4, %w3, sxth #2"
7714 "ldr\t%w3, [%0,%w1,uxtw #2]",
7715 "add\t%3, %4, %w3, sxtw #2"
7717 /* We assume that DImode is only generated when not optimizing and
7718 that we don't really need 64-bit address offsets. That would
7719 imply an object file with 8GB of code in a single function! */
7721 "ldr\t%w3, [%0,%w1,uxtw #2]",
7722 "add\t%3, %4, %w3, sxtw #2"
7726 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
7728 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
7729 index
= exact_log2 (GET_MODE_SIZE (mode
));
7731 gcc_assert (index
>= 0 && index
<= 3);
7733 /* Need to implement table size reduction, by changing the code below. */
7734 output_asm_insn (patterns
[index
][0], operands
);
7735 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
7736 snprintf (buf
, sizeof (buf
),
7737 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
7738 output_asm_insn (buf
, operands
);
7739 output_asm_insn (patterns
[index
][1], operands
);
7740 output_asm_insn ("br\t%3", operands
);
7741 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
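/* Example (illustrative): aarch64_uxt_size (1, 0x1fe) returns 8, because
   0xff << 1 == 0x1fe; such an operand can be emitted as a UXTB extend
   with a left shift of 1, e.g. "add x0, x1, w2, uxtb #1".  */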
7766 /* Constant pools are per function only when PC relative
7767 literal loads are true or we are in the large memory
7771 aarch64_can_use_per_function_literal_pools_p (void)
7773 return (aarch64_pcrelative_literal_loads
7774 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
7778 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
7780 /* We can't use blocks for constants when we're using a per-function
7782 return !aarch64_can_use_per_function_literal_pools_p ();
7785 /* Select appropriate section for constants depending
7786 on where we place literal pools. */
7789 aarch64_select_rtx_section (machine_mode mode
,
7791 unsigned HOST_WIDE_INT align
)
7793 if (aarch64_can_use_per_function_literal_pools_p ())
7794 return function_section (current_function_decl
);
7796 return default_elf_select_rtx_section (mode
, x
, align
);
7799 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7801 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
7802 HOST_WIDE_INT offset
)
7804 /* When using per-function literal pools, we must ensure that any code
7805 section is aligned to the minimal instruction length, lest we get
7806 errors from the assembler re "unaligned instructions". */
7807 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
7808 ASM_OUTPUT_ALIGN (f
, 2);
7813 /* Helper function for rtx cost calculation. Strip a shift expression
7814 from X. Returns the inner operand if successful, or the original
7815 expression on failure. */
7817 aarch64_strip_shift (rtx x
)
7821 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7822 we can convert both to ROR during final output. */
7823 if ((GET_CODE (op
) == ASHIFT
7824 || GET_CODE (op
) == ASHIFTRT
7825 || GET_CODE (op
) == LSHIFTRT
7826 || GET_CODE (op
) == ROTATERT
7827 || GET_CODE (op
) == ROTATE
)
7828 && CONST_INT_P (XEXP (op
, 1)))
7829 return XEXP (op
, 0);
7831 if (GET_CODE (op
) == MULT
7832 && CONST_INT_P (XEXP (op
, 1))
7833 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
7834 return XEXP (op
, 0);
7839 /* Helper function for rtx cost calculation. Strip an extend
7840 expression from X. Returns the inner operand if successful, or the
7841 original expression on failure. We deal with a number of possible
7842 canonicalization variations here. If STRIP_SHIFT is true, then
7843 we can strip off a shift also. */
7845 aarch64_strip_extend (rtx x
, bool strip_shift
)
7847 scalar_int_mode mode
;
7850 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
7853 /* Zero and sign extraction of a widened value. */
7854 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
7855 && XEXP (op
, 2) == const0_rtx
7856 && GET_CODE (XEXP (op
, 0)) == MULT
7857 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
7859 return XEXP (XEXP (op
, 0), 0);
7861 /* It can also be represented (for zero-extend) as an AND with an
7863 if (GET_CODE (op
) == AND
7864 && GET_CODE (XEXP (op
, 0)) == MULT
7865 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
7866 && CONST_INT_P (XEXP (op
, 1))
7867 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
7868 INTVAL (XEXP (op
, 1))) != 0)
7869 return XEXP (XEXP (op
, 0), 0);
7871 /* Now handle extended register, as this may also have an optional
7872 left shift by 1..4. */
7874 && GET_CODE (op
) == ASHIFT
7875 && CONST_INT_P (XEXP (op
, 1))
7876 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
7879 if (GET_CODE (op
) == ZERO_EXTEND
7880 || GET_CODE (op
) == SIGN_EXTEND
)
7889 /* Return true iff CODE is a shift supported in combination
7890 with arithmetic instructions. */
7893 aarch64_shift_p (enum rtx_code code
)
7895 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
7899 /* Return true iff X is a cheap shift without a sign extend. */
7902 aarch64_cheap_mult_shift_p (rtx x
)
7909 if (!(aarch64_tune_params
.extra_tuning_flags
7910 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
7913 if (GET_CODE (op0
) == SIGN_EXTEND
)
7916 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
7917 && UINTVAL (op1
) <= 4)
7920 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
7923 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
7925 if (l2
> 0 && l2
<= 4)
7931 /* Helper function for rtx cost calculation. Calculate the cost of
7932 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7933 Return the calculated cost of the expression, recursing manually in to
7934 operands where needed. */
7937 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
7940 const struct cpu_cost_table
*extra_cost
7941 = aarch64_tune_params
.insn_extra_cost
;
7943 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
7944 machine_mode mode
= GET_MODE (x
);
7946 gcc_checking_assert (code
== MULT
);
7951 if (VECTOR_MODE_P (mode
))
7952 mode
= GET_MODE_INNER (mode
);
7954 /* Integer multiply/fma. */
7955 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7957 /* The multiply will be canonicalized as a shift, cost it as such. */
7958 if (aarch64_shift_p (GET_CODE (x
))
7959 || (CONST_INT_P (op1
)
7960 && exact_log2 (INTVAL (op1
)) > 0))
7962 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
7963 || GET_CODE (op0
) == SIGN_EXTEND
;
7968 /* If the shift is considered cheap,
7969 then don't add any cost. */
7970 if (aarch64_cheap_mult_shift_p (x
))
7972 else if (REG_P (op1
))
7973 /* ARITH + shift-by-register. */
7974 cost
+= extra_cost
->alu
.arith_shift_reg
;
7976 /* ARITH + extended register. We don't have a cost field
7977 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7978 cost
+= extra_cost
->alu
.extend_arith
;
7980 /* ARITH + shift-by-immediate. */
7981 cost
+= extra_cost
->alu
.arith_shift
;
7984 /* LSL (immediate). */
7985 cost
+= extra_cost
->alu
.shift
;
7988 /* Strip extends as we will have costed them in the case above. */
7990 op0
= aarch64_strip_extend (op0
, true);
7992 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
7997 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7998 compound and let the below cases handle it. After all, MNEG is a
7999 special-case alias of MSUB. */
8000 if (GET_CODE (op0
) == NEG
)
8002 op0
= XEXP (op0
, 0);
8006 /* Integer multiplies or FMAs have zero/sign extending variants. */
8007 if ((GET_CODE (op0
) == ZERO_EXTEND
8008 && GET_CODE (op1
) == ZERO_EXTEND
)
8009 || (GET_CODE (op0
) == SIGN_EXTEND
8010 && GET_CODE (op1
) == SIGN_EXTEND
))
8012 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
8013 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
8018 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8019 cost
+= extra_cost
->mult
[0].extend_add
;
8021 /* MUL/SMULL/UMULL. */
8022 cost
+= extra_cost
->mult
[0].extend
;
8028 /* This is either an integer multiply or a MADD. In both cases
8029 we want to recurse and cost the operands. */
8030 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8031 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8037 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
8040 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
8049 /* Floating-point FMA/FMUL can also support negations of the
8050 operands, unless the rounding mode is upward or downward in
8051 which case FNMUL is different than FMUL with operand negation. */
8052 bool neg0
= GET_CODE (op0
) == NEG
;
8053 bool neg1
= GET_CODE (op1
) == NEG
;
8054 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8057 op0
= XEXP (op0
, 0);
8059 op1
= XEXP (op1
, 0);
8063 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8064 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
8067 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
8070 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8071 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8077 aarch64_address_cost (rtx x
,
8079 addr_space_t as ATTRIBUTE_UNUSED
,
8082 enum rtx_code c
= GET_CODE (x
);
8083 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
8084 struct aarch64_address_info info
;
8088 if (!aarch64_classify_address (&info
, x
, mode
, false))
8090 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
8092 /* This is a CONST or SYMBOL ref which will be split
8093 in a different way depending on the code model in use.
8094 Cost it through the generic infrastructure. */
8095 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
8096 /* Divide through by the cost of one instruction to
8097 bring it to the same units as the address costs. */
8098 cost_symbol_ref
/= COSTS_N_INSNS (1);
8099 /* The cost is then the cost of preparing the address,
8100 followed by an immediate (possibly 0) offset. */
8101 return cost_symbol_ref
+ addr_cost
->imm_offset
;
8105 /* This is most likely a jump table from a case
8107 return addr_cost
->register_offset
;
8113 case ADDRESS_LO_SUM
:
8114 case ADDRESS_SYMBOLIC
:
8115 case ADDRESS_REG_IMM
:
8116 cost
+= addr_cost
->imm_offset
;
8119 case ADDRESS_REG_WB
:
8120 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
8121 cost
+= addr_cost
->pre_modify
;
8122 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
8123 cost
+= addr_cost
->post_modify
;
8129 case ADDRESS_REG_REG
:
8130 cost
+= addr_cost
->register_offset
;
8133 case ADDRESS_REG_SXTW
:
8134 cost
+= addr_cost
->register_sextend
;
8137 case ADDRESS_REG_UXTW
:
8138 cost
+= addr_cost
->register_zextend
;
8148 /* For the sake of calculating the cost of the shifted register
8149 component, we can treat same sized modes in the same way. */
8150 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
8151 cost
+= addr_cost
->addr_scale_costs
.hi
;
8152 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
8153 cost
+= addr_cost
->addr_scale_costs
.si
;
8154 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
8155 cost
+= addr_cost
->addr_scale_costs
.di
;
8157 /* We can't tell, or this is a 128-bit vector. */
8158 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
8181 /* Return true if the RTX X in mode MODE is a zero or sign extract
8182 usable in an ADD or SUB (extended register) instruction. */
8184 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
8186 /* Catch add with a sign extract.
8187 This is add_<optab><mode>_multp2. */
8188 if (GET_CODE (x
) == SIGN_EXTRACT
8189 || GET_CODE (x
) == ZERO_EXTRACT
)
8191 rtx op0
= XEXP (x
, 0);
8192 rtx op1
= XEXP (x
, 1);
8193 rtx op2
= XEXP (x
, 2);
8195 if (GET_CODE (op0
) == MULT
8196 && CONST_INT_P (op1
)
8197 && op2
== const0_rtx
8198 && CONST_INT_P (XEXP (op0
, 1))
8199 && aarch64_is_extend_from_extract (mode
,
8206 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8208 else if (GET_CODE (x
) == SIGN_EXTEND
8209 || GET_CODE (x
) == ZERO_EXTEND
)
8210 return REG_P (XEXP (x
, 0));
8216 aarch64_frint_unspec_p (unsigned int u
)
8234 /* Return true iff X is an rtx that will match an extr instruction
8235 i.e. as described in the *extr<mode>5_insn family of patterns.
8236 OP0 and OP1 will be set to the operands of the shifts involved
8237 on success and will be NULL_RTX otherwise. */
8240 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
8243 scalar_int_mode mode
;
8244 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
8247 *res_op0
= NULL_RTX
;
8248 *res_op1
= NULL_RTX
;
8250 if (GET_CODE (x
) != IOR
)
8256 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
8257 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
8259 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8260 if (GET_CODE (op1
) == ASHIFT
)
8261 std::swap (op0
, op1
);
8263 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
8266 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
8267 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
8269 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
8270 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
8272 *res_op0
= XEXP (op0
, 0);
8273 *res_op1
= XEXP (op1
, 0);
8281 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8282 storing it in *COST. Result is true if the total cost of the operation
8283 has now been calculated. */
8285 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
8289 enum rtx_code cmpcode
;
8291 if (COMPARISON_P (op0
))
8293 inner
= XEXP (op0
, 0);
8294 comparator
= XEXP (op0
, 1);
8295 cmpcode
= GET_CODE (op0
);
8300 comparator
= const0_rtx
;
8304 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
8306 /* Conditional branch. */
8307 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8311 if (cmpcode
== NE
|| cmpcode
== EQ
)
8313 if (comparator
== const0_rtx
)
8315 /* TBZ/TBNZ/CBZ/CBNZ. */
8316 if (GET_CODE (inner
) == ZERO_EXTRACT
)
8318 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
8319 ZERO_EXTRACT
, 0, speed
);
8322 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
8327 else if (cmpcode
== LT
|| cmpcode
== GE
)
8330 if (comparator
== const0_rtx
)
8335 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8338 if (GET_CODE (op1
) == COMPARE
)
8340 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8341 if (XEXP (op1
, 1) == const0_rtx
)
8345 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
8346 const struct cpu_cost_table
*extra_cost
8347 = aarch64_tune_params
.insn_extra_cost
;
8349 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8350 *cost
+= extra_cost
->alu
.arith
;
8352 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8357 /* It's a conditional operation based on the status flags,
8358 so it must be some flavor of CSEL. */
8360 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8361 if (GET_CODE (op1
) == NEG
8362 || GET_CODE (op1
) == NOT
8363 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
8364 op1
= XEXP (op1
, 0);
8365 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
8367 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8368 op1
= XEXP (op1
, 0);
8369 op2
= XEXP (op2
, 0);
8372 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
8373 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
8377 /* We don't know what this is, cost all operands. */
8381 /* Check whether X is a bitfield operation of the form shift + extend that
8382 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8383 operand to which the bitfield operation is applied. Otherwise return
8387 aarch64_extend_bitfield_pattern_p (rtx x
)
8389 rtx_code outer_code
= GET_CODE (x
);
8390 machine_mode outer_mode
= GET_MODE (x
);
8392 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
8393 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
8396 rtx inner
= XEXP (x
, 0);
8397 rtx_code inner_code
= GET_CODE (inner
);
8398 machine_mode inner_mode
= GET_MODE (inner
);
8404 if (CONST_INT_P (XEXP (inner
, 1))
8405 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8406 op
= XEXP (inner
, 0);
8409 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8410 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8411 op
= XEXP (inner
, 0);
8414 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8415 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8416 op
= XEXP (inner
, 0);
8425 /* Return true if the mask and a shift amount from an RTX of the form
8426 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8427 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8430 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
8433 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
8434 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
8435 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
8436 && (INTVAL (mask
) & ((1 << INTVAL (shft_amnt
)) - 1)) == 0;
8439 /* Calculate the cost of calculating X, storing it in *COST. Result
8440 is true if the total cost of the operation has now been calculated. */
8442 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
8443 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
8446 const struct cpu_cost_table
*extra_cost
8447 = aarch64_tune_params
.insn_extra_cost
;
8448 int code
= GET_CODE (x
);
8449 scalar_int_mode int_mode
;
8451 /* By default, assume that everything has equivalent cost to the
8452 cheapest instruction. Any additional costs are applied as a delta
8453 above this default. */
8454 *cost
= COSTS_N_INSNS (1);
8459 /* The cost depends entirely on the operands to SET. */
8464 switch (GET_CODE (op0
))
8469 rtx address
= XEXP (op0
, 0);
8470 if (VECTOR_MODE_P (mode
))
8471 *cost
+= extra_cost
->ldst
.storev
;
8472 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8473 *cost
+= extra_cost
->ldst
.store
;
8474 else if (mode
== SFmode
)
8475 *cost
+= extra_cost
->ldst
.storef
;
8476 else if (mode
== DFmode
)
8477 *cost
+= extra_cost
->ldst
.stored
;
8480 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8484 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8488 if (! REG_P (SUBREG_REG (op0
)))
8489 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
8493 /* The cost is one per vector-register copied. */
8494 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
8496 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
8497 *cost
= COSTS_N_INSNS (nregs
);
8499 /* const0_rtx is in general free, but we will use an
8500 instruction to set a register to 0. */
8501 else if (REG_P (op1
) || op1
== const0_rtx
)
8503 /* The cost is 1 per register copied. */
8504 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
8505 *cost
= COSTS_N_INSNS (nregs
);
8508 /* Cost is just the cost of the RHS of the set. */
8509 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8514 /* Bit-field insertion. Strip any redundant widening of
8515 the RHS to meet the width of the target. */
8516 if (GET_CODE (op1
) == SUBREG
)
8517 op1
= SUBREG_REG (op1
);
8518 if ((GET_CODE (op1
) == ZERO_EXTEND
8519 || GET_CODE (op1
) == SIGN_EXTEND
)
8520 && CONST_INT_P (XEXP (op0
, 1))
8521 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
8522 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
8523 op1
= XEXP (op1
, 0);
8525 if (CONST_INT_P (op1
))
8527 /* MOV immediate is assumed to always be cheap. */
8528 *cost
= COSTS_N_INSNS (1);
8534 *cost
+= extra_cost
->alu
.bfi
;
8535 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
8541 /* We can't make sense of this, assume default cost. */
8542 *cost
= COSTS_N_INSNS (1);
8548 /* If an instruction can incorporate a constant within the
8549 instruction, the instruction's expression avoids calling
8550 rtx_cost() on the constant. If rtx_cost() is called on a
8551 constant, then it is usually because the constant must be
8552 moved into a register by one or more instructions.
8554 The exception is constant 0, which can be expressed
8555 as XZR/WZR and is therefore free. The exception to this is
8556 if we have (set (reg) (const0_rtx)) in which case we must cost
8557 the move. However, we can catch that when we cost the SET, so
8558 we don't need to consider that here. */
8559 if (x
== const0_rtx
)
8563 /* To an approximation, building any other constant is
8564 proportionally expensive to the number of instructions
8565 required to build that constant. This is true whether we
8566 are compiling for SPEED or otherwise. */
8567 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8568 int_mode
= word_mode
;
8569 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
8570 (NULL_RTX
, x
, false, int_mode
));
8576 /* First determine number of instructions to do the move
8577 as an integer constant. */
8578 if (!aarch64_float_const_representable_p (x
)
8579 && !aarch64_can_const_movi_rtx_p (x
, mode
)
8580 && aarch64_float_const_rtx_p (x
))
8582 unsigned HOST_WIDE_INT ival
;
8583 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
8584 gcc_assert (succeed
);
8586 scalar_int_mode imode
= (mode
== HFmode
8588 : int_mode_for_mode (mode
).require ());
8589 int ncost
= aarch64_internal_mov_immediate
8590 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
8591 *cost
+= COSTS_N_INSNS (ncost
);
8597 /* mov[df,sf]_aarch64. */
8598 if (aarch64_float_const_representable_p (x
))
8599 /* FMOV (scalar immediate). */
8600 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
8601 else if (!aarch64_float_const_zero_rtx_p (x
))
8603 /* This will be a load from memory. */
8605 *cost
+= extra_cost
->ldst
.loadd
;
8607 *cost
+= extra_cost
->ldst
.loadf
;
8610 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8611 or MOV v0.s[0], wzr - neither of which are modeled by the
8612 cost tables. Just use the default cost. */
8622 /* For loads we want the base cost of a load, plus an
8623 approximation for the additional cost of the addressing
8625 rtx address
= XEXP (x
, 0);
8626 if (VECTOR_MODE_P (mode
))
8627 *cost
+= extra_cost
->ldst
.loadv
;
8628 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8629 *cost
+= extra_cost
->ldst
.load
;
8630 else if (mode
== SFmode
)
8631 *cost
+= extra_cost
->ldst
.loadf
;
8632 else if (mode
== DFmode
)
8633 *cost
+= extra_cost
->ldst
.loadd
;
8636 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8645 if (VECTOR_MODE_P (mode
))
8650 *cost
+= extra_cost
->vect
.alu
;
8655 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8657 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8658 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8661 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
8665 /* Cost this as SUB wzr, X. */
8666 op0
= CONST0_RTX (mode
);
8671 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8673 /* Support (neg(fma...)) as a single instruction only if
8674 sign of zeros is unimportant. This matches the decision
8675 making in aarch64.md. */
8676 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
8679 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8682 if (GET_CODE (op0
) == MULT
)
8685 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8690 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8700 if (VECTOR_MODE_P (mode
))
8701 *cost
+= extra_cost
->vect
.alu
;
8703 *cost
+= extra_cost
->alu
.clz
;
8712 if (op1
== const0_rtx
8713 && GET_CODE (op0
) == AND
)
8716 mode
= GET_MODE (op0
);
8720 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
8722 /* TODO: A write to the CC flags possibly costs extra, this
8723 needs encoding in the cost tables. */
8725 mode
= GET_MODE (op0
);
8727 if (GET_CODE (op0
) == AND
)
8733 if (GET_CODE (op0
) == PLUS
)
8735 /* ADDS (and CMN alias). */
8740 if (GET_CODE (op0
) == MINUS
)
8747 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
8748 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
8749 && CONST_INT_P (XEXP (op0
, 2)))
8751 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8752 Handle it here directly rather than going to cost_logic
8753 since we know the immediate generated for the TST is valid
8754 so we can avoid creating an intermediate rtx for it only
8755 for costing purposes. */
8757 *cost
+= extra_cost
->alu
.logical
;
8759 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
8760 ZERO_EXTRACT
, 0, speed
);
8764 if (GET_CODE (op1
) == NEG
)
8768 *cost
+= extra_cost
->alu
.arith
;
8770 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
8771 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
8777 Compare can freely swap the order of operands, and
8778 canonicalization puts the more complex operation first.
8779 But the integer MINUS logic expects the shift/extend
8780 operation in op1. */
8782 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
8790 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
8794 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8796 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
8798 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
8799 /* FCMP supports constant 0.0 for no extra cost. */
8805 if (VECTOR_MODE_P (mode
))
8807 /* Vector compare. */
8809 *cost
+= extra_cost
->vect
.alu
;
8811 if (aarch64_float_const_zero_rtx_p (op1
))
8813 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8827 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
8829 /* Detect valid immediates. */
8830 if ((GET_MODE_CLASS (mode
) == MODE_INT
8831 || (GET_MODE_CLASS (mode
) == MODE_CC
8832 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
8833 && CONST_INT_P (op1
)
8834 && aarch64_uimm12_shift (INTVAL (op1
)))
8837 /* SUB(S) (immediate). */
8838 *cost
+= extra_cost
->alu
.arith
;
8842 /* Look for SUB (extended register). */
8843 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8844 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
8847 *cost
+= extra_cost
->alu
.extend_arith
;
8849 op1
= aarch64_strip_extend (op1
, true);
8850 *cost
+= rtx_cost (op1
, VOIDmode
,
8851 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
8855 rtx new_op1
= aarch64_strip_extend (op1
, false);
8857 /* Cost this as an FMA-alike operation. */
8858 if ((GET_CODE (new_op1
) == MULT
8859 || aarch64_shift_p (GET_CODE (new_op1
)))
8862 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
8863 (enum rtx_code
) code
,
8868 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
8872 if (VECTOR_MODE_P (mode
))
8875 *cost
+= extra_cost
->vect
.alu
;
8877 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8880 *cost
+= extra_cost
->alu
.arith
;
8882 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8885 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8899 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8900 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8903 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
8904 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8908 if (GET_MODE_CLASS (mode
) == MODE_INT
8909 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
8910 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
8912 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
8915 /* ADD (immediate). */
8916 *cost
+= extra_cost
->alu
.arith
;
8920 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8922 /* Look for ADD (extended register). */
8923 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8924 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
8927 *cost
+= extra_cost
->alu
.extend_arith
;
8929 op0
= aarch64_strip_extend (op0
, true);
8930 *cost
+= rtx_cost (op0
, VOIDmode
,
8931 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
8935 /* Strip any extend, leave shifts behind as we will
8936 cost them through mult_cost. */
8937 new_op0
= aarch64_strip_extend (op0
, false);
8939 if (GET_CODE (new_op0
) == MULT
8940 || aarch64_shift_p (GET_CODE (new_op0
)))
8942 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
8947 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
8951 if (VECTOR_MODE_P (mode
))
8954 *cost
+= extra_cost
->vect
.alu
;
8956 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8959 *cost
+= extra_cost
->alu
.arith
;
8961 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8964 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8971 *cost
= COSTS_N_INSNS (1);
8975 if (VECTOR_MODE_P (mode
))
8976 *cost
+= extra_cost
->vect
.alu
;
8978 *cost
+= extra_cost
->alu
.rev
;
8983 if (aarch_rev16_p (x
))
8985 *cost
= COSTS_N_INSNS (1);
8989 if (VECTOR_MODE_P (mode
))
8990 *cost
+= extra_cost
->vect
.alu
;
8992 *cost
+= extra_cost
->alu
.rev
;
8997 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
8999 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
9000 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
9002 *cost
+= extra_cost
->alu
.shift
;
9013 if (VECTOR_MODE_P (mode
))
9016 *cost
+= extra_cost
->vect
.alu
;
9021 && GET_CODE (op0
) == MULT
9022 && CONST_INT_P (XEXP (op0
, 1))
9023 && CONST_INT_P (op1
)
9024 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
9027 /* This is a UBFM/SBFM. */
9028 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
9030 *cost
+= extra_cost
->alu
.bfx
;
9034 if (is_int_mode (mode
, &int_mode
))
9036 if (CONST_INT_P (op1
))
9038 /* We have a mask + shift version of a UBFIZ
9039 i.e. the *andim_ashift<mode>_bfiz pattern. */
9040 if (GET_CODE (op0
) == ASHIFT
9041 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
9044 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
9045 (enum rtx_code
) code
, 0, speed
);
9047 *cost
+= extra_cost
->alu
.bfx
;
9051 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
9053 /* We possibly get the immediate for free, this is not
9055 *cost
+= rtx_cost (op0
, int_mode
,
9056 (enum rtx_code
) code
, 0, speed
);
9058 *cost
+= extra_cost
->alu
.logical
;
9067 /* Handle ORN, EON, or BIC. */
9068 if (GET_CODE (op0
) == NOT
)
9069 op0
= XEXP (op0
, 0);
9071 new_op0
= aarch64_strip_shift (op0
);
9073 /* If we had a shift on op0 then this is a logical-shift-
9074 by-register/immediate operation. Otherwise, this is just
9075 a logical operation. */
9080 /* Shift by immediate. */
9081 if (CONST_INT_P (XEXP (op0
, 1)))
9082 *cost
+= extra_cost
->alu
.log_shift
;
9084 *cost
+= extra_cost
->alu
.log_shift_reg
;
9087 *cost
+= extra_cost
->alu
.logical
;
9090 /* In both cases we want to cost both operands. */
9091 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
9093 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
9103 op0
= aarch64_strip_shift (x
);
9105 if (VECTOR_MODE_P (mode
))
9108 *cost
+= extra_cost
->vect
.alu
;
9112 /* MVN-shifted-reg. */
9115 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9118 *cost
+= extra_cost
->alu
.log_shift
;
9122 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9123 Handle the second form here taking care that 'a' in the above can
9125 else if (GET_CODE (op0
) == XOR
)
9127 rtx newop0
= XEXP (op0
, 0);
9128 rtx newop1
= XEXP (op0
, 1);
9129 rtx op0_stripped
= aarch64_strip_shift (newop0
);
9131 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
9132 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
9136 if (op0_stripped
!= newop0
)
9137 *cost
+= extra_cost
->alu
.log_shift
;
9139 *cost
+= extra_cost
->alu
.logical
;
9146 *cost
+= extra_cost
->alu
.logical
;
9153 /* If a value is written in SI mode, then zero extended to DI
9154 mode, the operation will in general be free as a write to
9155 a 'w' register implicitly zeroes the upper bits of an 'x'
9156 register. However, if this is
9158 (set (reg) (zero_extend (reg)))
9160 we must cost the explicit register move. */
9162 && GET_MODE (op0
) == SImode
9165 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
9167 /* If OP_COST is non-zero, then the cost of the zero extend
9168 is effectively the cost of the inner operation. Otherwise
9169 we have a MOV instruction and we take the cost from the MOV
9170 itself. This is true independently of whether we are
9171 optimizing for space or time. */
9177 else if (MEM_P (op0
))
9179 /* All loads can zero extend to any size for free. */
9180 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
9184 op0
= aarch64_extend_bitfield_pattern_p (x
);
9187 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
9189 *cost
+= extra_cost
->alu
.bfx
;
9195 if (VECTOR_MODE_P (mode
))
9198 *cost
+= extra_cost
->vect
.alu
;
9202 /* We generate an AND instead of UXTB/UXTH. */
9203 *cost
+= extra_cost
->alu
.logical
;
9209 if (MEM_P (XEXP (x
, 0)))
9214 rtx address
= XEXP (XEXP (x
, 0), 0);
9215 *cost
+= extra_cost
->ldst
.load_sign_extend
;
9218 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9224 op0
= aarch64_extend_bitfield_pattern_p (x
);
9227 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
9229 *cost
+= extra_cost
->alu
.bfx
;
9235 if (VECTOR_MODE_P (mode
))
9236 *cost
+= extra_cost
->vect
.alu
;
9238 *cost
+= extra_cost
->alu
.extend
;
9246 if (CONST_INT_P (op1
))
9250 if (VECTOR_MODE_P (mode
))
9252 /* Vector shift (immediate). */
9253 *cost
+= extra_cost
->vect
.alu
;
9257 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9259 *cost
+= extra_cost
->alu
.shift
;
9263 /* We can incorporate zero/sign extend for free. */
9264 if (GET_CODE (op0
) == ZERO_EXTEND
9265 || GET_CODE (op0
) == SIGN_EXTEND
)
9266 op0
= XEXP (op0
, 0);
9268 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
9273 if (VECTOR_MODE_P (mode
))
9276 /* Vector shift (register). */
9277 *cost
+= extra_cost
->vect
.alu
;
9283 *cost
+= extra_cost
->alu
.shift_reg
;
9285 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9286 && CONST_INT_P (XEXP (op1
, 1))
9287 && known_eq (INTVAL (XEXP (op1
, 1)),
9288 GET_MODE_BITSIZE (mode
) - 1))
9290 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9291 /* We already demanded XEXP (op1, 0) to be REG_P, so
9292 don't recurse into it. */
9296 return false; /* All arguments need to be in registers. */
9306 if (CONST_INT_P (op1
))
9308 /* ASR (immediate) and friends. */
9311 if (VECTOR_MODE_P (mode
))
9312 *cost
+= extra_cost
->vect
.alu
;
9314 *cost
+= extra_cost
->alu
.shift
;
9317 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9322 if (VECTOR_MODE_P (mode
))
9325 /* Vector shift (register). */
9326 *cost
+= extra_cost
->vect
.alu
;
9331 /* ASR (register) and friends. */
9332 *cost
+= extra_cost
->alu
.shift_reg
;
9334 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9335 && CONST_INT_P (XEXP (op1
, 1))
9336 && known_eq (INTVAL (XEXP (op1
, 1)),
9337 GET_MODE_BITSIZE (mode
) - 1))
9339 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9340 /* We already demanded XEXP (op1, 0) to be REG_P, so
9341 don't recurse into it. */
9345 return false; /* All arguments need to be in registers. */
9350 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
9351 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
9355 *cost
+= extra_cost
->ldst
.load
;
9357 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
9358 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
9360 /* ADRP, followed by ADD. */
9361 *cost
+= COSTS_N_INSNS (1);
9363 *cost
+= 2 * extra_cost
->alu
.arith
;
9365 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
9366 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
9370 *cost
+= extra_cost
->alu
.arith
;
9375 /* One extra load instruction, after accessing the GOT. */
9376 *cost
+= COSTS_N_INSNS (1);
9378 *cost
+= extra_cost
->ldst
.load
;
9384 /* ADRP/ADD (immediate). */
9386 *cost
+= extra_cost
->alu
.arith
;
9394 if (VECTOR_MODE_P (mode
))
9395 *cost
+= extra_cost
->vect
.alu
;
9397 *cost
+= extra_cost
->alu
.bfx
;
9400 /* We can trust that the immediates used will be correct (there
9401 are no by-register forms), so we need only cost op0. */
9402 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9406 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
9407 /* aarch64_rtx_mult_cost always handles recursion to its
9412 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9413 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9414 an unconditional negate. This case should only ever be reached through
9415 the set_smod_pow2_cheap check in expmed.c. */
9416 if (CONST_INT_P (XEXP (x
, 1))
9417 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
9418 && (mode
== SImode
|| mode
== DImode
))
9420 /* We expand to 4 instructions. Reset the baseline. */
9421 *cost
= COSTS_N_INSNS (4);
9424 *cost
+= 2 * extra_cost
->alu
.logical
9425 + 2 * extra_cost
->alu
.arith
;
9434 /* Slighly prefer UMOD over SMOD. */
9435 if (VECTOR_MODE_P (mode
))
9436 *cost
+= extra_cost
->vect
.alu
;
9437 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9438 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
9439 + extra_cost
->mult
[mode
== DImode
].idiv
9440 + (code
== MOD
? 1 : 0));
9442 return false; /* All arguments need to be in registers. */
9449 if (VECTOR_MODE_P (mode
))
9450 *cost
+= extra_cost
->vect
.alu
;
9451 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9452 /* There is no integer SQRT, so only DIV and UDIV can get
9454 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
9455 /* Slighly prefer UDIV over SDIV. */
9456 + (code
== DIV
? 1 : 0));
9458 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
9460 return false; /* All arguments need to be in registers. */
9463 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
9464 XEXP (x
, 2), cost
, speed
);
9477 return false; /* All arguments must be in registers. */
9486 if (VECTOR_MODE_P (mode
))
9487 *cost
+= extra_cost
->vect
.alu
;
9489 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9492 /* FMSUB, FNMADD, and FNMSUB are free. */
9493 if (GET_CODE (op0
) == NEG
)
9494 op0
= XEXP (op0
, 0);
9496 if (GET_CODE (op2
) == NEG
)
9497 op2
= XEXP (op2
, 0);
9499 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9500 and the by-element operand as operand 0. */
9501 if (GET_CODE (op1
) == NEG
)
9502 op1
= XEXP (op1
, 0);
9504 /* Catch vector-by-element operations. The by-element operand can
9505 either be (vec_duplicate (vec_select (x))) or just
9506 (vec_select (x)), depending on whether we are multiplying by
9507 a vector or a scalar.
9509 Canonicalization is not very good in these cases, FMA4 will put the
9510 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9511 if (GET_CODE (op0
) == VEC_DUPLICATE
)
9512 op0
= XEXP (op0
, 0);
9513 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
9514 op1
= XEXP (op1
, 0);
9516 if (GET_CODE (op0
) == VEC_SELECT
)
9517 op0
= XEXP (op0
, 0);
9518 else if (GET_CODE (op1
) == VEC_SELECT
)
9519 op1
= XEXP (op1
, 0);
9521 /* If the remaining parameters are not registers,
9522 get the cost to put them into registers. */
9523 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
9524 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
9525 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
9529 case UNSIGNED_FLOAT
:
9531 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
9537 if (VECTOR_MODE_P (mode
))
9539 /*Vector truncate. */
9540 *cost
+= extra_cost
->vect
.alu
;
9543 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
9547 case FLOAT_TRUNCATE
:
9550 if (VECTOR_MODE_P (mode
))
9552 /*Vector conversion. */
9553 *cost
+= extra_cost
->vect
.alu
;
9556 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
9563 /* Strip the rounding part. They will all be implemented
9564 by the fcvt* family of instructions anyway. */
9565 if (GET_CODE (x
) == UNSPEC
)
9567 unsigned int uns_code
= XINT (x
, 1);
9569 if (uns_code
== UNSPEC_FRINTA
9570 || uns_code
== UNSPEC_FRINTM
9571 || uns_code
== UNSPEC_FRINTN
9572 || uns_code
== UNSPEC_FRINTP
9573 || uns_code
== UNSPEC_FRINTZ
)
9574 x
= XVECEXP (x
, 0, 0);
9579 if (VECTOR_MODE_P (mode
))
9580 *cost
+= extra_cost
->vect
.alu
;
9582 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
9585 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9586 fixed-point fcvt. */
9587 if (GET_CODE (x
) == MULT
9588 && ((VECTOR_MODE_P (mode
)
9589 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
9590 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
9592 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
9597 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9601 if (VECTOR_MODE_P (mode
))
9605 *cost
+= extra_cost
->vect
.alu
;
9607 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9611 /* FABD, which is analogous to FADD. */
9612 if (GET_CODE (op0
) == MINUS
)
9614 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
9615 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
9617 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9621 /* Simple FABS is analogous to FNEG. */
9623 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9627 /* Integer ABS will either be split to
9628 two arithmetic instructions, or will be an ABS
9629 (scalar), which we don't model. */
9630 *cost
= COSTS_N_INSNS (2);
9632 *cost
+= 2 * extra_cost
->alu
.arith
;
9640 if (VECTOR_MODE_P (mode
))
9641 *cost
+= extra_cost
->vect
.alu
;
9644 /* FMAXNM/FMINNM/FMAX/FMIN.
9645 TODO: This may not be accurate for all implementations, but
9646 we do not model this in the cost tables. */
9647 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9653 /* The floating point round to integer frint* instructions. */
9654 if (aarch64_frint_unspec_p (XINT (x
, 1)))
9657 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
9662 if (XINT (x
, 1) == UNSPEC_RBIT
)
9665 *cost
+= extra_cost
->alu
.rev
;
9673 /* Decompose <su>muldi3_highpart. */
9674 if (/* (truncate:DI */
9677 && GET_MODE (XEXP (x
, 0)) == TImode
9678 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
9680 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
9681 /* (ANY_EXTEND:TI (reg:DI))
9682 (ANY_EXTEND:TI (reg:DI))) */
9683 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
9684 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
9685 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
9686 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
9687 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
9688 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
9689 /* (const_int 64) */
9690 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
9691 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
9695 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
9696 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
9697 mode
, MULT
, 0, speed
);
9698 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
9699 mode
, MULT
, 1, speed
);
9709 && flag_aarch64_verbose_cost
)
9711 "\nFailed to cost RTX. Assuming default cost.\n");
9716 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9717 calculated for X. This cost is stored in *COST. Returns true
9718 if the total cost of X was calculated. */
9720 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
9721 int param
, int *cost
, bool speed
)
9723 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
9726 && flag_aarch64_verbose_cost
)
9728 print_rtl_single (dump_file
, x
);
9729 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
9730 speed
? "Hot" : "Cold",
9731 *cost
, result
? "final" : "partial");
9738 aarch64_register_move_cost (machine_mode mode
,
9739 reg_class_t from_i
, reg_class_t to_i
)
9741 enum reg_class from
= (enum reg_class
) from_i
;
9742 enum reg_class to
= (enum reg_class
) to_i
;
9743 const struct cpu_regmove_cost
*regmove_cost
9744 = aarch64_tune_params
.regmove_cost
;
9746 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9747 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
9750 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
9751 from
= GENERAL_REGS
;
9753 /* Moving between GPR and stack cost is the same as GP2GP. */
9754 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
9755 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
9756 return regmove_cost
->GP2GP
;
9758 /* To/From the stack register, we move via the gprs. */
9759 if (to
== STACK_REG
|| from
== STACK_REG
)
9760 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
9761 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
9763 if (known_eq (GET_MODE_SIZE (mode
), 16))
9765 /* 128-bit operations on general registers require 2 instructions. */
9766 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
9767 return regmove_cost
->GP2GP
* 2;
9768 else if (from
== GENERAL_REGS
)
9769 return regmove_cost
->GP2FP
* 2;
9770 else if (to
== GENERAL_REGS
)
9771 return regmove_cost
->FP2GP
* 2;
9773 /* When AdvSIMD instructions are disabled it is not possible to move
9774 a 128-bit value directly between Q registers. This is handled in
9775 secondary reload. A general register is used as a scratch to move
9776 the upper DI value and the lower DI value is moved directly,
9777 hence the cost is the sum of three moves. */
9779 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
9781 return regmove_cost
->FP2FP
;
9784 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
9785 return regmove_cost
->GP2GP
;
9786 else if (from
== GENERAL_REGS
)
9787 return regmove_cost
->GP2FP
;
9788 else if (to
== GENERAL_REGS
)
9789 return regmove_cost
->FP2GP
;
9791 return regmove_cost
->FP2FP
;
9795 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
9796 reg_class_t rclass ATTRIBUTE_UNUSED
,
9797 bool in ATTRIBUTE_UNUSED
)
9799 return aarch64_tune_params
.memmov_cost
;
9802 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9803 to optimize 1.0/sqrt. */
9806 use_rsqrt_p (machine_mode mode
)
9808 return (!flag_trapping_math
9809 && flag_unsafe_math_optimizations
9810 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
9811 & AARCH64_APPROX_MODE (mode
))
9812 || flag_mrecip_low_precision_sqrt
));
9815 /* Function to decide when to use the approximate reciprocal square root
9819 aarch64_builtin_reciprocal (tree fndecl
)
9821 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
9823 if (!use_rsqrt_p (mode
))
9825 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
9828 /* Emit instruction sequence to compute either the approximate square root
9829 or its approximate reciprocal, depending on the flag RECP, and return
9830 whether the sequence was emitted or not. */
9833 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
9835 machine_mode mode
= GET_MODE (dst
);
9837 if (GET_MODE_INNER (mode
) == HFmode
)
9845 if (!(flag_mlow_precision_sqrt
9846 || (aarch64_tune_params
.approx_modes
->sqrt
9847 & AARCH64_APPROX_MODE (mode
))))
9850 if (flag_finite_math_only
9851 || flag_trapping_math
9852 || !flag_unsafe_math_optimizations
9853 || optimize_function_for_size_p (cfun
))
9857 /* Caller assumes we cannot fail. */
9858 gcc_assert (use_rsqrt_p (mode
));
9860 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
9861 rtx xmsk
= gen_reg_rtx (mmsk
);
9863 /* When calculating the approximate square root, compare the
9864 argument with 0.0 and create a mask. */
9865 emit_insn (gen_rtx_SET (xmsk
,
9867 gen_rtx_EQ (mmsk
, src
,
9868 CONST0_RTX (mode
)))));
9870 /* Estimate the approximate reciprocal square root. */
9871 rtx xdst
= gen_reg_rtx (mode
);
9872 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
9874 /* Iterate over the series twice for SF and thrice for DF. */
9875 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
9877 /* Optionally iterate over the series once less for faster performance
9878 while sacrificing the accuracy. */
9879 if ((recp
&& flag_mrecip_low_precision_sqrt
)
9880 || (!recp
&& flag_mlow_precision_sqrt
))
9883 /* Iterate over the series to calculate the approximate reciprocal square
9885 rtx x1
= gen_reg_rtx (mode
);
9886 while (iterations
--)
9888 rtx x2
= gen_reg_rtx (mode
);
9889 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
9891 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
9894 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
9899 /* Qualify the approximate reciprocal square root when the argument is
9900 0.0 by squashing the intermediary result to 0.0. */
9901 rtx xtmp
= gen_reg_rtx (mmsk
);
9902 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
9903 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
9904 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
9906 /* Calculate the approximate square root. */
9907 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
9910 /* Finalize the approximation. */
9911 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
9916 /* Emit the instruction sequence to compute the approximation for the division
9917 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9920 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
9922 machine_mode mode
= GET_MODE (quo
);
9924 if (GET_MODE_INNER (mode
) == HFmode
)
9927 bool use_approx_division_p
= (flag_mlow_precision_div
9928 || (aarch64_tune_params
.approx_modes
->division
9929 & AARCH64_APPROX_MODE (mode
)));
9931 if (!flag_finite_math_only
9932 || flag_trapping_math
9933 || !flag_unsafe_math_optimizations
9934 || optimize_function_for_size_p (cfun
)
9935 || !use_approx_division_p
)
9938 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
9941 /* Estimate the approximate reciprocal. */
9942 rtx xrcp
= gen_reg_rtx (mode
);
9943 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
9945 /* Iterate over the series twice for SF and thrice for DF. */
9946 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
9948 /* Optionally iterate over the series once less for faster performance,
9949 while sacrificing the accuracy. */
9950 if (flag_mlow_precision_div
)
9953 /* Iterate over the series to calculate the approximate reciprocal. */
9954 rtx xtmp
= gen_reg_rtx (mode
);
9955 while (iterations
--)
9957 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
9960 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
9963 if (num
!= CONST1_RTX (mode
))
9965 /* As the approximate reciprocal of DEN is already calculated, only
9966 calculate the approximate division when NUM is not 1.0. */
9967 rtx xnum
= force_reg (mode
, num
);
9968 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
9971 /* Finalize the approximation. */
9972 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
9976 /* Return the number of instructions that can be issued per cycle. */
9978 aarch64_sched_issue_rate (void)
9980 return aarch64_tune_params
.issue_rate
;
9984 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9986 int issue_rate
= aarch64_sched_issue_rate ();
9988 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
9992 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9993 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9994 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9997 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
10000 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
10004 /* Vectorizer cost model target hooks. */
10006 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10008 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
10010 int misalign ATTRIBUTE_UNUSED
)
10013 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
10016 if (vectype
!= NULL
)
10017 fp
= FLOAT_TYPE_P (vectype
);
10019 switch (type_of_cost
)
10022 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
10025 return costs
->scalar_load_cost
;
10028 return costs
->scalar_store_cost
;
10031 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10034 return costs
->vec_align_load_cost
;
10037 return costs
->vec_store_cost
;
10039 case vec_to_scalar
:
10040 return costs
->vec_to_scalar_cost
;
10042 case scalar_to_vec
:
10043 return costs
->scalar_to_vec_cost
;
10045 case unaligned_load
:
10046 case vector_gather_load
:
10047 return costs
->vec_unalign_load_cost
;
10049 case unaligned_store
:
10050 case vector_scatter_store
:
10051 return costs
->vec_unalign_store_cost
;
10053 case cond_branch_taken
:
10054 return costs
->cond_taken_branch_cost
;
10056 case cond_branch_not_taken
:
10057 return costs
->cond_not_taken_branch_cost
;
10060 return costs
->vec_permute_cost
;
10062 case vec_promote_demote
:
10063 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10065 case vec_construct
:
10066 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
10067 return elements
/ 2 + 1;
10070 gcc_unreachable ();
10074 /* Implement targetm.vectorize.add_stmt_cost. */
10076 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
10077 struct _stmt_vec_info
*stmt_info
, int misalign
,
10078 enum vect_cost_model_location where
)
10080 unsigned *cost
= (unsigned *) data
;
10081 unsigned retval
= 0;
10083 if (flag_vect_cost_model
)
10085 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
10087 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
10089 /* Statements in an inner loop relative to the loop being
10090 vectorized are weighted more heavily. The value here is
10091 arbitrary and could potentially be improved with analysis. */
10092 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
10093 count
*= 50; /* FIXME */
10095 retval
= (unsigned) (count
* stmt_cost
);
10096 cost
[where
] += retval
;
10102 static void initialize_aarch64_code_model (struct gcc_options
*);
10104 /* Parse the TO_PARSE string and put the architecture struct that it
10105 selects into RES and the architectural features into ISA_FLAGS.
10106 Return an aarch64_parse_opt_result describing the parse result.
10107 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10109 static enum aarch64_parse_opt_result
10110 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
10111 unsigned long *isa_flags
)
10114 const struct processor
*arch
;
10115 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10118 strcpy (str
, to_parse
);
10120 ext
= strchr (str
, '+');
10125 len
= strlen (str
);
10128 return AARCH64_PARSE_MISSING_ARG
;
10131 /* Loop through the list of supported ARCHes to find a match. */
10132 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
10134 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
10136 unsigned long isa_temp
= arch
->flags
;
10140 /* TO_PARSE string contains at least one extension. */
10141 enum aarch64_parse_opt_result ext_res
10142 = aarch64_parse_extension (ext
, &isa_temp
);
10144 if (ext_res
!= AARCH64_PARSE_OK
)
10147 /* Extension parsing was successful. Confirm the result
10148 arch and ISA flags. */
10150 *isa_flags
= isa_temp
;
10151 return AARCH64_PARSE_OK
;
10155 /* ARCH name not found in list. */
10156 return AARCH64_PARSE_INVALID_ARG
;
10159 /* Parse the TO_PARSE string and put the result tuning in RES and the
10160 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10161 describing the parse result. If there is an error parsing, RES and
10162 ISA_FLAGS are left unchanged. */
10164 static enum aarch64_parse_opt_result
10165 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
10166 unsigned long *isa_flags
)
10169 const struct processor
*cpu
;
10170 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10173 strcpy (str
, to_parse
);
10175 ext
= strchr (str
, '+');
10180 len
= strlen (str
);
10183 return AARCH64_PARSE_MISSING_ARG
;
10186 /* Loop through the list of supported CPUs to find a match. */
10187 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10189 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
10191 unsigned long isa_temp
= cpu
->flags
;
10196 /* TO_PARSE string contains at least one extension. */
10197 enum aarch64_parse_opt_result ext_res
10198 = aarch64_parse_extension (ext
, &isa_temp
);
10200 if (ext_res
!= AARCH64_PARSE_OK
)
10203 /* Extension parsing was successfull. Confirm the result
10204 cpu and ISA flags. */
10206 *isa_flags
= isa_temp
;
10207 return AARCH64_PARSE_OK
;
10211 /* CPU name not found in list. */
10212 return AARCH64_PARSE_INVALID_ARG
;
10215 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10216 Return an aarch64_parse_opt_result describing the parse result.
10217 If the parsing fails the RES does not change. */
10219 static enum aarch64_parse_opt_result
10220 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
10222 const struct processor
*cpu
;
10223 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10225 strcpy (str
, to_parse
);
10227 /* Loop through the list of supported CPUs to find a match. */
10228 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10230 if (strcmp (cpu
->name
, str
) == 0)
10233 return AARCH64_PARSE_OK
;
10237 /* CPU name not found in list. */
10238 return AARCH64_PARSE_INVALID_ARG
;
10241 /* Parse TOKEN, which has length LENGTH to see if it is an option
10242 described in FLAG. If it is, return the index bit for that fusion type.
10243 If not, error (printing OPTION_NAME) and return zero. */
10245 static unsigned int
10246 aarch64_parse_one_option_token (const char *token
,
10248 const struct aarch64_flag_desc
*flag
,
10249 const char *option_name
)
10251 for (; flag
->name
!= NULL
; flag
++)
10253 if (length
== strlen (flag
->name
)
10254 && !strncmp (flag
->name
, token
, length
))
10258 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
10262 /* Parse OPTION which is a comma-separated list of flags to enable.
10263 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10264 default state we inherit from the CPU tuning structures. OPTION_NAME
10265 gives the top-level option we are parsing in the -moverride string,
10266 for use in error messages. */
10268 static unsigned int
10269 aarch64_parse_boolean_options (const char *option
,
10270 const struct aarch64_flag_desc
*flags
,
10271 unsigned int initial_state
,
10272 const char *option_name
)
10274 const char separator
= '.';
10275 const char* specs
= option
;
10276 const char* ntoken
= option
;
10277 unsigned int found_flags
= initial_state
;
10279 while ((ntoken
= strchr (specs
, separator
)))
10281 size_t token_length
= ntoken
- specs
;
10282 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10286 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10287 in the token stream, reset the supported operations. So:
10289 adrp+add.cmp+branch.none.adrp+add
10291 would have the result of turning on only adrp+add fusion. */
10295 found_flags
|= token_ops
;
10299 /* We ended with a comma, print something. */
10302 error ("%s string ill-formed\n", option_name
);
10306 /* We still have one more token to parse. */
10307 size_t token_length
= strlen (specs
);
10308 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10315 found_flags
|= token_ops
;
10316 return found_flags
;
10319 /* Support for overriding instruction fusion. */
10322 aarch64_parse_fuse_string (const char *fuse_string
,
10323 struct tune_params
*tune
)
10325 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
10326 aarch64_fusible_pairs
,
10331 /* Support for overriding other tuning flags. */
10334 aarch64_parse_tune_string (const char *tune_string
,
10335 struct tune_params
*tune
)
10337 tune
->extra_tuning_flags
10338 = aarch64_parse_boolean_options (tune_string
,
10339 aarch64_tuning_flags
,
10340 tune
->extra_tuning_flags
,
10344 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10345 we understand. If it is, extract the option string and handoff to
10346 the appropriate function. */
10349 aarch64_parse_one_override_token (const char* token
,
10351 struct tune_params
*tune
)
10353 const struct aarch64_tuning_override_function
*fn
10354 = aarch64_tuning_override_functions
;
10356 const char *option_part
= strchr (token
, '=');
10359 error ("tuning string missing in option (%s)", token
);
10363 /* Get the length of the option name. */
10364 length
= option_part
- token
;
10365 /* Skip the '=' to get to the option string. */
10368 for (; fn
->name
!= NULL
; fn
++)
10370 if (!strncmp (fn
->name
, token
, length
))
10372 fn
->parse_override (option_part
, tune
);
10377 error ("unknown tuning option (%s)",token
);
10381 /* A checking mechanism for the implementation of the tls size. */
10384 initialize_aarch64_tls_size (struct gcc_options
*opts
)
10386 if (aarch64_tls_size
== 0)
10387 aarch64_tls_size
= 24;
10389 switch (opts
->x_aarch64_cmodel_var
)
10391 case AARCH64_CMODEL_TINY
:
10392 /* Both the default and maximum TLS size allowed under tiny is 1M which
10393 needs two instructions to address, so we clamp the size to 24. */
10394 if (aarch64_tls_size
> 24)
10395 aarch64_tls_size
= 24;
10397 case AARCH64_CMODEL_SMALL
:
10398 /* The maximum TLS size allowed under small is 4G. */
10399 if (aarch64_tls_size
> 32)
10400 aarch64_tls_size
= 32;
10402 case AARCH64_CMODEL_LARGE
:
10403 /* The maximum TLS size allowed under large is 16E.
10404 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10405 if (aarch64_tls_size
> 48)
10406 aarch64_tls_size
= 48;
10409 gcc_unreachable ();
10415 /* Parse STRING looking for options in the format:
10416 string :: option:string
10417 option :: name=substring
10419 substring :: defined by option. */
10422 aarch64_parse_override_string (const char* input_string
,
10423 struct tune_params
* tune
)
10425 const char separator
= ':';
10426 size_t string_length
= strlen (input_string
) + 1;
10427 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
10428 char *string
= string_root
;
10429 strncpy (string
, input_string
, string_length
);
10430 string
[string_length
- 1] = '\0';
10432 char* ntoken
= string
;
10434 while ((ntoken
= strchr (string
, separator
)))
10436 size_t token_length
= ntoken
- string
;
10437 /* Make this substring look like a string. */
10439 aarch64_parse_one_override_token (string
, token_length
, tune
);
10443 /* One last option to parse. */
10444 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
10445 free (string_root
);
10450 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
10452 /* PR 70044: We have to be careful about being called multiple times for the
10453 same function. This means all changes should be repeatable. */
10455 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10456 Disable the frame pointer flag so the mid-end will not use a frame
10457 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10458 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10459 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10460 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
10461 if (opts
->x_flag_omit_frame_pointer
== 0)
10462 opts
->x_flag_omit_frame_pointer
= 2;
10464 /* If not optimizing for size, set the default
10465 alignment to what the target wants. */
10466 if (!opts
->x_optimize_size
)
10468 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
10469 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
10470 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
10471 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
10472 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
10473 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
10476 /* We default to no pc-relative literal loads. */
10478 aarch64_pcrelative_literal_loads
= false;
10480 /* If -mpc-relative-literal-loads is set on the command line, this
10481 implies that the user asked for PC relative literal loads. */
10482 if (opts
->x_pcrelative_literal_loads
== 1)
10483 aarch64_pcrelative_literal_loads
= true;
10485 /* In the tiny memory model it makes no sense to disallow PC relative
10486 literal pool loads. */
10487 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10488 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10489 aarch64_pcrelative_literal_loads
= true;
10491 /* When enabling the lower precision Newton series for the square root, also
10492 enable it for the reciprocal square root, since the latter is an
10493 intermediary step for the former. */
10494 if (flag_mlow_precision_sqrt
)
10495 flag_mrecip_low_precision_sqrt
= true;
10498 /* 'Unpack' up the internal tuning structs and update the options
10499 in OPTS. The caller must have set up selected_tune and selected_arch
10500 as all the other target-specific codegen decisions are
10501 derived from them. */
10504 aarch64_override_options_internal (struct gcc_options
*opts
)
10506 aarch64_tune_flags
= selected_tune
->flags
;
10507 aarch64_tune
= selected_tune
->sched_core
;
10508 /* Make a copy of the tuning parameters attached to the core, which
10509 we may later overwrite. */
10510 aarch64_tune_params
= *(selected_tune
->tune
);
10511 aarch64_architecture_version
= selected_arch
->architecture_version
;
10513 if (opts
->x_aarch64_override_tune_string
)
10514 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
10515 &aarch64_tune_params
);
10517 /* This target defaults to strict volatile bitfields. */
10518 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
10519 opts
->x_flag_strict_volatile_bitfields
= 1;
10521 initialize_aarch64_code_model (opts
);
10522 initialize_aarch64_tls_size (opts
);
10524 int queue_depth
= 0;
10525 switch (aarch64_tune_params
.autoprefetcher_model
)
10527 case tune_params::AUTOPREFETCHER_OFF
:
10530 case tune_params::AUTOPREFETCHER_WEAK
:
10533 case tune_params::AUTOPREFETCHER_STRONG
:
10534 queue_depth
= max_insn_queue_index
+ 1;
10537 gcc_unreachable ();
10540 /* We don't mind passing in global_options_set here as we don't use
10541 the *options_set structs anyway. */
10542 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
10544 opts
->x_param_values
,
10545 global_options_set
.x_param_values
);
10547 /* Set up parameters to be used in prefetching algorithm. Do not
10548 override the defaults unless we are tuning for a core we have
10549 researched values for. */
10550 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
10551 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
10552 aarch64_tune_params
.prefetch
->num_slots
,
10553 opts
->x_param_values
,
10554 global_options_set
.x_param_values
);
10555 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
10556 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
10557 aarch64_tune_params
.prefetch
->l1_cache_size
,
10558 opts
->x_param_values
,
10559 global_options_set
.x_param_values
);
10560 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
10561 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
10562 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
10563 opts
->x_param_values
,
10564 global_options_set
.x_param_values
);
10565 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
10566 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
10567 aarch64_tune_params
.prefetch
->l2_cache_size
,
10568 opts
->x_param_values
,
10569 global_options_set
.x_param_values
);
10570 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
10571 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
10573 opts
->x_param_values
,
10574 global_options_set
.x_param_values
);
10575 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
10576 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
10577 aarch64_tune_params
.prefetch
->minimum_stride
,
10578 opts
->x_param_values
,
10579 global_options_set
.x_param_values
);
10581 /* Use the alternative scheduling-pressure algorithm by default. */
10582 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
10583 opts
->x_param_values
,
10584 global_options_set
.x_param_values
);
10586 /* Enable sw prefetching at specified optimization level for
10587 CPUS that have prefetch. Lower optimization level threshold by 1
10588 when profiling is enabled. */
10589 if (opts
->x_flag_prefetch_loop_arrays
< 0
10590 && !opts
->x_optimize_size
10591 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
10592 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
10593 opts
->x_flag_prefetch_loop_arrays
= 1;
10595 if (opts
->x_aarch64_arch_string
== NULL
)
10596 opts
->x_aarch64_arch_string
= selected_arch
->name
;
10597 if (opts
->x_aarch64_cpu_string
== NULL
)
10598 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
10599 if (opts
->x_aarch64_tune_string
== NULL
)
10600 opts
->x_aarch64_tune_string
= selected_tune
->name
;
10602 aarch64_override_options_after_change_1 (opts
);
10605 /* Print a hint with a suggestion for a core or architecture name that
10606 most closely resembles what the user passed in STR. ARCH is true if
10607 the user is asking for an architecture name. ARCH is false if the user
10608 is asking for a core name. */
10611 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
10613 auto_vec
<const char *> candidates
;
10614 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
10615 for (; entry
->name
!= NULL
; entry
++)
10616 candidates
.safe_push (entry
->name
);
10618 #ifdef HAVE_LOCAL_CPU_DETECT
10619 /* Add also "native" as possible value. */
10621 candidates
.safe_push ("native");
10625 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
10627 inform (input_location
, "valid arguments are: %s;"
10628 " did you mean %qs?", s
, hint
);
10630 inform (input_location
, "valid arguments are: %s", s
);
10635 /* Print a hint with a suggestion for a core name that most closely resembles
10636 what the user passed in STR. */
10639 aarch64_print_hint_for_core (const char *str
)
10641 aarch64_print_hint_for_core_or_arch (str
, false);
10644 /* Print a hint with a suggestion for an architecture name that most closely
10645 resembles what the user passed in STR. */
10648 aarch64_print_hint_for_arch (const char *str
)
10650 aarch64_print_hint_for_core_or_arch (str
, true);
10653 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10654 specified in STR and throw errors if appropriate. Put the results if
10655 they are valid in RES and ISA_FLAGS. Return whether the option is
10659 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
10660 unsigned long *isa_flags
)
10662 enum aarch64_parse_opt_result parse_res
10663 = aarch64_parse_cpu (str
, res
, isa_flags
);
10665 if (parse_res
== AARCH64_PARSE_OK
)
10670 case AARCH64_PARSE_MISSING_ARG
:
10671 error ("missing cpu name in %<-mcpu=%s%>", str
);
10673 case AARCH64_PARSE_INVALID_ARG
:
10674 error ("unknown value %qs for -mcpu", str
);
10675 aarch64_print_hint_for_core (str
);
10677 case AARCH64_PARSE_INVALID_FEATURE
:
10678 error ("invalid feature modifier in %<-mcpu=%s%>", str
);
10681 gcc_unreachable ();
10687 /* Validate a command-line -march option. Parse the arch and extensions
10688 (if any) specified in STR and throw errors if appropriate. Put the
10689 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10690 option is valid. */
10693 aarch64_validate_march (const char *str
, const struct processor
**res
,
10694 unsigned long *isa_flags
)
10696 enum aarch64_parse_opt_result parse_res
10697 = aarch64_parse_arch (str
, res
, isa_flags
);
10699 if (parse_res
== AARCH64_PARSE_OK
)
10704 case AARCH64_PARSE_MISSING_ARG
:
10705 error ("missing arch name in %<-march=%s%>", str
);
10707 case AARCH64_PARSE_INVALID_ARG
:
10708 error ("unknown value %qs for -march", str
);
10709 aarch64_print_hint_for_arch (str
);
10711 case AARCH64_PARSE_INVALID_FEATURE
:
10712 error ("invalid feature modifier in %<-march=%s%>", str
);
10715 gcc_unreachable ();
/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mtune=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for -mtune", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option
     in config.gcc.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}
/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
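/* Worked example (illustrative note, not part of the original source):
   for -msve-vector-bits=256 the enum value is 256, so the function
   returns 256 / 64 = 4, i.e. four 64-bit granules per SVE vector, and
   -msve-vector-bits=512 likewise yields a VG of 8.  SVE_SCALABLE and
   SVE_128 instead return the length-agnostic poly_uint16 (2, 2).  */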
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  unsigned long cpu_isa = 0;
  unsigned long arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */
  if (!selected_cpu)
    {
      if (selected_arch)
	{
	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;
	}
      else
	{
	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
	}

      if (selected_tune)
	explicit_tune_core = selected_tune->ident;
    }
  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
    {
      if (selected_arch->arch != selected_cpu->arch)
	{
	  warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);
	}
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
    }
  else
    {
      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;
    }

  /* Set the arch as well as we will need it when outputing
     the .arch directive in assembly.  */
  if (!selected_arch)
    {
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
    }

  if (!selected_tune)
    selected_tune = selected_cpu;

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support -mabi=ilp32");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for -mabi=lp64");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
}
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}
static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
/* A checking mechanism for the implementation of the various code models.  */
static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with -f%s", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
}
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  aarch64_override_options_internal (opts);
}
/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  unsigned long isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}
/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  const struct processor *tmp_arch = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_arch);
      selected_arch = tmp_arch;
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"arch=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  const struct processor *tmp_cpu = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_cpu);
      selected_tune = tmp_cpu;
      explicit_tune_core = selected_tune->ident;

      selected_arch = &all_architectures[tmp_cpu->arch];
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  const struct processor *tmp_tune = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_tune);
      selected_tune = tmp_tune;
      explicit_tune_core = selected_tune->ident;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   in the new flag string.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  unsigned long isa_flags = aarch64_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  parse_res = aarch64_parse_extension (str, &isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_isa_flags = isa_flags;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing value in %<target()%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
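/* Usage sketch (illustrative note, not part of the original source); the
   table above makes attribute forms such as the following valid:

     __attribute__ ((target ("arch=armv8.2-a+fp16"))) void f (void);
     __attribute__ ((target ("no-strict-align,cmodel=small"))) void g (void);
     __attribute__ ((target ("+crc"))) void h (void);

   The first two are matched against aarch64_attributes by
   aarch64_process_one_target_attr; the bare "+crc" form is routed to
   aarch64_handle_attr_isa_flags instead.  */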
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* Skip leading whitespace.  */
  while (*str_to_check == ' ' || *str_to_check == '\t')
    str_to_check++;

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			     || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	case aarch64_attr_custom:
	  gcc_assert (p_attr->handler);
	  if (!p_attr->handler (arg))
	    return false;
	  break;

	/* Either set or unset a boolean option.  */
	case aarch64_attr_bool:
	  {
	    struct cl_decoded_option decoded;

	    generate_option (p_attr->opt_num, NULL, !invert,
			     CL_TARGET, &decoded);
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Set or unset a bit in the target_flags.  aarch64_handle_option
	   should know what mask to apply given the option number.  */
	case aarch64_attr_mask:
	  {
	    struct cl_decoded_option decoded;
	    /* We only need to specify the option number.
	       aarch64_handle_option will know which mask to apply.  */
	    decoded.opt_index = p_attr->opt_num;
	    decoded.value = !invert;
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Use the option setting machinery to set an option to an enum.  */
	case aarch64_attr_enum:
	  {
	    gcc_assert (arg);
	    bool valid;
	    int value;
	    valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					   &value, CL_TARGET);
	    if (valid)
	      set_option (&global_options, NULL, p_attr->opt_num, value,
			  NULL, DK_UNSPECIFIED, input_location,
			  global_dc);
	    else
	      error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
	    break;
	  }
	default:
	  gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok (str_to_check, ",");
  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok (NULL, ",");
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
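/* Illustrative note (not part of the original source): for
   target ("fix-cortex-a53-835769,omit-leaf-frame-pointer") the string has
   one comma and strtok yields two tokens, so num_attrs == num_commas + 1
   holds; "attr1,,attr2" also yields two tokens but contains two commas,
   so it is rejected as malformed.  */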
/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
   process attribute ((target ("..."))).  */

static bool
aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree old_optimize;
  tree new_target, new_optimize;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
      return true;
    }
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  old_optimize = build_optimization_node (&global_options);
  func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  /* If the function changed the optimization levels as well as setting
     target options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (func_optimize));

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
	= TREE_TARGET_OPTION (existing_target);

      if (existing_options)
	cl_target_option_restore (&global_options, existing_options);
    }
  else
    cl_target_option_restore (&global_options,
			      TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_attr (args);

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      /* Initialize SIMD builtins if we haven't already.
	 Set current_target_pragma to NULL for the duration so that
	 the builtin initialization code doesn't try to tag the functions
	 being built with the attributes specified by any current pragma, thus
	 going into an infinite recursion.  */
      if (TARGET_SIMD)
	{
	  tree saved_current_target_pragma = current_target_pragma;
	  current_target_pragma = NULL;
	  aarch64_init_simd_builtins ();
	  current_target_pragma = saved_current_target_pragma;
	}
      new_target = build_target_option_node (&global_options);
    }
  else
    new_target = NULL;

  new_optimize = build_optimization_node (&global_options);

  if (fndecl && ret)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  cl_target_option_restore (&global_options, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (old_optimize));
  return ret;
}
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
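/* Illustrative note (not part of the original source): with the errata
   workarounds below, DONT_CARE is 2.  A callee that leaves the option
   unset (0) can be inlined into a caller that sets it (1) only when 0 is
   the default DEF; conversely, a callee that explicitly enables the
   workaround (1) is rejected by a caller that disables it (0) unless 1
   is the default.  */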
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  struct cl_target_option *caller_opts
    = TREE_TARGET_OPTION (caller_tree ? caller_tree
				      : target_option_default_node);

  struct cl_target_option *callee_opts
    = TREE_TARGET_OPTION (callee_tree ? callee_tree
				      : target_option_default_node);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
      != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					 DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressible range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M and anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, -1048575, 1048575))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here is
	     4G.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
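/* Illustrative note (not part of the original source): under
   -mcmodel=small, an access such as "extern char arr[]; ... arr +
   0x200000000" exceeds the roughly +/-4G offset window checked above and
   is classified as SYMBOL_FORCE_TO_MEM, i.e. materialised via the
   constant pool, whereas "arr + 16" stays SYMBOL_SMALL_ABSOLUTE (assuming
   arr is not an unresolved weak reference).  */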
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
	  && GET_CODE (XEXP (x, 0)) == PLUS
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || GET_CODE (x) == CONST_VECTOR)
    return true;

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible to handle rematerialization of other
     constants via secondary reloads.  */
  if (vec_flags & VEC_ANY_SVE)
    return aarch64_simd_valid_immediate (x, NULL);

  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (GET_CODE (x) == LABEL_REF)
    return true;

  return false;
}
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}

/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating va_list internal
     offset fields in an irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);

  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */

static tree
aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
			      gimple_seq *post_p ATTRIBUTE_UNUSED)
{
  tree addr;
  bool indirect_p;
  bool is_ha;		/* is HFA or HVA.  */
  bool dw_align;	/* double-word align.  */
  machine_mode ag_mode = VOIDmode;
  int nregs;
  machine_mode mode;

  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, f_top, f_off, off, arg, roundup, on_stack;
  HOST_WIDE_INT size, rsize, adjust, align;
  tree t, u, cond1, cond2;

  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
  if (indirect_p)
    type = build_pointer_type (type);

  mode = TYPE_MODE (type);

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
		  f_stack, NULL_TREE);
  size = int_size_in_bytes (type);
  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;

  dw_align = false;
  adjust = 0;
  if (aarch64_vfp_is_call_or_return_candidate (mode,
					       type,
					       &ag_mode,
					       &nregs,
					       &is_ha))
    {
      /* No frontends can create types with variable-sized modes, so we
	 shouldn't be asked to pass or return them.  */
      unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();

      /* TYPE passed in fp/simd registers.  */
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode);

      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
		      unshare_expr (valist), f_vrtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
		      unshare_expr (valist), f_vroff, NULL_TREE);

      rsize = nregs * UNITS_PER_VREG;

      if (is_ha)
	{
	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
	    adjust = UNITS_PER_VREG - ag_size;
	}
      else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	       && size < UNITS_PER_VREG)
	{
	  adjust = UNITS_PER_VREG - size;
	}
    }
  else
    {
      /* TYPE passed in general registers.  */
      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
		      unshare_expr (valist), f_grtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
		      unshare_expr (valist), f_groff, NULL_TREE);
      rsize = ROUND_UP (size, UNITS_PER_WORD);
      nregs = rsize / UNITS_PER_WORD;

      if (align > 8)
	dw_align = true;

      if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	  && size < UNITS_PER_WORD)
	{
	  adjust = UNITS_PER_WORD - size;
	}
    }

  /* Get a local temporary for the field value.  */
  off = get_initialized_tmp_var (f_off, pre_p, NULL);

  /* Emit code to branch if off >= 0.  */
  t = build2 (GE_EXPR, boolean_type_node, off,
	      build_int_cst (TREE_TYPE (off), 0));
  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);

  if (dw_align)
    {
      /* Emit: offs = (offs + 15) & -16.  */
      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
		  build_int_cst (TREE_TYPE (off), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
		  build_int_cst (TREE_TYPE (off), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
    }
  else
    roundup = NULL;

  /* Update ap.__[g|v]r_offs  */
  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
	      build_int_cst (TREE_TYPE (off), rsize));
  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);

  /* String up.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);

  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
	      build_int_cst (TREE_TYPE (f_off), 0));
  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);

  /* String up: make sure the assignment happens before the use.  */
  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
  COND_EXPR_ELSE (cond1) = t;

  /* Prepare the trees handling the argument that is passed on the stack;
     the top level node will store in ON_STACK.  */
  arg = get_initialized_tmp_var (stack, pre_p, NULL);
  if (align > 8)
    {
      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
      t = fold_build_pointer_plus_hwi (arg, 15);
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
    }
  else
    roundup = NULL;
  /* Advance ap.__stack  */
  t = fold_build_pointer_plus_hwi (arg, size + 7);
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), -8));
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
  /* String up roundup and advance.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg */
  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
  /* Big-endianness related address adjustment.  */
  if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
      && size < UNITS_PER_WORD)
    {
      t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
		  size_int (UNITS_PER_WORD - size));
      on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
    }

  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);

  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
  t = off;
  if (adjust)
    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
		build_int_cst (TREE_TYPE (off), adjust));

  t = fold_convert (sizetype, t);
  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);

  if (is_ha)
    {
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i <nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
      int i;
      tree tmp_ha, field_t, field_ptr_t;

      /* Declare a local variable.  */
      tmp_ha = create_tmp_var_raw (type, "ha");
      gimple_add_tmp_var (tmp_ha);

      /* Establish the base type.  */
      switch (ag_mode)
	{
	case E_SFmode:
	  field_t = float_type_node;
	  field_ptr_t = float_ptr_type_node;
	  break;
	case E_DFmode:
	  field_t = double_type_node;
	  field_ptr_t = double_ptr_type_node;
	  break;
	case E_TFmode:
	  field_t = long_double_type_node;
	  field_ptr_t = long_double_ptr_type_node;
	  break;
	case E_HFmode:
	  field_t = aarch64_fp16_type_node;
	  field_ptr_t = aarch64_fp16_ptr_type_node;
	  break;
	case E_V2SImode:
	case E_V4SImode:
	  {
	    tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
	    field_t = build_vector_type_for_mode (innertype, ag_mode);
	    field_ptr_t = build_pointer_type (field_t);
	  }
	  break;
	default:
	  gcc_assert (0);
	}

      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
      addr = t;
      t = fold_convert (field_ptr_t, addr);
      t = build2 (MODIFY_EXPR, field_t,
		  build1 (INDIRECT_REF, field_t, tmp_ha),
		  build1 (INDIRECT_REF, field_t, t));

      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
      for (i = 1; i < nregs; ++i)
	{
	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
	  u = fold_convert (field_ptr_t, addr);
	  u = build2 (MODIFY_EXPR, field_t,
		      build2 (MEM_REF, field_t, tmp_ha,
			      build_int_cst (field_ptr_t,
					     (i *
					      int_size_in_bytes (field_t)))),
		      build1 (INDIRECT_REF, field_t, u));
	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
	}

      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
    }

  COND_EXPR_ELSE (cond2) = t;
  addr = fold_convert (build_pointer_type (type), cond1);
  addr = build_va_arg_indirect_ref (addr);

  if (indirect_p)
    addr = build_va_arg_indirect_ref (addr);

  return addr;
}
/* Implement TARGET_SETUP_INCOMING_VARARGS.  */

static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
				tree type, int *pretend_size ATTRIBUTE_UNUSED,
				int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);

  /* Found out how many registers we need to save.
     Honor tree-stdvar analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
		    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
		    cfun->va_list_fpr_size / UNITS_PER_VREG);

  if (!TARGET_FLOAT)
    {
      gcc_assert (local_cum.aapcs_nvrn == 0);
      vr_saved = 0;
    }

  if (!no_rtl)
    {
      if (gr_saved > 0)
	{
	  rtx ptr, mem;

	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
			       - gr_saved * UNITS_PER_WORD);
	  mem = gen_frame_mem (BLKmode, ptr);
	  set_mem_alias_set (mem, get_varargs_alias_set ());

	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
			       mem, gr_saved);
	}
      if (vr_saved > 0)
	{
	  /* We can't use move_block_from_reg, because it will use
	     the wrong mode, storing D regs only.  */
	  machine_mode mode = TImode;
	  int off, i, vr_start;

	  /* Set OFF to the offset from virtual_incoming_args_rtx of
	     the first vector register.  The VR save area lies below
	     the GR one, and is aligned to 16 bytes.  */
	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
			   STACK_BOUNDARY / BITS_PER_UNIT);
	  off -= vr_saved * UNITS_PER_VREG;

	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
	  for (i = 0; i < vr_saved; ++i)
	    {
	      rtx ptr, mem;

	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
	      mem = gen_frame_mem (mode, ptr);
	      set_mem_alias_set (mem, get_varargs_alias_set ());
	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
	      off += UNITS_PER_VREG;
	    }
	}
    }

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
		 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
}
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_SIMD)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (eg pointer
     authentication).  */
  if (aarch64_track_speculation)
    {
      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
    }
}
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */

static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
  machine_mode mode;
  HOST_WIDE_INT size;

  switch (TREE_CODE (type))
    {
    case REAL_TYPE:
      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 1;

      break;

    case COMPLEX_TYPE:
      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 2;

      break;

    case VECTOR_TYPE:
      /* Use V2SImode and V4SImode as representatives of all 64-bit
	 and 128-bit vector types.  */
      size = int_size_in_bytes (type);
      switch (size)
	{
	case 8:
	  mode = V2SImode;
	  break;
	case 16:
	  mode = V4SImode;
	  break;
	default:
	  return -1;
	}

      if (*modep == VOIDmode)
	*modep = mode;

      /* Vector modes are considered to be opaque: two vectors are
	 equivalent for the purposes of being homogeneous aggregates
	 if they are the same size.  */
      if (*modep == mode)
	return 1;

      break;

    case ARRAY_TYPE:
      {
	int count;
	tree index = TYPE_DOMAIN (type);

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
	if (count == -1
	    || !index
	    || !TYPE_MAX_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
	    || !TYPE_MIN_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
	    || count < 0)
	  return -1;

	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case RECORD_TYPE:
      {
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count += sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case UNION_TYPE:
    case QUAL_UNION_TYPE:
      {
	/* These aren't very interesting except in a degenerate case.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count = count > sub_count ? count : sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    default:
      break;
    }

  return -1;
}
12662 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12663 type as described in AAPCS64 \S 4.1.2.
12665 See the comment above aarch64_composite_type_p for the notes on MODE. */
12668 aarch64_short_vector_p (const_tree type
,
12671 poly_int64 size
= -1;
12673 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
12674 size
= int_size_in_bytes (type
);
12675 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
12676 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
12677 size
= GET_MODE_SIZE (mode
);
12679 return known_eq (size
, 8) || known_eq (size
, 16);
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
                          machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
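/* For instance, struct { float x, y; } and _Complex double are composite
   types under the tests above, while a lone double (scalar MODE_FLOAT with
   no aggregate TYPE) is not.  */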
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
                                         const_tree type,
                                         machine_mode *base_mode,
                                         int *count,
                                         bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
        {
          if (is_ha != NULL) *is_ha = true;
          *count = ag_count;
        }
      else
        return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
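/* Example: for struct { double d[3]; }, aapcs_vfp_sub_candidate should
   return 3 with *BASE_MODE set to DFmode, so the argument is treated as a
   homogeneous floating-point aggregate occupying three FP registers
   (at most HA_MAX_NUM_FLDS members are allowed).  */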
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
                          int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}

/* Implements target hook vector_mode_supported_p.  */

static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
12786 /* Return appropriate SIMD container
12787 for MODE within a vector of WIDTH bits. */
12788 static machine_mode
12789 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
12791 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
12807 return VNx16QImode
;
12812 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
12815 if (known_eq (width
, 128))
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);
}

/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
static void
aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
{
  if (TARGET_SVE)
    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
}
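/* The vectorizer therefore tries, in order, the SVE vector length in bytes
   (when SVE is enabled) and then the 16-byte (128-bit) and 8-byte (64-bit)
   Advanced SIMD sizes.  */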
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
12897 /* Find the first rtx_insn before insn that will generate an assembly
12901 aarch64_prev_real_insn (rtx_insn
*insn
)
12908 insn
= prev_real_insn (insn
);
12910 while (insn
&& recog_memoized (insn
) < 0);
12916 is_madd_op (enum attr_type t1
)
12919 /* A number of these may be AArch32 only. */
12920 enum attr_type mlatypes
[] = {
12921 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
12922 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
12923 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
12926 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
12928 if (t1
== mlatypes
[i
])
12935 /* Check if there is a register dependency between a load and the insn
12936 for which we hold recog_data. */
12939 dep_between_memop_and_curr (rtx memop
)
12944 gcc_assert (GET_CODE (memop
) == SET
);
12946 if (!REG_P (SET_DEST (memop
)))
12949 load_reg
= SET_DEST (memop
);
12950 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
12952 rtx operand
= recog_data
.operand
[opno
];
12953 if (REG_P (operand
)
12954 && reg_overlap_mentioned_p (load_reg
, operand
))
12962 /* When working around the Cortex-A53 erratum 835769,
12963 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12964 instruction and has a preceding memory instruction such that a NOP
12965 should be inserted between them. */
12968 aarch64_madd_needs_nop (rtx_insn
* insn
)
12970 enum attr_type attr_type
;
12974 if (!TARGET_FIX_ERR_A53_835769
)
12977 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
12980 attr_type
= get_attr_type (insn
);
12981 if (!is_madd_op (attr_type
))
12984 prev
= aarch64_prev_real_insn (insn
);
12985 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12986 Restore recog state to INSN to avoid state corruption. */
12987 extract_constrain_insn_cached (insn
);
12989 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
12992 body
= single_set (prev
);
12994 /* If the previous insn is a memory op and there is no dependency between
12995 it and the DImode madd, emit a NOP between them. If body is NULL then we
12996 have a complex memory operation, probably a load/store pair.
12997 Be conservative for now and emit a NOP. */
12998 if (GET_MODE (recog_data
.operand
[0]) == DImode
12999 && (!body
|| !dep_between_memop_and_curr (body
)))
13007 /* Implement FINAL_PRESCAN_INSN. */
13010 aarch64_final_prescan_insn (rtx_insn
*insn
)
13012 if (aarch64_madd_needs_nop (insn
))
13013 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

static bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
          && IN_RANGE (INTVAL (base_or_step), -16, 15));
}

/* Return true if X is a valid immediate for the SVE ADD and SUB
   instructions.  Negate X first if NEGATE_P is true.  */

static bool
aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (negate_p)
    val = -val;
  val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));

  if (val & 0xff)
    return IN_RANGE (val, 0, 0xff);
  return IN_RANGE (val, 0, 0xff00);
}
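/* Examples: a vector duplicate of 0x23 is accepted as an unshifted 8-bit
   immediate, and a duplicate of 0x2300 is accepted as a shifted 8-bit
   immediate, but 0x123 is rejected because it fits neither the low byte
   range nor the shifted-byte range.  */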
/* Return true if X is a valid immediate operand for an SVE logical
   instruction such as AND.  */

static bool
aarch64_sve_bitmask_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && aarch64_bitmask_imm (INTVAL (elt),
                                  GET_MODE_INNER (GET_MODE (x))));
}

/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */

static bool
aarch64_sve_dup_immediate_p (rtx x)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (val & 0xff)
    return IN_RANGE (val, -0x80, 0x7f);
  return IN_RANGE (val, -0x8000, 0x7f00);
}
/* Return true if X is a valid immediate operand for an SVE CMP instruction.
   SIGNED_P says whether the operand is signed rather than unsigned.  */

static bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && (signed_p
              ? IN_RANGE (INTVAL (elt), -16, 15)
              : IN_RANGE (INTVAL (elt), 0, 127)));
}
/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
   instruction.  Negate X first if NEGATE_P is true.  */

static bool
aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;
  REAL_VALUE_TYPE r;

  if (!const_vec_duplicate_p (x, &elt)
      || GET_CODE (elt) != CONST_DOUBLE)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (elt);

  if (negate_p)
    r = real_value_negate (&r);

  if (real_equal (&r, &dconst1))
    return true;
  if (real_equal (&r, &dconsthalf))
    return true;
  return false;
}

/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */

static bool
aarch64_sve_float_mul_immediate_p (rtx x)
{
  rtx elt;

  /* GCC will never generate a multiply with an immediate of 2, so there is no
     point testing for it (even though it is a valid constant).  */
  return (const_vec_duplicate_p (x, &elt)
          && GET_CODE (elt) == CONST_DOUBLE
          && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
}
13136 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13137 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13138 is nonnull, use it to describe valid immediates. */
13140 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
13141 simd_immediate_info
*info
,
13142 enum simd_immediate_check which
,
13143 simd_immediate_info::insn_type insn
)
13145 /* Try a 4-byte immediate with LSL. */
13146 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
13147 if ((val32
& (0xff << shift
)) == val32
)
13150 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13151 simd_immediate_info::LSL
, shift
);
13155 /* Try a 2-byte immediate with LSL. */
13156 unsigned int imm16
= val32
& 0xffff;
13157 if (imm16
== (val32
>> 16))
13158 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
13159 if ((imm16
& (0xff << shift
)) == imm16
)
13162 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
13163 simd_immediate_info::LSL
, shift
);
13167 /* Try a 4-byte immediate with MSL, except for cases that MVN
13169 if (which
== AARCH64_CHECK_MOV
)
13170 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
13172 unsigned int low
= (1 << shift
) - 1;
13173 if (((val32
& (0xff << shift
)) | low
) == val32
)
13176 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13177 simd_immediate_info::MSL
, shift
);
13185 /* Return true if replicating VAL64 is a valid immediate for the
13186 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13187 use it to describe valid immediates. */
13189 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
13190 simd_immediate_info
*info
,
13191 enum simd_immediate_check which
)
13193 unsigned int val32
= val64
& 0xffffffff;
13194 unsigned int val16
= val64
& 0xffff;
13195 unsigned int val8
= val64
& 0xff;
13197 if (val32
== (val64
>> 32))
13199 if ((which
& AARCH64_CHECK_ORR
) != 0
13200 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
13201 simd_immediate_info::MOV
))
13204 if ((which
& AARCH64_CHECK_BIC
) != 0
13205 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
13206 simd_immediate_info::MVN
))
13209 /* Try using a replicated byte. */
13210 if (which
== AARCH64_CHECK_MOV
13211 && val16
== (val32
>> 16)
13212 && val8
== (val16
>> 8))
13215 *info
= simd_immediate_info (QImode
, val8
);
13220 /* Try using a bit-to-bytemask. */
13221 if (which
== AARCH64_CHECK_MOV
)
13224 for (i
= 0; i
< 64; i
+= 8)
13226 unsigned char byte
= (val64
>> i
) & 0xff;
13227 if (byte
!= 0 && byte
!= 0xff)
13233 *info
= simd_immediate_info (DImode
, val64
);
13240 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13241 instruction. If INFO is nonnull, use it to describe valid immediates. */
13244 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
13245 simd_immediate_info
*info
)
13247 scalar_int_mode mode
= DImode
;
13248 unsigned int val32
= val64
& 0xffffffff;
13249 if (val32
== (val64
>> 32))
13252 unsigned int val16
= val32
& 0xffff;
13253 if (val16
== (val32
>> 16))
13256 unsigned int val8
= val16
& 0xff;
13257 if (val8
== (val16
>> 8))
13261 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
13262 if (IN_RANGE (val
, -0x80, 0x7f))
13264 /* DUP with no shift. */
13266 *info
= simd_immediate_info (mode
, val
);
13269 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
13271 /* DUP with LSL #8. */
13273 *info
= simd_immediate_info (mode
, val
);
13276 if (aarch64_bitmask_imm (val64
, mode
))
13280 *info
= simd_immediate_info (mode
, val
);
13286 /* Return true if OP is a valid SIMD immediate for the operation
13287 described by WHICH. If INFO is nonnull, use it to describe valid
13290 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
13291 enum simd_immediate_check which
)
13293 machine_mode mode
= GET_MODE (op
);
13294 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13295 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13298 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
13300 unsigned int n_elts
;
13301 if (GET_CODE (op
) == CONST_VECTOR
13302 && CONST_VECTOR_DUPLICATE_P (op
))
13303 n_elts
= CONST_VECTOR_NPATTERNS (op
);
13304 else if ((vec_flags
& VEC_SVE_DATA
)
13305 && const_vec_series_p (op
, &base
, &step
))
13307 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
13308 if (!aarch64_sve_index_immediate_p (base
)
13309 || !aarch64_sve_index_immediate_p (step
))
13313 *info
= simd_immediate_info (elt_mode
, base
, step
);
13316 else if (GET_CODE (op
) == CONST_VECTOR
13317 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
13318 /* N_ELTS set above. */;
13322 /* Handle PFALSE and PTRUE. */
13323 if (vec_flags
& VEC_SVE_PRED
)
13324 return (op
== CONST0_RTX (mode
)
13325 || op
== CONSTM1_RTX (mode
));
13327 scalar_float_mode elt_float_mode
;
13329 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
13331 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
13332 if (aarch64_float_const_zero_rtx_p (elt
)
13333 || aarch64_float_const_representable_p (elt
))
13336 *info
= simd_immediate_info (elt_float_mode
, elt
);
13341 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
13345 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
13347 /* Expand the vector constant out into a byte vector, with the least
13348 significant byte of the register first. */
13349 auto_vec
<unsigned char, 16> bytes
;
13350 bytes
.reserve (n_elts
* elt_size
);
13351 for (unsigned int i
= 0; i
< n_elts
; i
++)
13353 /* The vector is provided in gcc endian-neutral fashion.
13354 For aarch64_be Advanced SIMD, it must be laid out in the vector
13355 register in reverse order. */
13356 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
13357 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
13359 if (elt_mode
!= elt_int_mode
)
13360 elt
= gen_lowpart (elt_int_mode
, elt
);
13362 if (!CONST_INT_P (elt
))
13365 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
13366 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
13368 bytes
.quick_push (elt_val
& 0xff);
13369 elt_val
>>= BITS_PER_UNIT
;
13373 /* The immediate must repeat every eight bytes. */
13374 unsigned int nbytes
= bytes
.length ();
13375 for (unsigned i
= 8; i
< nbytes
; ++i
)
13376 if (bytes
[i
] != bytes
[i
- 8])
13379 /* Get the repeating 8-byte value as an integer. No endian correction
13380 is needed here because bytes is already in lsb-first order. */
13381 unsigned HOST_WIDE_INT val64
= 0;
13382 for (unsigned int i
= 0; i
< 8; i
++)
13383 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
13384 << (i
* BITS_PER_UNIT
));
13386 if (vec_flags
& VEC_SVE_DATA
)
13387 return aarch64_sve_valid_immediate (val64
, info
);
13389 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
13392 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13393 has a step in the range of INDEX. Return the index expression if so,
13394 otherwise return null. */
13396 aarch64_check_zero_based_sve_index_immediate (rtx x
)
13399 if (const_vec_series_p (x
, &base
, &step
)
13400 && base
== const0_rtx
13401 && aarch64_sve_index_immediate_p (step
))
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}

/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
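/* For example, WIDTH = 8 and POS = 4 yield ((1 << 8) - 1) << 4 = 0xff0,
   selecting bits 4..11 of the source.  */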
13432 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
13434 if (GET_CODE (x
) == HIGH
13435 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
13438 if (CONST_INT_P (x
))
13441 if (VECTOR_MODE_P (GET_MODE (x
)))
13442 return aarch64_simd_valid_immediate (x
, NULL
);
13444 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
13447 if (aarch64_sve_cnt_immediate_p (x
))
13450 return aarch64_classify_symbolic_expression (x
)
13451 == SYMBOL_TINY_ABSOLUTE
;
/* Return a const_int vector of VAL.  */

rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}

/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
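/* In other words, a scalar immediate is valid for MOVI exactly when its
   value, broadcast across a 64-bit Advanced SIMD container of the same
   element size, forms a valid SIMD move immediate.  */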
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target, must be described using
   a mask selecting GCC high-lanes.

                 Big-Endian              Little-Endian

GCC              0   1   2   3           3   2   1   0
               | x | x | x | x |       | x | x | x | x |
Architecture     3   2   1   0           3   2   1   0

Low Mask:          { 2, 3 }                { 0, 1 }
High Mask:         { 0, 1 }                { 2, 3 }

   MODE is the mode of the vector and NUNITS is the number of units in it.  */

rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
{
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
13517 /* Check OP for validity as a PARALLEL RTX vector with elements
13518 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13519 from the perspective of the architecture. See the diagram above
13520 aarch64_simd_vect_par_cnst_half for more details. */
13523 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
13527 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
13530 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
13531 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
13532 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
13535 if (count_op
!= count_ideal
)
13538 for (i
= 0; i
< count_ideal
; i
++)
13540 rtx elt_op
= XVECEXP (op
, 0, i
);
13541 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
13543 if (!CONST_INT_P (elt_op
)
13544 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  */
void
aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
                          const_tree exp)
{
  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)
    {
      if (exp)
        error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
      else
        error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
    }
}
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */
rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}

/* Return TRUE if OP is a valid vector addressing mode.  */
bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
                        || REG_P (XEXP (op, 0)));
}
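/* For example, on a big-endian target lane 0 of a V4SImode vector maps to
   architectural lane 3, assuming ENDIAN_LANE_N reverses the lane index for
   big-endian and is the identity for little-endian.  */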
/* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
bool
aarch64_sve_ld1r_operand_p (rtx op)
{
  struct aarch64_address_info addr;
  scalar_mode mode;

  return (MEM_P (op)
          && is_a <scalar_mode> (GET_MODE (op), &mode)
          && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
          && addr.type == ADDRESS_REG_IMM
          && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
}

/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
   The conditions for STR are the same.  */
bool
aarch64_sve_ldr_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
          && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
                                       false, ADDR_QUERY_ANY)
          && addr.type == ADDRESS_REG_IMM);
}
13615 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13616 We need to be able to access the individual pieces, so the range
13617 is different from LD[234] and ST[234]. */
13619 aarch64_sve_struct_memory_operand_p (rtx op
)
13624 machine_mode mode
= GET_MODE (op
);
13625 struct aarch64_address_info addr
;
13626 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
13628 || addr
.type
!= ADDRESS_REG_IMM
)
13631 poly_int64 first
= addr
.const_offset
;
13632 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
13633 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
13634 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
13637 /* Emit a register copy from operand to operand, taking care not to
13638 early-clobber source registers in the process.
13640 COUNT is the number of components into which the copy needs to be
13643 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
13644 unsigned int count
)
13647 int rdest
= REGNO (operands
[0]);
13648 int rsrc
= REGNO (operands
[1]);
13650 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
13652 for (i
= 0; i
< count
; i
++)
13653 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
13654 gen_rtx_REG (mode
, rsrc
+ i
));
13656 for (i
= 0; i
< count
; i
++)
13657 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
13658 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}

/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
    /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
       be set for non-predicate vectors of booleans.  Modes are the most
       direct way we have of identifying real SVE predicate types.  */
    return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
  return MIN (align, 128);
}
13685 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13686 static HOST_WIDE_INT
13687 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
13689 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
13691 /* If the length of the vector is fixed, try to align to that length,
13692 otherwise don't try to align at all. */
13693 HOST_WIDE_INT result
;
13694 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
13695 result
= TYPE_ALIGN (TREE_TYPE (type
));
13698 return TYPE_ALIGN (type
);
13701 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13703 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
13708 /* For fixed-length vectors, check that the vectorizer will aim for
13709 full-vector alignment. This isn't true for generic GCC vectors
13710 that are wider than the ABI maximum of 128 bits. */
13711 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
13712 && (wi::to_widest (TYPE_SIZE (type
))
13713 != aarch64_vectorize_preferred_vector_alignment (type
)))
13716 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13720 /* Return true if the vector misalignment factor is supported by the
13723 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
13724 const_tree type
, int misalignment
,
13727 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
13729 /* Return if movmisalign pattern is not supported for this mode. */
13730 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
13733 /* Misalignment factor is unknown at compile time. */
13734 if (misalignment
== -1)
13737 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
13741 /* If VALS is a vector constant that can be loaded into a register
13742 using DUP, generate instructions to do so and return an RTX to
13743 assign to the register. Otherwise return NULL_RTX. */
13745 aarch64_simd_dup_constant (rtx vals
)
13747 machine_mode mode
= GET_MODE (vals
);
13748 machine_mode inner_mode
= GET_MODE_INNER (mode
);
13751 if (!const_vec_duplicate_p (vals
, &x
))
13754 /* We can load this constant by using DUP and a constant in a
13755 single ARM register. This will be cheaper than a vector
13757 x
= copy_to_mode_reg (inner_mode
, x
);
13758 return gen_vec_duplicate (mode
, x
);
13762 /* Generate code to load VALS, which is a PARALLEL containing only
13763 constants (for vec_init) or CONST_VECTOR, efficiently into a
13764 register. Returns an RTX to copy into the register, or NULL_RTX
13765 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13767 aarch64_simd_make_constant (rtx vals
)
13769 machine_mode mode
= GET_MODE (vals
);
13771 rtx const_vec
= NULL_RTX
;
13775 if (GET_CODE (vals
) == CONST_VECTOR
)
13777 else if (GET_CODE (vals
) == PARALLEL
)
13779 /* A CONST_VECTOR must contain only CONST_INTs and
13780 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13781 Only store valid constants in a CONST_VECTOR. */
13782 int n_elts
= XVECLEN (vals
, 0);
13783 for (i
= 0; i
< n_elts
; ++i
)
13785 rtx x
= XVECEXP (vals
, 0, i
);
13786 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13789 if (n_const
== n_elts
)
13790 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
13793 gcc_unreachable ();
13795 if (const_vec
!= NULL_RTX
13796 && aarch64_simd_valid_immediate (const_vec
, NULL
))
13797 /* Load using MOVI/MVNI. */
13799 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
13800 /* Loaded using DUP. */
13802 else if (const_vec
!= NULL_RTX
)
13803 /* Load from constant pool. We can not take advantage of single-cycle
13804 LD1 because we need a PC-relative addressing mode. */
13807 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13808 We can not construct an initializer. */
13812 /* Expand a vector initialisation sequence, such that TARGET is
13813 initialised to contain VALS. */
13816 aarch64_expand_vector_init (rtx target
, rtx vals
)
13818 machine_mode mode
= GET_MODE (target
);
13819 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
13820 /* The number of vector elements. */
13821 int n_elts
= XVECLEN (vals
, 0);
13822 /* The number of vector elements which are not constant. */
13824 rtx any_const
= NULL_RTX
;
13825 /* The first element of vals. */
13826 rtx v0
= XVECEXP (vals
, 0, 0);
13827 bool all_same
= true;
13829 /* Count the number of variable elements to initialise. */
13830 for (int i
= 0; i
< n_elts
; ++i
)
13832 rtx x
= XVECEXP (vals
, 0, i
);
13833 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
13838 all_same
&= rtx_equal_p (x
, v0
);
13841 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13842 how best to handle this. */
13845 rtx constant
= aarch64_simd_make_constant (vals
);
13846 if (constant
!= NULL_RTX
)
13848 emit_move_insn (target
, constant
);
13853 /* Splat a single non-constant element if we can. */
13856 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
13857 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13861 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
13862 gcc_assert (icode
!= CODE_FOR_nothing
);
13864 /* If there are only variable elements, try to optimize
13865 the insertion using dup for the most common element
13866 followed by insertions. */
13868 /* The algorithm will fill matches[*][0] with the earliest matching element,
13869 and matches[X][1] with the count of duplicate elements (if X is the
13870 earliest element which has duplicates). */
13872 if (n_var
== n_elts
&& n_elts
<= 16)
13874 int matches
[16][2] = {0};
13875 for (int i
= 0; i
< n_elts
; i
++)
13877 for (int j
= 0; j
<= i
; j
++)
13879 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
13887 int maxelement
= 0;
13889 for (int i
= 0; i
< n_elts
; i
++)
13890 if (matches
[i
][1] > maxv
)
13893 maxv
= matches
[i
][1];
13896 /* Create a duplicate of the most common element, unless all elements
13897 are equally useless to us, in which case just immediately set the
13898 vector register using the first element. */
13902 /* For vectors of two 64-bit elements, we can do even better. */
13904 && (inner_mode
== E_DImode
13905 || inner_mode
== E_DFmode
))
13908 rtx x0
= XVECEXP (vals
, 0, 0);
13909 rtx x1
= XVECEXP (vals
, 0, 1);
13910 /* Combine can pick up this case, but handling it directly
13911 here leaves clearer RTL.
13913 This is load_pair_lanes<mode>, and also gives us a clean-up
13914 for store_pair_lanes<mode>. */
13915 if (memory_operand (x0
, inner_mode
)
13916 && memory_operand (x1
, inner_mode
)
13917 && !STRICT_ALIGNMENT
13918 && rtx_equal_p (XEXP (x1
, 0),
13919 plus_constant (Pmode
,
13921 GET_MODE_SIZE (inner_mode
))))
13924 if (inner_mode
== DFmode
)
13925 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
13927 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
13932 /* The subreg-move sequence below will move into lane zero of the
13933 vector register. For big-endian we want that position to hold
13934 the last element of VALS. */
13935 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
13936 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
13937 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
13941 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
13942 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13945 /* Insert the rest. */
13946 for (int i
= 0; i
< n_elts
; i
++)
13948 rtx x
= XVECEXP (vals
, 0, i
);
13949 if (matches
[i
][0] == maxelement
)
13951 x
= copy_to_mode_reg (inner_mode
, x
);
13952 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
13957 /* Initialise a vector which is part-variable. We want to first try
13958 to build those lanes which are constant in the most efficient way we
13960 if (n_var
!= n_elts
)
13962 rtx copy
= copy_rtx (vals
);
13964 /* Load constant part of vector. We really don't care what goes into the
13965 parts we will overwrite, but we're more likely to be able to load the
13966 constant efficiently if it has fewer, larger, repeating parts
13967 (see aarch64_simd_valid_immediate). */
13968 for (int i
= 0; i
< n_elts
; i
++)
13970 rtx x
= XVECEXP (vals
, 0, i
);
13971 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13973 rtx subst
= any_const
;
13974 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
13976 /* Look in the copied vector, as more elements are const. */
13977 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
13978 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
13984 XVECEXP (copy
, 0, i
) = subst
;
13986 aarch64_expand_vector_init (target
, copy
);
13989 /* Insert the variable lanes directly. */
13990 for (int i
= 0; i
< n_elts
; i
++)
13992 rtx x
= XVECEXP (vals
, 0, i
);
13993 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13995 x
= copy_to_mode_reg (inner_mode
, x
);
13996 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
14000 static unsigned HOST_WIDE_INT
14001 aarch64_shift_truncation_mask (machine_mode mode
)
14003 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
14005 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
14008 /* Select a format to encode pointers in exception handling data. */
14010 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
14013 switch (aarch64_cmodel
)
14015 case AARCH64_CMODEL_TINY
:
14016 case AARCH64_CMODEL_TINY_PIC
:
14017 case AARCH64_CMODEL_SMALL
:
14018 case AARCH64_CMODEL_SMALL_PIC
:
14019 case AARCH64_CMODEL_SMALL_SPIC
:
14020 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14022 type
= DW_EH_PE_sdata4
;
14025 /* No assumptions here. 8-byte relocs required. */
14026 type
= DW_EH_PE_sdata8
;
14029 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
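/* For the tiny and small code models this yields DW_EH_PE_pcrel
   | DW_EH_PE_sdata4 (plus DW_EH_PE_indirect for global symbols); any other
   code model gets the 8-byte DW_EH_PE_sdata8 encoding instead.  */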
14032 /* The last .arch and .tune assembly strings that we printed. */
14033 static std::string aarch64_last_printed_arch_string
;
14034 static std::string aarch64_last_printed_tune_string
;
14036 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14037 by the function fndecl. */
14040 aarch64_declare_function_name (FILE *stream
, const char* name
,
14043 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14045 struct cl_target_option
*targ_options
;
14047 targ_options
= TREE_TARGET_OPTION (target_parts
);
14049 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
14050 gcc_assert (targ_options
);
14052 const struct processor
*this_arch
14053 = aarch64_get_arch (targ_options
->x_explicit_arch
);
14055 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
14056 std::string extension
14057 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
14059 /* Only update the assembler .arch string if it is distinct from the last
14060 such string we printed. */
14061 std::string to_print
= this_arch
->name
+ extension
;
14062 if (to_print
!= aarch64_last_printed_arch_string
)
14064 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
14065 aarch64_last_printed_arch_string
= to_print
;
14068 /* Print the cpu name we're tuning for in the comments, might be
14069 useful to readers of the generated asm. Do it only when it changes
14070 from function to function and verbose assembly is requested. */
14071 const struct processor
*this_tune
14072 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
14074 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
14076 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
14078 aarch64_last_printed_tune_string
= this_tune
->name
;
14081 /* Don't forget the type directive for ELF. */
14082 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
14083 ASM_OUTPUT_LABEL (stream
, name
);
14086 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14089 aarch64_start_file (void)
14091 struct cl_target_option
*default_options
14092 = TREE_TARGET_OPTION (target_option_default_node
);
14094 const struct processor
*default_arch
14095 = aarch64_get_arch (default_options
->x_explicit_arch
);
14096 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
14097 std::string extension
14098 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
14099 default_arch
->flags
);
14101 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
14102 aarch64_last_printed_tune_string
= "";
14103 asm_fprintf (asm_out_file
, "\t.arch %s\n",
14104 aarch64_last_printed_arch_string
.c_str ());
14106 default_file_start ();
/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
                             rtx mem, rtx model_rtx)
{
  emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
}

/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
                              rtx rval, rtx mem, rtx model_rtx)
{
  emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
}

/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
14136 /* Expand a compare and swap pattern. */
14139 aarch64_expand_compare_and_swap (rtx operands
[])
14141 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
14142 machine_mode mode
, cmp_mode
;
14144 bval
= operands
[0];
14145 rval
= operands
[1];
14147 oldval
= operands
[3];
14148 newval
= operands
[4];
14149 is_weak
= operands
[5];
14150 mod_s
= operands
[6];
14151 mod_f
= operands
[7];
14152 mode
= GET_MODE (mem
);
14155 /* Normally the succ memory model must be stronger than fail, but in the
14156 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14157 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14159 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
14160 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
14161 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
14167 /* For short modes, we're going to perform the comparison in SImode,
14168 so do the zero-extension now. */
14170 rval
= gen_reg_rtx (SImode
);
14171 oldval
= convert_modes (SImode
, mode
, oldval
, true);
14172 /* Fall through. */
14176 /* Force the value into a register if needed. */
14177 if (!aarch64_plus_operand (oldval
, mode
))
14178 oldval
= force_reg (cmp_mode
, oldval
);
14182 gcc_unreachable ();
14186 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
, oldval
,
14187 newval
, is_weak
, mod_s
,
14190 emit_insn (gen_aarch64_compare_and_swap (mode
, rval
, mem
, oldval
, newval
,
14191 is_weak
, mod_s
, mod_f
));
14194 if (mode
== QImode
|| mode
== HImode
)
14195 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
14197 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14198 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
14199 emit_insn (gen_rtx_SET (bval
, x
));
/* Test whether the target supports using an atomic load-operate instruction.
   CODE is the operation and AFTER is TRUE if the data in memory after the
   operation should be returned and FALSE if the data before the operation
   should be returned.  Returns FALSE if the operation isn't supported by the
   architecture.  */

static bool
aarch64_atomic_ldop_supported_p (enum rtx_code code)
14228 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14229 sequence implementing an atomic operation. */
14232 aarch64_emit_post_barrier (enum memmodel model
)
14234 const enum memmodel base_model
= memmodel_base (model
);
14236 if (is_mm_sync (model
)
14237 && (base_model
== MEMMODEL_ACQUIRE
14238 || base_model
== MEMMODEL_ACQ_REL
14239 || base_model
== MEMMODEL_SEQ_CST
))
14241 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
14245 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14246 for the data in memory. EXPECTED is the value expected to be in memory.
14247 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14248 is the memory ordering to use. */
14251 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
14252 rtx expected
, rtx desired
,
14257 mode
= GET_MODE (mem
);
14259 /* Move the expected value into the CAS destination register. */
14260 emit_insn (gen_rtx_SET (rval
, expected
));
14262 /* Emit the CAS. */
14263 emit_insn (gen_aarch64_atomic_cas (mode
, rval
, mem
, desired
, model
));
14265 /* Compare the expected value with the value loaded by the CAS, to establish
14266 whether the swap was made. */
14267 aarch64_gen_compare_reg (EQ
, rval
, expected
);
14270 /* Split a compare and swap pattern. */
14273 aarch64_split_compare_and_swap (rtx operands
[])
14275 rtx rval
, mem
, oldval
, newval
, scratch
;
14278 rtx_code_label
*label1
, *label2
;
14280 enum memmodel model
;
14283 rval
= operands
[0];
14285 oldval
= operands
[2];
14286 newval
= operands
[3];
14287 is_weak
= (operands
[4] != const0_rtx
);
14288 model_rtx
= operands
[5];
14289 scratch
= operands
[7];
14290 mode
= GET_MODE (mem
);
14291 model
= memmodel_from_int (INTVAL (model_rtx
));
14293 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14296 LD[A]XR rval, [mem]
14298 ST[L]XR scratch, newval, [mem]
14299 CBNZ scratch, .label1
14302 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
14307 label1
= gen_label_rtx ();
14308 emit_label (label1
);
14310 label2
= gen_label_rtx ();
14312 /* The initial load can be relaxed for a __sync operation since a final
14313 barrier will be emitted to stop code hoisting. */
14314 if (is_mm_sync (model
))
14315 aarch64_emit_load_exclusive (mode
, rval
, mem
,
14316 GEN_INT (MEMMODEL_RELAXED
));
14318 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
14322 if (aarch64_track_speculation
)
14324 /* Emit an explicit compare instruction, so that we can correctly
14325 track the condition codes. */
14326 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
14327 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
14330 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
14332 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14333 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14334 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14338 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
14339 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14340 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14341 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14342 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14345 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
14349 if (aarch64_track_speculation
)
14351 /* Emit an explicit compare instruction, so that we can correctly
14352 track the condition codes. */
14353 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
14354 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
14357 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
14359 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14360 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
14361 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14365 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14366 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
14367 emit_insn (gen_rtx_SET (cond
, x
));
14370 emit_label (label2
);
14371 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14372 to set the condition flags. If this is not used it will be removed by
14376 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14377 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
14378 emit_insn (gen_rtx_SET (cond
, x
));
14380 /* Emit any final barrier needed for a __sync operation. */
14381 if (is_mm_sync (model
))
14382 aarch64_emit_post_barrier (model
);
14385 /* Emit a BIC instruction. */
14388 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
14390 rtx shift_rtx
= GEN_INT (shift
);
14391 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14395 case E_SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
14396 case E_DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
14398 gcc_unreachable ();
14401 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
/* Emit an atomic swap.  */

static void
aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
                          rtx mem, rtx model)
{
  emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
}
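/* DST receives the previous contents of MEM while VALUE is stored to it,
   with the memory ordering given by MODEL.  On targets with LSE this is
   expected to map to a single SWP-family instruction.  */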
14413 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14414 location to store the data read from memory. OUT_RESULT is the location to
14415 store the result of the operation. MEM is the memory location to read and
14416 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14417 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14421 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
14422 rtx mem
, rtx value
, rtx model_rtx
)
14424 machine_mode mode
= GET_MODE (mem
);
14425 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14426 const bool short_mode
= (mode
< SImode
);
14432 out_data
= gen_lowpart (mode
, out_data
);
14435 out_result
= gen_lowpart (mode
, out_result
);
14437 /* Make sure the value is in a register, putting it into a destination
14438 register if it needs to be manipulated. */
14439 if (!register_operand (value
, mode
)
14440 || code
== AND
|| code
== MINUS
)
14442 src
= out_result
? out_result
: out_data
;
14443 emit_move_insn (src
, gen_lowpart (mode
, value
));
14447 gcc_assert (register_operand (src
, mode
));
14449 /* Preprocess the data for the operation as necessary. If the operation is
14450 a SET then emit a swap instruction and finish. */
14454 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
14458 /* Negate the value and treat it as a PLUS. */
14462 /* Resize the value if necessary. */
14464 src
= gen_lowpart (wmode
, src
);
14466 neg_src
= gen_rtx_NEG (wmode
, src
);
14467 emit_insn (gen_rtx_SET (src
, neg_src
));
14470 src
= gen_lowpart (mode
, src
);
14472 /* Fall-through. */
14474 ldop_code
= UNSPECV_ATOMIC_LDOP_PLUS
;
14478 ldop_code
= UNSPECV_ATOMIC_LDOP_OR
;
14482 ldop_code
= UNSPECV_ATOMIC_LDOP_XOR
;
14489 /* Resize the value if necessary. */
14491 src
= gen_lowpart (wmode
, src
);
14493 not_src
= gen_rtx_NOT (wmode
, src
);
14494 emit_insn (gen_rtx_SET (src
, not_src
));
14497 src
= gen_lowpart (mode
, src
);
14499 ldop_code
= UNSPECV_ATOMIC_LDOP_BIC
;
14503 /* The operation can't be done with atomic instructions. */
14504 gcc_unreachable ();
14507 emit_insn (gen_aarch64_atomic_load (ldop_code
, mode
,
14508 out_data
, mem
, src
, model_rtx
));
14510 /* If necessary, calculate the data in memory after the update by redoing the
14511 operation from values in registers. */
14517 src
= gen_lowpart (wmode
, src
);
14518 out_data
= gen_lowpart (wmode
, out_data
);
14519 out_result
= gen_lowpart (wmode
, out_result
);
14528 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
14531 x
= gen_rtx_IOR (wmode
, out_data
, src
);
14534 x
= gen_rtx_XOR (wmode
, out_data
, src
);
14537 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
14540 gcc_unreachable ();
14543 emit_set_insn (out_result
, x
);
14548 /* Split an atomic operation. */
14551 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
14552 rtx value
, rtx model_rtx
, rtx cond
)
14554 machine_mode mode
= GET_MODE (mem
);
14555 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14556 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
14557 const bool is_sync
= is_mm_sync (model
);
14558 rtx_code_label
*label
;
14561 /* Split the atomic operation into a sequence. */
14562 label
= gen_label_rtx ();
14563 emit_label (label
);
14566 new_out
= gen_lowpart (wmode
, new_out
);
14568 old_out
= gen_lowpart (wmode
, old_out
);
14571 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
14573 /* The initial load can be relaxed for a __sync operation since a final
14574 barrier will be emitted to stop code hoisting. */
14576 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
14577 GEN_INT (MEMMODEL_RELAXED
));
14579 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
14588 x
= gen_rtx_AND (wmode
, old_out
, value
);
14589 emit_insn (gen_rtx_SET (new_out
, x
));
14590 x
= gen_rtx_NOT (wmode
, new_out
);
14591 emit_insn (gen_rtx_SET (new_out
, x
));
14595 if (CONST_INT_P (value
))
14597 value
= GEN_INT (-INTVAL (value
));
14600 /* Fall through. */
14603 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
14604 emit_insn (gen_rtx_SET (new_out
, x
));
14608 aarch64_emit_store_exclusive (mode
, cond
, mem
,
14609 gen_lowpart (mode
, new_out
), model_rtx
);
14611 if (aarch64_track_speculation
)
14613 /* Emit an explicit compare instruction, so that we can correctly
14614 track the condition codes. */
14615 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
14616 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
14619 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14621 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14622 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
14623 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14625 /* Emit any final barrier needed for a __sync operation. */
14627 aarch64_emit_post_barrier (model
);
14631 aarch64_init_libfuncs (void)
14633 /* Half-precision float operations. The compiler handles all operations
14634 with NULL libfuncs by converting to SFmode. */
14637 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
14638 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
14641 set_optab_libfunc (add_optab
, HFmode
, NULL
);
14642 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
14643 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
14644 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
14645 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
14648 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
14649 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
14650 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
14651 set_optab_libfunc (le_optab
, HFmode
, NULL
);
14652 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
14653 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
14654 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}

/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
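/* Worked example: with s = 0, n = 16 and r = -1 the formula gives
   (16/16) * 2^-1 = 0.5, so 0.5 is representable.  The representable
   magnitudes range from (16/16) * 2^-3 = 0.125 up to (31/16) * 2^4 = 31.0.  */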
14679 /* Return true iff X can be represented by a quarter-precision
14680 floating point immediate operand X. Note, we cannot represent 0.0. */
14682 aarch64_float_const_representable_p (rtx x
)
14684 /* This represents our current view of how many bits
14685 make up the mantissa. */
14686 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
14688 unsigned HOST_WIDE_INT mantissa
, mask
;
14689 REAL_VALUE_TYPE r
, m
;
14692 if (!CONST_DOUBLE_P (x
))
14695 if (GET_MODE (x
) == VOIDmode
14696 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
14699 r
= *CONST_DOUBLE_REAL_VALUE (x
);
14701 /* We cannot represent infinities, NaNs or +/-zero. We won't
14702 know if we have +zero until we analyse the mantissa, but we
14703 can reject the other invalid values. */
14704 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
14705 || REAL_VALUE_MINUS_ZERO (r
))
14708 /* Extract exponent. */
14709 r
= real_value_abs (&r
);
14710 exponent
= REAL_EXP (&r
);
14712 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14713 highest (sign) bit, with a fixed binary point at bit point_pos.
14714 m1 holds the low part of the mantissa, m2 the high part.
14715 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14716 bits for the mantissa, this can fail (low bits will be lost). */
14717 real_ldexp (&m
, &r
, point_pos
- exponent
);
14718 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
14720 /* If the low part of the mantissa has bits set we cannot represent
14722 if (w
.ulow () != 0)
14724 /* We have rejected the lower HOST_WIDE_INT, so update our
14725 understanding of how many bits lie in the mantissa and
14726 look only at the high HOST_WIDE_INT. */
14727 mantissa
= w
.elt (1);
14728 point_pos
-= HOST_BITS_PER_WIDE_INT
;
14730 /* We can only represent values with a mantissa of the form 1.xxxx. */
14731 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
14732 if ((mantissa
& mask
) != 0)
14735 /* Having filtered unrepresentable values, we may now remove all
14736 but the highest 5 bits. */
14737 mantissa
>>= point_pos
- 5;
14739 /* We cannot represent the value 0.0, so reject it. This is handled
14744 /* Then, as bit 4 is always set, we can mask it off, leaving
14745 the mantissa in the range [0, 15]. */
14746 mantissa
&= ~(1 << 4);
14747 gcc_assert (mantissa
<= 15);
14749 /* GCC internally does not use IEEE754-like encoding (where normalized
14750 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14751 Our mantissa values are shifted 4 places to the left relative to
14752 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14753 by 5 places to correct for GCC's representation. */
14754 exponent
= 5 - exponent
;
14756 return (exponent
>= 0 && exponent
<= 7);
14759 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14760 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14761 output MOVI/MVNI, ORR or BIC immediate. */
14763 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
14764 enum simd_immediate_check which
)
14767 static char templ
[40];
14768 const char *mnemonic
;
14769 const char *shift_op
;
14770 unsigned int lane_count
= 0;
14773 struct simd_immediate_info info
;
14775 /* This will return true to show const_vector is legal for use as either
14776 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14777 It will also update INFO to show how the immediate should be generated.
14778 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14779 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
14780 gcc_assert (is_valid
);
14782 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14783 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
14785 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14787 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
14788 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14789 move immediate path. */
14790 if (aarch64_float_const_zero_rtx_p (info
.value
))
14791 info
.value
= GEN_INT (0);
14794 const unsigned int buf_size
= 20;
14795 char float_buf
[buf_size
] = {'\0'};
14796 real_to_decimal_for_mode (float_buf
,
14797 CONST_DOUBLE_REAL_VALUE (info
.value
),
14798 buf_size
, buf_size
, 1, info
.elt_mode
);
14800 if (lane_count
== 1)
14801 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
14803 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
14804 lane_count
, element_char
, float_buf
);
14809 gcc_assert (CONST_INT_P (info
.value
));
14811 if (which
== AARCH64_CHECK_MOV
)
14813 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
14814 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
14815 if (lane_count
== 1)
14816 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
14817 mnemonic
, UINTVAL (info
.value
));
14818 else if (info
.shift
)
14819 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14820 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
14821 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
14823 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14824 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
14825 element_char
, UINTVAL (info
.value
));
14829 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14830 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
14832 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14833 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
14834 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
14836 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14837 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
14838 element_char
, UINTVAL (info
.value
));
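/* As a rough illustration of the strings built above (after operand
   substitution): a MOVI/MVNI immediate might come out as
   "movi v0.4s, 0x55, lsl 16", while the ORR/BIC form uses a decimal
   immediate, e.g. "orr v0.4s, #255, lsl #8".  */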
14844 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
14847 /* If a floating point number was passed and we desire to use it in an
14848 integer mode do the conversion to integer. */
14849 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
14851 unsigned HOST_WIDE_INT ival
;
14852 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
14853 gcc_unreachable ();
14854 immediate
= gen_int_mode (ival
, mode
);
14857 machine_mode vmode
;
  /* Use a 64-bit mode for everything except for DI/DF mode, where we use
     a 128-bit vector mode.  */
14860 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
14862 vmode
= aarch64_simd_container_mode (mode
, width
);
14863 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
14864 return aarch64_output_simd_mov_immediate (v_op
, width
);
14867 /* Return the output string to use for moving immediate CONST_VECTOR
14868 into an SVE register. */
14871 aarch64_output_sve_mov_immediate (rtx const_vector
)
14873 static char templ
[40];
14874 struct simd_immediate_info info
;
14877 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
14878 gcc_assert (is_valid
);
14880 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14884 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
14885 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
14886 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
14890 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14892 if (aarch64_float_const_zero_rtx_p (info
.value
))
14893 info
.value
= GEN_INT (0);
14896 const int buf_size
= 20;
14897 char float_buf
[buf_size
] = {};
14898 real_to_decimal_for_mode (float_buf
,
14899 CONST_DOUBLE_REAL_VALUE (info
.value
),
14900 buf_size
, buf_size
, 1, info
.elt_mode
);
14902 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
14903 element_char
, float_buf
);
14908 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
14909 element_char
, INTVAL (info
.value
));
14913 /* Return the asm format for a PTRUE instruction whose destination has
14914 mode MODE. SUFFIX is the element size suffix. */
14917 aarch64_output_ptrue (machine_mode mode
, char suffix
)
14919 unsigned int nunits
;
14920 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
14921 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
14922 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
14924 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
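/* For example, with a fixed 128-bit SVE vector length a VNx16BI
   destination yields "ptrue p0.b, vl16", while a variable-length
   destination uses "ptrue p0.b, all" (illustrative; the predicate
   register comes from operand 0).  */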
14928 /* Split operands into moves from op[1] + op[2] into op[0]. */
14931 aarch64_split_combinev16qi (rtx operands
[3])
14933 unsigned int dest
= REGNO (operands
[0]);
14934 unsigned int src1
= REGNO (operands
[1]);
14935 unsigned int src2
= REGNO (operands
[2]);
14936 machine_mode halfmode
= GET_MODE (operands
[1]);
14937 unsigned int halfregs
= REG_NREGS (operands
[1]);
14938 rtx destlo
, desthi
;
14940 gcc_assert (halfmode
== V16QImode
);
14942 if (src1
== dest
&& src2
== dest
+ halfregs
)
14944 /* No-op move. Can't split to nothing; emit something. */
14945 emit_note (NOTE_INSN_DELETED
);
14949 /* Preserve register attributes for variable tracking. */
14950 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
14951 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
14952 GET_MODE_SIZE (halfmode
));
14954 /* Special case of reversed high/low parts. */
14955 if (reg_overlap_mentioned_p (operands
[2], destlo
)
14956 && reg_overlap_mentioned_p (operands
[1], desthi
))
14958 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
14959 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
14960 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
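/* The three XORs above implement the classic XOR swap of the two source
   registers, which avoids needing a scratch register when the high and
   low halves arrive in reversed registers.  */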
14962 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
14964 /* Try to avoid unnecessary moves if part of the result
14965 is in the right place already. */
14967 emit_move_insn (destlo
, operands
[1]);
14968 if (src2
!= dest
+ halfregs
)
14969 emit_move_insn (desthi
, operands
[2]);
14973 if (src2
!= dest
+ halfregs
)
14974 emit_move_insn (desthi
, operands
[2]);
14976 emit_move_insn (destlo
, operands
[1]);
14980 /* vec_perm support. */
14982 struct expand_vec_perm_d
14984 rtx target
, op0
, op1
;
14985 vec_perm_indices perm
;
14986 machine_mode vmode
;
14987 unsigned int vec_flags
;
14992 /* Generate a variable permutation. */
14995 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
14997 machine_mode vmode
= GET_MODE (target
);
14998 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15000 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
15001 gcc_checking_assert (GET_MODE (op0
) == vmode
);
15002 gcc_checking_assert (GET_MODE (op1
) == vmode
);
15003 gcc_checking_assert (GET_MODE (sel
) == vmode
);
15004 gcc_checking_assert (TARGET_SIMD
);
15008 if (vmode
== V8QImode
)
15010 /* Expand the argument to a V16QI mode by duplicating it. */
15011 rtx pair
= gen_reg_rtx (V16QImode
);
15012 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
15013 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15017 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
15024 if (vmode
== V8QImode
)
15026 pair
= gen_reg_rtx (V16QImode
);
15027 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
15028 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15032 pair
= gen_reg_rtx (OImode
);
15033 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
15034 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
15039 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15040 NELT is the number of elements in the vector. */
15043 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
15046 machine_mode vmode
= GET_MODE (target
);
15047 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15050 /* The TBL instruction does not use a modulo index, so we must take care
15051 of that ourselves. */
15052 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
15053 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
15054 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
15056 /* For big-endian, we also need to reverse the index within the vector
15057 (but not which vector). */
15058 if (BYTES_BIG_ENDIAN
)
15060 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15062 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
15063 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
15064 NULL
, 0, OPTAB_LIB_WIDEN
);
15066 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
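/* Illustrative example: for a two-input V16QI permute, SEL is first
   ANDed with a vector of 31s so that every index selects a byte of the
   32-byte {op0, op1} pair; on big-endian the extra XOR with 15 flips
   the byte numbering within each vector so that TBL sees the indices
   in the order the architecture expects.  */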
15069 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15072 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
15074 emit_insn (gen_rtx_SET (target
,
15075 gen_rtx_UNSPEC (GET_MODE (target
),
15076 gen_rtvec (2, op0
, op1
), code
)));
15079 /* Expand an SVE vec_perm with the given operands. */
15082 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15084 machine_mode data_mode
= GET_MODE (target
);
15085 machine_mode sel_mode
= GET_MODE (sel
);
15086 /* Enforced by the pattern condition. */
15087 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
15089 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15090 size of the two value vectors, i.e. the upper bits of the indices
15091 are effectively ignored. SVE TBL instead produces 0 for any
15092 out-of-range indices, so we need to modulo all the vec_perm indices
15093 to ensure they are all in range. */
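/* For example, with 4-element inputs an index of 5 means "element 1 of
   the second vector" under vec_perm semantics, but a single SVE TBL
   would return 0 for it; the masking and the two-TBL-and-OR fallback
   below preserve the semantics of the original permute.  */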
15094 rtx sel_reg
= force_reg (sel_mode
, sel
);
15096 /* Check if the sel only references the first values vector. */
15097 if (GET_CODE (sel
) == CONST_VECTOR
15098 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
15100 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
15104 /* Check if the two values vectors are the same. */
15105 if (rtx_equal_p (op0
, op1
))
15107 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
15108 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15109 NULL
, 0, OPTAB_DIRECT
);
15110 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
  /* Run TBL on each value vector and combine the results.  */
15116 rtx res0
= gen_reg_rtx (data_mode
);
15117 rtx res1
= gen_reg_rtx (data_mode
);
15118 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
15119 if (GET_CODE (sel
) != CONST_VECTOR
15120 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
15122 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
15124 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15125 NULL
, 0, OPTAB_DIRECT
);
15127 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
15128 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
15129 NULL
, 0, OPTAB_DIRECT
);
15130 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
15131 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
15132 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
15134 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
15137 /* Recognize patterns suitable for the TRN instructions. */
15139 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
15142 poly_uint64 nelt
= d
->perm
.length ();
15143 rtx out
, in0
, in1
, x
;
15144 machine_mode vmode
= d
->vmode
;
15146 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15149 /* Note that these are little-endian tests.
15150 We correct for big-endian later. */
15151 if (!d
->perm
[0].is_constant (&odd
)
15152 || (odd
!= 0 && odd
!= 1)
15153 || !d
->perm
.series_p (0, 2, odd
, 2)
15154 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
15163 /* We don't need a big-endian lane correction for SVE; see the comment
15164 at the head of aarch64-sve.md for details. */
15165 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15167 x
= in0
, in0
= in1
, in1
= x
;
15172 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15173 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
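/* As a concrete example, for V4SI the index vectors matched here are
   {0, 4, 2, 6} for TRN1 and {1, 5, 3, 7} for TRN2 (before any
   big-endian adjustment).  */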
15177 /* Recognize patterns suitable for the UZP instructions. */
15179 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
15182 rtx out
, in0
, in1
, x
;
15183 machine_mode vmode
= d
->vmode
;
15185 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15188 /* Note that these are little-endian tests.
15189 We correct for big-endian later. */
15190 if (!d
->perm
[0].is_constant (&odd
)
15191 || (odd
!= 0 && odd
!= 1)
15192 || !d
->perm
.series_p (0, 1, odd
, 2))
15201 /* We don't need a big-endian lane correction for SVE; see the comment
15202 at the head of aarch64-sve.md for details. */
15203 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15205 x
= in0
, in0
= in1
, in1
= x
;
15210 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15211 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
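/* For V4SI the index vectors matched here are {0, 2, 4, 6} for UZP1 and
   {1, 3, 5, 7} for UZP2, i.e. the even or odd elements of the
   concatenated inputs.  */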
15215 /* Recognize patterns suitable for the ZIP instructions. */
15217 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
15220 poly_uint64 nelt
= d
->perm
.length ();
15221 rtx out
, in0
, in1
, x
;
15222 machine_mode vmode
= d
->vmode
;
15224 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15227 /* Note that these are little-endian tests.
15228 We correct for big-endian later. */
15229 poly_uint64 first
= d
->perm
[0];
15230 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
15231 || !d
->perm
.series_p (0, 2, first
, 1)
15232 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
15234 high
= maybe_ne (first
, 0U);
15242 /* We don't need a big-endian lane correction for SVE; see the comment
15243 at the head of aarch64-sve.md for details. */
15244 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15246 x
= in0
, in0
= in1
, in1
= x
;
15251 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15252 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
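/* For V4SI the index vectors matched here are {0, 4, 1, 5} for ZIP1 and
   {2, 6, 3, 7} for ZIP2, interleaving the low or high halves of the two
   inputs.  */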
15256 /* Recognize patterns for the EXT insn. */
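/* For example, on V4SI an index vector such as {1, 2, 3, 4} (each index
   one larger than the previous, starting inside the first input) is
   matched here and becomes an EXT of the two inputs.  */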
15259 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
15261 HOST_WIDE_INT location
;
15264 /* The first element always refers to the first vector.
15265 Check if the extracted indices are increasing by one. */
15266 if (d
->vec_flags
== VEC_SVE_PRED
15267 || !d
->perm
[0].is_constant (&location
)
15268 || !d
->perm
.series_p (0, 1, location
, 1))
15275 /* The case where (location == 0) is a no-op for both big- and little-endian,
15276 and is removed by the mid-end at optimization levels -O1 and higher.
15278 We don't need a big-endian lane correction for SVE; see the comment
15279 at the head of aarch64-sve.md for details. */
15280 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
15282 /* After setup, we want the high elements of the first vector (stored
15283 at the LSB end of the register), and the low elements of the second
15284 vector (stored at the MSB end of the register). So swap. */
15285 std::swap (d
->op0
, d
->op1
);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
	 to_constant () is safe since this is restricted to Advanced SIMD
	 vectors.  */
15289 location
= d
->perm
.length ().to_constant () - location
;
15292 offset
= GEN_INT (location
);
15293 emit_set_insn (d
->target
,
15294 gen_rtx_UNSPEC (d
->vmode
,
15295 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
15300 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15301 within each 64-bit, 32-bit or 16-bit granule. */
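/* For instance, a V8HI permute with indices {3, 2, 1, 0, 7, 6, 5, 4}
   reverses the four 16-bit elements inside each 64-bit granule and is
   matched here as REV64.  */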
15304 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
15306 HOST_WIDE_INT diff
;
15307 unsigned int i
, size
, unspec
;
15308 machine_mode pred_mode
;
15310 if (d
->vec_flags
== VEC_SVE_PRED
15311 || !d
->one_vector_p
15312 || !d
->perm
[0].is_constant (&diff
))
15315 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
15318 unspec
= UNSPEC_REV64
;
15319 pred_mode
= VNx2BImode
;
15321 else if (size
== 4)
15323 unspec
= UNSPEC_REV32
;
15324 pred_mode
= VNx4BImode
;
15326 else if (size
== 2)
15328 unspec
= UNSPEC_REV16
;
15329 pred_mode
= VNx8BImode
;
15334 unsigned int step
= diff
+ 1;
15335 for (i
= 0; i
< step
; ++i
)
15336 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
15343 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
15344 if (d
->vec_flags
== VEC_SVE_DATA
)
15346 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15347 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
15348 UNSPEC_MERGE_PTRUE
);
15350 emit_set_insn (d
->target
, src
);
15354 /* Recognize patterns for the REV insn, which reverses elements within
15358 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
15360 poly_uint64 nelt
= d
->perm
.length ();
15362 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
15365 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
15372 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
15373 emit_set_insn (d
->target
, src
);
15378 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
15380 rtx out
= d
->target
;
15383 machine_mode vmode
= d
->vmode
;
15386 if (d
->vec_flags
== VEC_SVE_PRED
15387 || d
->perm
.encoding ().encoded_nelts () != 1
15388 || !d
->perm
[0].is_constant (&elt
))
15391 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
15398 /* The generic preparation in aarch64_expand_vec_perm_const_1
15399 swaps the operand order and the permute indices if it finds
15400 d->perm[0] to be in the second operand. Thus, we can always
15401 use d->op0 and need not do any extra arithmetic to get the
15402 correct lane number. */
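  /* For example, a permute whose encoding is the single repeated index
     {2, 2, 2, 2} on V4SI is matched here and becomes a DUP of lane 2 of
     the (possibly swapped) first operand.  */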
15404 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
15406 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
15407 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
15408 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
15413 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
15415 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
15416 machine_mode vmode
= d
->vmode
;
15418 /* Make sure that the indices are constant. */
15419 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
15420 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
15421 if (!d
->perm
[i
].is_constant ())
15427 /* Generic code will try constant permutation twice. Once with the
15428 original mode and again with the elements lowered to QImode.
15429 So wait and don't do the selector expansion ourselves. */
15430 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
15435 unsigned int nelt
= d
->perm
.length ().to_constant ();
15436 for (unsigned int i
= 0; i
< nelt
; ++i
)
15437 /* If big-endian and two vectors we end up with a weird mixed-endian
15438 mode on NEON. Reverse the index within each word but not the word
15439 itself. to_constant is safe because we checked is_constant above. */
15440 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
15441 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
15442 : d
->perm
[i
].to_constant ());
15444 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
15445 sel
= force_reg (vmode
, sel
);
15447 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
15451 /* Try to implement D using an SVE TBL instruction. */
15454 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
15456 unsigned HOST_WIDE_INT nelt
;
  /* Permuting two variable-length vectors could overflow the
     index range.  */
15460 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
15466 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
15467 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
15468 if (d
->one_vector_p
)
15469 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
15471 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
15476 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
15478 /* The pattern matching functions above are written to look for a small
15479 number to begin the sequence (0, 1, N/2). If we begin with an index
15480 from the second operand, we can swap the operands. */
15481 poly_int64 nelt
= d
->perm
.length ();
15482 if (known_ge (d
->perm
[0], nelt
))
15484 d
->perm
.rotate_inputs (1);
15485 std::swap (d
->op0
, d
->op1
);
15488 if ((d
->vec_flags
== VEC_ADVSIMD
15489 || d
->vec_flags
== VEC_SVE_DATA
15490 || d
->vec_flags
== VEC_SVE_PRED
)
15491 && known_gt (nelt
, 1))
15493 if (aarch64_evpc_rev_local (d
))
15495 else if (aarch64_evpc_rev_global (d
))
15497 else if (aarch64_evpc_ext (d
))
15499 else if (aarch64_evpc_dup (d
))
15501 else if (aarch64_evpc_zip (d
))
15503 else if (aarch64_evpc_uzp (d
))
15505 else if (aarch64_evpc_trn (d
))
15507 if (d
->vec_flags
== VEC_SVE_DATA
)
15508 return aarch64_evpc_sve_tbl (d
);
15509 else if (d
->vec_flags
== VEC_ADVSIMD
)
15510 return aarch64_evpc_tbl (d
);
15515 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15518 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
15519 rtx op1
, const vec_perm_indices
&sel
)
15521 struct expand_vec_perm_d d
;
15523 /* Check whether the mask can be applied to a single vector. */
15524 if (sel
.ninputs () == 1
15525 || (op0
&& rtx_equal_p (op0
, op1
)))
15526 d
.one_vector_p
= true;
15527 else if (sel
.all_from_input_p (0))
15529 d
.one_vector_p
= true;
15532 else if (sel
.all_from_input_p (1))
15534 d
.one_vector_p
= true;
15538 d
.one_vector_p
= false;
15540 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
15541 sel
.nelts_per_input ());
15543 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
15547 d
.testing_p
= !target
;
15550 return aarch64_expand_vec_perm_const_1 (&d
);
15552 rtx_insn
*last
= get_last_insn ();
15553 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
15554 gcc_assert (last
== get_last_insn ());
15559 /* Generate a byte permute mask for a register of mode MODE,
15560 which has NUNITS units. */
15563 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
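  /* As an illustration, for V4SI (NUNITS == 4, unit size 4) the byte
     permute mask built below is {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12},
     i.e. the bytes of each element are reversed while the elements stay
     in place.  */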
15568 rtvec v
= rtvec_alloc (16);
15570 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
15572 gcc_assert (BYTES_BIG_ENDIAN
);
15573 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
15575 for (i
= 0; i
< nunits
; i
++)
15576 for (j
= 0; j
< usize
; j
++)
15577 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
15578 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
15579 return force_reg (V16QImode
, mask
);
15582 /* Return true if X is a valid second operand for the SVE instruction
15583 that implements integer comparison OP_CODE. */
15586 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
15588 if (register_operand (x
, VOIDmode
))
15597 return aarch64_sve_cmp_immediate_p (x
, false);
15604 return aarch64_sve_cmp_immediate_p (x
, true);
15606 gcc_unreachable ();
15610 /* Use predicated SVE instructions to implement the equivalent of:
15614 given that PTRUE is an all-true predicate of the appropriate mode. */
15617 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
15619 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15620 gen_rtvec (2, ptrue
, op
),
15621 UNSPEC_MERGE_PTRUE
);
15622 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
15623 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15626 /* Likewise, but also clobber the condition codes. */
15629 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
15631 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15632 gen_rtvec (2, ptrue
, op
),
15633 UNSPEC_MERGE_PTRUE
);
15634 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
15635 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15638 /* Return the UNSPEC_COND_* code for comparison CODE. */
15640 static unsigned int
15641 aarch64_unspec_cond_code (rtx_code code
)
15646 return UNSPEC_COND_NE
;
15648 return UNSPEC_COND_EQ
;
15650 return UNSPEC_COND_LT
;
15652 return UNSPEC_COND_GT
;
15654 return UNSPEC_COND_LE
;
15656 return UNSPEC_COND_GE
;
15658 gcc_unreachable ();
15664 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15666 where <X> is the operation associated with comparison CODE. This form
15667 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15668 semantics, such as when PRED might not be all-true and when comparing
15669 inactive lanes could have side effects. */
15672 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
15673 rtx pred
, rtx op0
, rtx op1
)
15675 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
15676 gen_rtvec (3, pred
, op0
, op1
),
15677 aarch64_unspec_cond_code (code
));
15678 emit_set_insn (target
, unspec
);
15681 /* Expand an SVE integer comparison using the SVE equivalent of:
15683 (set TARGET (CODE OP0 OP1)). */
15686 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
15688 machine_mode pred_mode
= GET_MODE (target
);
15689 machine_mode data_mode
= GET_MODE (op0
);
15691 if (!aarch64_sve_cmp_operand_p (code
, op1
))
15692 op1
= force_reg (data_mode
, op1
);
15694 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15695 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15696 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
15699 /* Emit the SVE equivalent of:
15701 (set TMP1 (CODE1 OP0 OP1))
15702 (set TMP2 (CODE2 OP0 OP1))
15703 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15705 PTRUE is an all-true predicate with the same mode as TARGET. */
15708 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
15709 rtx ptrue
, rtx op0
, rtx op1
)
15711 machine_mode pred_mode
= GET_MODE (ptrue
);
15712 rtx tmp1
= gen_reg_rtx (pred_mode
);
15713 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
15714 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
15715 rtx tmp2
= gen_reg_rtx (pred_mode
);
15716 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
15717 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
15718 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
15721 /* Emit the SVE equivalent of:
15723 (set TMP (CODE OP0 OP1))
15724 (set TARGET (not TMP))
15726 PTRUE is an all-true predicate with the same mode as TARGET. */
15729 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
15732 machine_mode pred_mode
= GET_MODE (ptrue
);
15733 rtx tmp
= gen_reg_rtx (pred_mode
);
15734 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
15735 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
15736 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15739 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15741 (set TARGET (CODE OP0 OP1))
15743 If CAN_INVERT_P is true, the caller can also handle inverted results;
15744 return true if the result is in fact inverted. */
15747 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
15748 rtx op0
, rtx op1
, bool can_invert_p
)
15750 machine_mode pred_mode
= GET_MODE (target
);
15751 machine_mode data_mode
= GET_MODE (op0
);
15753 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15757 /* UNORDERED has no immediate form. */
15758 op1
= force_reg (data_mode
, op1
);
15767 /* There is native support for the comparison. */
15768 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15769 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15774 /* This is a trapping operation (LT or GT). */
15775 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
15779 if (!flag_trapping_math
)
15781 /* This would trap for signaling NaNs. */
15782 op1
= force_reg (data_mode
, op1
);
15783 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
15791 if (flag_trapping_math
)
15793 /* Work out which elements are ordered. */
15794 rtx ordered
= gen_reg_rtx (pred_mode
);
15795 op1
= force_reg (data_mode
, op1
);
15796 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
15798 /* Test the opposite condition for the ordered elements,
15799 then invert the result. */
15803 code
= reverse_condition_maybe_unordered (code
);
15806 aarch64_emit_sve_predicated_cond (target
, code
,
15807 ordered
, op0
, op1
);
15810 rtx tmp
= gen_reg_rtx (pred_mode
);
15811 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
15812 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15818 /* ORDERED has no immediate form. */
15819 op1
= force_reg (data_mode
, op1
);
15823 gcc_unreachable ();
15826 /* There is native support for the inverse comparison. */
15827 code
= reverse_condition_maybe_unordered (code
);
15830 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15831 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15834 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
15838 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15839 of the data being selected and CMP_MODE is the mode of the values being
15843 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
15846 machine_mode pred_mode
15847 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
15848 GET_MODE_SIZE (cmp_mode
)).require ();
15849 rtx pred
= gen_reg_rtx (pred_mode
);
15850 if (FLOAT_MODE_P (cmp_mode
))
15852 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
15853 ops
[4], ops
[5], true))
15854 std::swap (ops
[1], ops
[2]);
15857 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
15859 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
15860 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */
15872 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
15874 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
15883 if (aarch64_vector_data_mode_p (mode1
)
15884 && aarch64_vector_data_mode_p (mode2
))
15887 /* Also allow any scalar modes with vectors. */
15888 if (aarch64_vector_mode_supported_p (mode1
)
15889 || aarch64_vector_mode_supported_p (mode2
))
15895 /* Return a new RTX holding the result of moving POINTER forward by
15899 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
15901 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
15903 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
15907 /* Return a new RTX holding the result of moving POINTER forward by the
15908 size of the mode it points to. */
15911 aarch64_progress_pointer (rtx pointer
)
15913 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
15916 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15920 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
15923 rtx reg
= gen_reg_rtx (mode
);
15925 /* "Cast" the pointers to the correct mode. */
15926 *src
= adjust_address (*src
, mode
, 0);
15927 *dst
= adjust_address (*dst
, mode
, 0);
15928 /* Emit the memcpy. */
15929 emit_move_insn (reg
, *src
);
15930 emit_move_insn (*dst
, reg
);
15931 /* Move the pointers forward. */
15932 *src
= aarch64_progress_pointer (*src
);
15933 *dst
= aarch64_progress_pointer (*dst
);
15936 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15937 we succeed, otherwise return false. */
15940 aarch64_expand_movmem (rtx
*operands
)
15943 rtx dst
= operands
[0];
15944 rtx src
= operands
[1];
15946 machine_mode cur_mode
= BLKmode
, next_mode
;
15947 bool speed_p
= !optimize_function_for_size_p (cfun
);
  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
15953 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
15955 /* We can't do anything smart if the amount to copy is not constant. */
15956 if (!CONST_INT_P (operands
[2]))
15959 n
= INTVAL (operands
[2]);
  /* Try to keep the number of instructions low.  For all cases we will do at
     most two moves for the residual amount, since we'll always overlap the
     remainder.  */
15964 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
15967 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
15968 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
15970 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
15971 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
15973 /* Convert n to bits to make the rest of the code simpler. */
15974 n
= n
* BITS_PER_UNIT
;
15976 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
15977 larger than TImode, but we should not use them for loads/stores here. */
15978 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
      /* Find the largest mode in which to do the copy without over-reading
	 or writing.  */
15984 opt_scalar_int_mode mode_iter
;
15985 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
15986 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
15987 cur_mode
= mode_iter
.require ();
15989 gcc_assert (cur_mode
!= BLKmode
);
15991 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
15992 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
      /* Do certain trailing copies as overlapping if it's going to be
	 cheaper, i.e. fewer instructions.  For instance, doing a 15-byte
	 copy with two overlapping 8-byte copies is more efficient than
	 using an 8-byte, a 4-byte, a 2-byte and a 1-byte copy.  */
16000 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
16002 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
16003 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
16004 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
16005 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
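      /* Worked example (a sketch of the logic above): for a 15-byte copy
	 the first pass copies 8 bytes in DImode, leaving 7 bytes; the
	 block above then backs both pointers up by 1 byte and another
	 8-byte copy is done, so the tail is handled by one overlapping
	 DImode copy instead of several smaller ones.  */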
16013 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16014 SImode stores. Handle the case when the constant has identical
16015 bottom and top halves. This is beneficial when the two stores can be
16016 merged into an STP and we avoid synthesising potentially expensive
16017 immediates twice. Return true if such a split is possible. */
16020 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
16022 rtx lo
= gen_lowpart (SImode
, src
);
16023 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
16025 bool size_p
= optimize_function_for_size_p (cfun
);
16027 if (!rtx_equal_p (lo
, hi
))
16030 unsigned int orig_cost
16031 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
16032 unsigned int lo_cost
16033 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
16048 if (size_p
&& orig_cost
<= lo_cost
)
16052 && (orig_cost
<= lo_cost
+ 1))
16055 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
16056 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
16059 rtx tmp_reg
= gen_reg_rtx (SImode
);
16060 aarch64_expand_mov_immediate (tmp_reg
, lo
);
16061 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
  /* Don't emit an explicit store pair as this may not always be profitable.
     Let the sched-fusion logic decide whether to merge them.  */
16064 emit_move_insn (mem_lo
, tmp_reg
);
16065 emit_move_insn (mem_hi
, tmp_reg
);
16070 /* Generate RTL for a conditional branch with rtx comparison CODE in
16071 mode CC_MODE. The destination of the unlikely conditional branch
16075 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
16079 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
16080 gen_rtx_REG (cc_mode
, CC_REGNUM
),
16083 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16084 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
16086 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16089 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16091 OP1 represents the TImode destination operand 1
16092 OP2 represents the TImode destination operand 2
16093 LOW_DEST represents the low half (DImode) of TImode operand 0
16094 LOW_IN1 represents the low half (DImode) of TImode operand 1
16095 LOW_IN2 represents the low half (DImode) of TImode operand 2
16096 HIGH_DEST represents the high half (DImode) of TImode operand 0
16097 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16098 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16101 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16102 rtx
*low_in1
, rtx
*low_in2
,
16103 rtx
*high_dest
, rtx
*high_in1
,
16106 *low_dest
= gen_reg_rtx (DImode
);
16107 *low_in1
= gen_lowpart (DImode
, op1
);
16108 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16109 subreg_lowpart_offset (DImode
, TImode
));
16110 *high_dest
= gen_reg_rtx (DImode
);
16111 *high_in1
= gen_highpart (DImode
, op1
);
16112 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16113 subreg_highpart_offset (DImode
, TImode
));
16116 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
   This function differs from 'aarch64_addti_scratch_regs' in that
16119 OP1 can be an immediate constant (zero). We must call
16120 subreg_highpart_offset with DImode and TImode arguments, otherwise
16121 VOIDmode will be used for the const_int which generates an internal
16122 error from subreg_size_highpart_offset which does not expect a size of zero.
16124 OP1 represents the TImode destination operand 1
16125 OP2 represents the TImode destination operand 2
16126 LOW_DEST represents the low half (DImode) of TImode operand 0
16127 LOW_IN1 represents the low half (DImode) of TImode operand 1
16128 LOW_IN2 represents the low half (DImode) of TImode operand 2
16129 HIGH_DEST represents the high half (DImode) of TImode operand 0
16130 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16131 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16135 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16136 rtx
*low_in1
, rtx
*low_in2
,
16137 rtx
*high_dest
, rtx
*high_in1
,
16140 *low_dest
= gen_reg_rtx (DImode
);
16141 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16142 subreg_lowpart_offset (DImode
, TImode
));
16144 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16145 subreg_lowpart_offset (DImode
, TImode
));
16146 *high_dest
= gen_reg_rtx (DImode
);
16148 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16149 subreg_highpart_offset (DImode
, TImode
));
16150 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16151 subreg_highpart_offset (DImode
, TImode
));
16154 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16156 OP0 represents the TImode destination operand 0
16157 LOW_DEST represents the low half (DImode) of TImode operand 0
16158 LOW_IN1 represents the low half (DImode) of TImode operand 1
16159 LOW_IN2 represents the low half (DImode) of TImode operand 2
16160 HIGH_DEST represents the high half (DImode) of TImode operand 0
16161 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16162 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16165 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
16166 rtx low_in2
, rtx high_dest
, rtx high_in1
,
16169 if (low_in2
== const0_rtx
)
16171 low_dest
= low_in1
;
16172 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
,
16173 force_reg (DImode
, high_in2
)));
16177 if (CONST_INT_P (low_in2
))
16179 low_in2
= force_reg (DImode
, GEN_INT (-UINTVAL (low_in2
)));
16180 high_in2
= force_reg (DImode
, high_in2
);
16181 emit_insn (gen_adddi3_compareC (low_dest
, low_in1
, low_in2
));
16184 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
16185 emit_insn (gen_subdi3_carryinCV (high_dest
,
16186 force_reg (DImode
, high_in1
),
16190 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
16191 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
16195 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16197 static unsigned HOST_WIDE_INT
16198 aarch64_asan_shadow_offset (void)
16200 return (HOST_WIDE_INT_1
<< 36);
16204 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
16205 int code
, tree treeop0
, tree treeop1
)
16207 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16209 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16211 struct expand_operand ops
[4];
16214 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16216 op_mode
= GET_MODE (op0
);
16217 if (op_mode
== VOIDmode
)
16218 op_mode
= GET_MODE (op1
);
16226 icode
= CODE_FOR_cmpsi
;
16231 icode
= CODE_FOR_cmpdi
;
16236 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16237 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
16242 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16243 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
16251 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
16252 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
16258 *prep_seq
= get_insns ();
16261 create_fixed_operand (&ops
[0], op0
);
16262 create_fixed_operand (&ops
[1], op1
);
16265 if (!maybe_expand_insn (icode
, 2, ops
))
16270 *gen_seq
= get_insns ();
16273 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
16274 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
16278 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
16279 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
16281 rtx op0
, op1
, target
;
16282 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16283 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16285 struct expand_operand ops
[6];
16288 push_to_sequence (*prep_seq
);
16289 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16291 op_mode
= GET_MODE (op0
);
16292 if (op_mode
== VOIDmode
)
16293 op_mode
= GET_MODE (op1
);
16301 icode
= CODE_FOR_ccmpsi
;
16306 icode
= CODE_FOR_ccmpdi
;
16311 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16312 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
16317 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16318 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
16326 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
16327 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
16333 *prep_seq
= get_insns ();
16336 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
16337 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
16339 if (bit_code
!= AND
)
16341 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
16342 GET_MODE (XEXP (prev
, 0))),
16343 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
16344 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
16347 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
16348 create_fixed_operand (&ops
[1], target
);
16349 create_fixed_operand (&ops
[2], op0
);
16350 create_fixed_operand (&ops
[3], op1
);
16351 create_fixed_operand (&ops
[4], prev
);
16352 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
16354 push_to_sequence (*gen_seq
);
16355 if (!maybe_expand_insn (icode
, 6, ops
))
16361 *gen_seq
= get_insns ();
16364 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
16367 #undef TARGET_GEN_CCMP_FIRST
16368 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16370 #undef TARGET_GEN_CCMP_NEXT
16371 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16373 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16374 instruction fusion of some sort. */
16377 aarch64_macro_fusion_p (void)
16379 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
16383 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16384 should be kept together during scheduling. */
16387 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
16390 rtx prev_set
= single_set (prev
);
16391 rtx curr_set
= single_set (curr
);
16392 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16393 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
16395 if (!aarch64_macro_fusion_p ())
16398 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
16400 /* We are trying to match:
16401 prev (mov) == (set (reg r0) (const_int imm16))
16402 curr (movk) == (set (zero_extract (reg r0)
16405 (const_int imm16_1)) */
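      /* In assembly terms this is, for example, the pair
	     mov  x0, #0x1234
	     movk x0, #0x5678, lsl 16
	 which is kept adjacent so that cores implementing MOV/MOVK
	 fusion can benefit.  */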
16407 set_dest
= SET_DEST (curr_set
);
16409 if (GET_CODE (set_dest
) == ZERO_EXTRACT
16410 && CONST_INT_P (SET_SRC (curr_set
))
16411 && CONST_INT_P (SET_SRC (prev_set
))
16412 && CONST_INT_P (XEXP (set_dest
, 2))
16413 && INTVAL (XEXP (set_dest
, 2)) == 16
16414 && REG_P (XEXP (set_dest
, 0))
16415 && REG_P (SET_DEST (prev_set
))
16416 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
16422 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
16425 /* We're trying to match:
16426 prev (adrp) == (set (reg r1)
16427 (high (symbol_ref ("SYM"))))
16428 curr (add) == (set (reg r0)
16430 (symbol_ref ("SYM"))))
16431 Note that r0 need not necessarily be the same as r1, especially
16432 during pre-regalloc scheduling. */
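      /* For example:
	     adrp x1, sym
	     add  x0, x1, :lo12:sym
	 is the instruction pair being matched here.  */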
16434 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16435 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16437 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
16438 && REG_P (XEXP (SET_SRC (curr_set
), 0))
16439 && REGNO (XEXP (SET_SRC (curr_set
), 0))
16440 == REGNO (SET_DEST (prev_set
))
16441 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
16442 XEXP (SET_SRC (curr_set
), 1)))
16447 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
16450 /* We're trying to match:
16451 prev (movk) == (set (zero_extract (reg r0)
16454 (const_int imm16_1))
16455 curr (movk) == (set (zero_extract (reg r0)
16458 (const_int imm16_2)) */
16460 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
16461 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
16462 && REG_P (XEXP (SET_DEST (prev_set
), 0))
16463 && REG_P (XEXP (SET_DEST (curr_set
), 0))
16464 && REGNO (XEXP (SET_DEST (prev_set
), 0))
16465 == REGNO (XEXP (SET_DEST (curr_set
), 0))
16466 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
16467 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
16468 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
16469 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
16470 && CONST_INT_P (SET_SRC (prev_set
))
16471 && CONST_INT_P (SET_SRC (curr_set
)))
16475 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
16477 /* We're trying to match:
16478 prev (adrp) == (set (reg r0)
16479 (high (symbol_ref ("SYM"))))
16480 curr (ldr) == (set (reg r1)
16481 (mem (lo_sum (reg r0)
16482 (symbol_ref ("SYM")))))
16484 curr (ldr) == (set (reg r1)
16487 (symbol_ref ("SYM")))))) */
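      /* For example:
	     adrp x0, sym
	     ldr  x1, [x0, :lo12:sym]
	 (or the zero-extending LDR form) is the pair being matched here.  */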
16488 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16489 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16491 rtx curr_src
= SET_SRC (curr_set
);
16493 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
16494 curr_src
= XEXP (curr_src
, 0);
16496 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
16497 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
16498 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
16499 == REGNO (SET_DEST (prev_set
))
16500 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
16501 XEXP (SET_SRC (prev_set
), 0)))
16506 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
16507 && aarch_crypto_can_dual_issue (prev
, curr
))
16510 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
16511 && any_condjump_p (curr
))
16513 enum attr_type prev_type
= get_attr_type (prev
);
16515 unsigned int condreg1
, condreg2
;
16517 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
16518 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
16520 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
16522 && modified_in_p (cc_reg_1
, prev
))
      /* FIXME: this misses some cases that are considered simple arithmetic
	 instructions for ThunderX.  Simple shifts are missed here.  */
16526 if (prev_type
== TYPE_ALUS_SREG
16527 || prev_type
== TYPE_ALUS_IMM
16528 || prev_type
== TYPE_LOGICS_REG
16529 || prev_type
== TYPE_LOGICS_IMM
)
16536 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
16537 && any_condjump_p (curr
))
16539 /* We're trying to match:
16540 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16541 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16543 (label_ref ("SYM"))
16545 if (SET_DEST (curr_set
) == (pc_rtx
)
16546 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
16547 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
16548 && REG_P (SET_DEST (prev_set
))
16549 && REGNO (SET_DEST (prev_set
))
16550 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
16552 /* Fuse ALU operations followed by conditional branch instruction. */
16553 switch (get_attr_type (prev
))
16556 case TYPE_ALU_SREG
:
16559 case TYPE_ADCS_REG
:
16560 case TYPE_ADCS_IMM
:
16561 case TYPE_LOGIC_REG
:
16562 case TYPE_LOGIC_IMM
:
16566 case TYPE_SHIFT_REG
:
16567 case TYPE_SHIFT_IMM
:
16582 /* Return true iff the instruction fusion described by OP is enabled. */
16585 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
16587 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
16590 /* If MEM is in the form of [base+offset], extract the two parts
16591 of address and set to BASE and OFFSET, otherwise return false
16592 after clearing BASE and OFFSET. */
16595 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
16599 gcc_assert (MEM_P (mem
));
16601 addr
= XEXP (mem
, 0);
16606 *offset
= const0_rtx
;
16610 if (GET_CODE (addr
) == PLUS
16611 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
16613 *base
= XEXP (addr
, 0);
16614 *offset
= XEXP (addr
, 1);
16619 *offset
= NULL_RTX
;
16624 /* Types for scheduling fusion. */
16625 enum sched_fusion_type
16627 SCHED_FUSION_NONE
= 0,
16628 SCHED_FUSION_LD_SIGN_EXTEND
,
16629 SCHED_FUSION_LD_ZERO_EXTEND
,
16635 /* If INSN is a load or store of address in the form of [base+offset],
16636 extract the two parts and set to BASE and OFFSET. Return scheduling
16637 fusion type this INSN is. */
16639 static enum sched_fusion_type
16640 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
16643 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
16645 gcc_assert (INSN_P (insn
));
16646 x
= PATTERN (insn
);
16647 if (GET_CODE (x
) != SET
)
16648 return SCHED_FUSION_NONE
;
16651 dest
= SET_DEST (x
);
16653 machine_mode dest_mode
= GET_MODE (dest
);
16655 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
16656 return SCHED_FUSION_NONE
;
16658 if (GET_CODE (src
) == SIGN_EXTEND
)
16660 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
16661 src
= XEXP (src
, 0);
16662 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16663 return SCHED_FUSION_NONE
;
16665 else if (GET_CODE (src
) == ZERO_EXTEND
)
16667 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
16668 src
= XEXP (src
, 0);
16669 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16670 return SCHED_FUSION_NONE
;
16673 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
16674 extract_base_offset_in_addr (src
, base
, offset
);
16675 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
16677 fusion
= SCHED_FUSION_ST
;
16678 extract_base_offset_in_addr (dest
, base
, offset
);
16681 return SCHED_FUSION_NONE
;
16683 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
16684 fusion
= SCHED_FUSION_NONE
;
16689 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other types of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
16699 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
16700 int *fusion_pri
, int *pri
)
16704 enum sched_fusion_type fusion
;
16706 gcc_assert (INSN_P (insn
));
16709 fusion
= fusion_load_store (insn
, &base
, &offset
);
16710 if (fusion
== SCHED_FUSION_NONE
)
16717 /* Set FUSION_PRI according to fusion type and base register. */
16718 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
16720 /* Calculate PRI. */
16723 /* INSN with smaller offset goes first. */
16724 off_val
= (int)(INTVAL (offset
));
16726 tmp
-= (off_val
& 0xfffff);
16728 tmp
+= ((- off_val
) & 0xfffff);
16734 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16735 Adjust priority of sha1h instructions so they are scheduled before
16736 other SHA1 instructions. */
16739 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
16741 rtx x
= PATTERN (insn
);
16743 if (GET_CODE (x
) == SET
)
16747 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
16748 return priority
+ 10;
16754 /* Given OPERANDS of consecutive load/store, check if we can merge
16755 them into ldp/stp. LOAD is true if they are load instructions.
16756 MODE is the mode of memory operands. */
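/* For example, the pair
       ldr w0, [x2]
       ldr w1, [x2, 4]
   can be rewritten as "ldp w0, w1, [x2]" when the checks below succeed.  */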
16759 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
16762 HOST_WIDE_INT offval_1
, offval_2
, msize
;
16763 enum reg_class rclass_1
, rclass_2
;
16764 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
16768 mem_1
= operands
[1];
16769 mem_2
= operands
[3];
16770 reg_1
= operands
[0];
16771 reg_2
= operands
[2];
16772 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
16773 if (REGNO (reg_1
) == REGNO (reg_2
))
16778 mem_1
= operands
[0];
16779 mem_2
= operands
[2];
16780 reg_1
= operands
[1];
16781 reg_2
= operands
[3];
16784 /* The mems cannot be volatile. */
16785 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 bytes.  */
16791 && (aarch64_tune_params
.extra_tuning_flags
16792 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16794 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
16797 /* Check if the addresses are in the form of [base+offset]. */
16798 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16799 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16801 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16802 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16805 /* Check if the bases are same. */
16806 if (!rtx_equal_p (base_1
, base_2
))
16809 /* The operands must be of the same size. */
16810 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
16811 GET_MODE_SIZE (GET_MODE (mem_2
))));
16813 offval_1
= INTVAL (offset_1
);
16814 offval_2
= INTVAL (offset_2
);
16815 /* We should only be trying this for fixed-sized modes. There is no
16816 SVE LDP/STP instruction. */
16817 msize
= GET_MODE_SIZE (mode
).to_constant ();
16818 /* Check if the offsets are consecutive. */
16819 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
16822 /* Check if the addresses are clobbered by load. */
16825 if (reg_mentioned_p (reg_1
, mem_1
))
16828 /* In increasing order, the last load can clobber the address. */
16829 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
  /* One of the memory accesses must be a mempair operand.
     If it is not the first one, they need to be swapped by the
     peephole.  */
16836 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
16837 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
16840 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
16841 rclass_1
= FP_REGS
;
16843 rclass_1
= GENERAL_REGS
;
16845 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
16846 rclass_2
= FP_REGS
;
16848 rclass_2
= GENERAL_REGS
;
16850 /* Check if the registers are of same class. */
16851 if (rclass_1
!= rclass_2
)
16857 /* Given OPERANDS of consecutive load/store that can be merged,
16858 swap them if they are not in ascending order. */
16860 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
16862 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
16863 HOST_WIDE_INT offval_1
, offval_2
;
16867 mem_1
= operands
[1];
16868 mem_2
= operands
[3];
16872 mem_1
= operands
[0];
16873 mem_2
= operands
[2];
16876 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16877 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16879 offval_1
= INTVAL (offset_1
);
16880 offval_2
= INTVAL (offset_2
);
16882 if (offval_1
> offval_2
)
16884 /* Irrespective of whether this is a load or a store,
16885 we do the same swap. */
16886 std::swap (operands
[0], operands
[2]);
16887 std::swap (operands
[1], operands
[3]);
16891 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16892 comparison between the two. */
16894 aarch64_host_wide_int_compare (const void *x
, const void *y
)
16896 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
16897 * ((const HOST_WIDE_INT
*) y
));
/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two pairs and return:

	1 iff offset (X) > offset (Y)
	0 iff offset (X) == offset (Y)
	-1 iff offset (X) < offset (Y)  */
16910 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
16912 const rtx
* operands_1
= (const rtx
*) x
;
16913 const rtx
* operands_2
= (const rtx
*) y
;
16914 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
16916 if (MEM_P (operands_1
[0]))
16917 mem_1
= operands_1
[0];
16919 mem_1
= operands_1
[1];
16921 if (MEM_P (operands_2
[0]))
16922 mem_2
= operands_2
[0];
16924 mem_2
= operands_2
[1];
16926 /* Extract the offsets. */
16927 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
16928 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
16930 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
16932 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
				       scalar_mode mode)
{
  const int num_insns = 4;
  enum reg_class rclass;
  HOST_WIDE_INT offvals[num_insns], msize;
  rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];

  if (load)
    {
      for (int i = 0; i < num_insns; i++)
	{
	  reg[i] = operands[2 * i];
	  mem[i] = operands[2 * i + 1];

	  gcc_assert (REG_P (reg[i]));
	}

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
	for (int j = i + 2; j < 8; j += 2)
	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
	    return false;
    }
  else
    for (int i = 0; i < num_insns; i++)
      {
	mem[i] = operands[2 * i];
	reg[i] = operands[2 * i + 1];
      }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
    return false;

  for (int i = 0; i < num_insns; i++)
    {
      /* The mems cannot be volatile.  */
      if (MEM_VOLATILE_P (mem[i]))
	return false;

      /* Check if the addresses are in the form of [base+offset].  */
      extract_base_offset_in_addr (mem[i], base + i, offset + i);
      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
	return false;
    }

  /* Check if the registers are of the same class.  */
  rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
	   ? FP_REGS : GENERAL_REGS;

  for (int i = 1; i < num_insns; i++)
    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
      {
	if (rclass != FP_REGS)
	  return false;
      }
    else
      {
	if (rclass != GENERAL_REGS)
	  return false;
      }

  /* Only the last register in the order in which they occur
     may be clobbered by the load.  */
  if (rclass == GENERAL_REGS && load)
    for (int i = 0; i < num_insns - 1; i++)
      if (reg_mentioned_p (reg[i], mem[i]))
	return false;

  /* Check if the bases are the same.  */
  for (int i = 0; i < num_insns - 1; i++)
    if (!rtx_equal_p (base[i], base[i + 1]))
      return false;

  for (int i = 0; i < num_insns; i++)
    offvals[i] = INTVAL (offset[i]);

  msize = GET_MODE_SIZE (mode);

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
	 aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
	&& offvals[3] == offvals[2] + msize))
    return false;

  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7-bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;

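  /* For example, with msize == 4 (SImode) each ldp/stp can reach
     [-0x40 * 4, 0x3f * 4] from its base, so the two sorted pairs can only
     share one adjusted base register if they start less than
     4 * 0x80 = 512 bytes apart.  */
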
  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 bytes.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
    return false;

  return true;
}

/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands; it is SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */
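/* OPERANDS[0..7] hold the four value/memory operand pairs being merged
   (for loads the MEMs are the odd-numbered entries, for stores the
   even-numbered ones) and OPERANDS[8] is the scratch register used to
   hold the adjusted base address.  */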
bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
			     scalar_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
		stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  */
  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
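
  /* Each qsort element is a two-rtx (value, mem) pair, so after the call
     the four pairs are ordered by increasing memory offset, as computed
     by aarch64_ldrstr_offset_compare above.  */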
  if (load)
    {
      mem_1 = temp_operands[1];
      mem_2 = temp_operands[3];
      mem_3 = temp_operands[5];
      mem_4 = temp_operands[7];
    }
  else
    {
      mem_1 = temp_operands[0];
      mem_2 = temp_operands[2];
      mem_3 = temp_operands[4];
      mem_4 = temp_operands[6];
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
	      && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;
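  /* These bounds correspond to the signed 7-bit immediate of LDP/STP,
     which is scaled by the access size: [-0x40 * msize, 0x3f * msize].  */
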
  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes (DF, DI and vector modes), we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;
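
  /* For example, with msize == 4, off_val_1 == 0x100 and off_val_3 == 0x10c,
     the midpoint 0x106 is realigned to 0x108, giving new offsets of -8 and
     4 for the two pairs, both well inside [-0x100, 0xfc].  */
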
  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     still fit.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
	 to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
						  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
						  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
						  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
						  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  return true;
}

/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}

/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */
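/* For example, a CONST_DOUBLE of 8.0 yields 3 and 1.0 yields 0, while
   3.0, 0.5 and -4.0 all yield -1.  */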
int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}

/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */
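/* In practice this means that arithmetic on __fp16 values, such as
   "a + b", is carried out in float; __fp16 acts purely as a storage
   format here.  */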
static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
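/* For instance, with 256-bit SVE vectors the VG register holds 4, so
   indeterminate 1 is 4 / 2 - 1 = 1 and an SVE data mode whose size is
   16 + 16 * x bytes works out to 32 bytes.  */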
static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
					int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}

/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}

/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */
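/* For example, with the FP16 extension the expression "x * y" on two
   _Float16 values is evaluated directly in _Float16; without it, both
   operands are promoted and the product is computed in float.  */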
static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
      case EXCESS_PRECISION_TYPE_FAST:
      case EXCESS_PRECISION_TYPE_STANDARD:
	/* We can calculate either in 16-bit range and precision or
	   32-bit range and precision.  Make that decision based on whether
	   we have native support for the ARMv8.2-A 16-bit floating-point
	   instructions or not.  */
	return (TARGET_FP_F16INST
		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
      case EXCESS_PRECISION_TYPE_IMPLICIT:
	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
      default:
	gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}

/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
      case TYPE_SDIV:
      case TYPE_UDIV:
      case TYPE_FDIVS:
      case TYPE_FDIVD:
      case TYPE_FSQRTS:
      case TYPE_FSQRTD:
      case TYPE_NEON_FP_SQRT_S:
      case TYPE_NEON_FP_SQRT_D:
      case TYPE_NEON_FP_SQRT_S_Q:
      case TYPE_NEON_FP_SQRT_D_Q:
      case TYPE_NEON_FP_DIV_S:
      case TYPE_NEON_FP_DIV_D:
      case TYPE_NEON_FP_DIV_S_Q:
      case TYPE_NEON_FP_DIV_D_Q:
	return false;
      default:
	return true;
    }
}

/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  if (BYTES_BIG_ENDIAN)
    {
      bool from_sve_p = aarch64_sve_data_mode_p (from);
      bool to_sve_p = aarch64_sve_data_mode_p (to);

      /* Don't allow changes between SVE data modes and non-SVE modes.
	 See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
	return false;

      /* Don't allow changes in element size: lane 0 of the new vector
	 would not then be lane 0 of the old vector.  See the comment
	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
	 description.

	 In the worst case, this forces a register to be spilled in
	 one mode and reloaded in the other, which handles the
	 endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
	return false;
    }
  return true;
}

/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    {
      machine_mode mode = (machine_mode) i;
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_ANY_SVE)
	bitmap_set_bit (modes, i);
    }
}

/* Override the default target speculation_safe_value.  */
static rtx
aarch64_speculation_safe_value (machine_mode mode,
				rtx result, rtx val, rtx failval)
{
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
  if (!aarch64_track_speculation)
    return default_speculation_safe_value (mode, result, val, failval);

  if (!REG_P (val))
    val = copy_to_mode_reg (mode, val);

  if (!aarch64_reg_or_zero (failval, mode))
    failval = copy_to_mode_reg (mode, failval);
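
  /* Emit the mode-specific despeculation sequence; there is one
     despeculate_copy pattern per supported access size.  */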
  switch (mode)
    {
    case E_QImode:
      emit_insn (gen_despeculate_copyqi (result, val, failval));
      break;
    case E_HImode:
      emit_insn (gen_despeculate_copyhi (result, val, failval));
      break;
    case E_SImode:
      emit_insn (gen_despeculate_copysi (result, val, failval));
      break;
    case E_DImode:
      emit_insn (gen_despeculate_copydi (result, val, failval));
      break;
    case E_TImode:
      emit_insn (gen_despeculate_copyti (result, val, failval));
      break;
    default:
      gcc_unreachable ();
    }
  return result;
}

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */
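/* The -256 minimum below matches the lower bound of the 9-bit signed
   unscaled addressing modes (ldur/stur), which can reach offsets in
   [-256, 255].  */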
#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"