/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2016 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
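/* Note: on AArch64 this evaluates to 8 under LP64 and to 4 under ILP32,
   since POINTER_SIZE is 64 or 32 bits and BITS_PER_UNIT is 8.  */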
/* Classifies an address.

     A simple base register plus immediate offset.

     A base register indexed by immediate offset with writeback.

     A base register indexed by (optionally scaled) register.

     A base register indexed by (optionally scaled) zero-extended register.

     A base register indexed by (optionally scaled) sign-extended register.

     A LO_SUM rtx with a base register and "LO12" symbol relocation.

     A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type
{

struct aarch64_address_info
{
  enum aarch64_address_type type;
  enum aarch64_symbol_type symbol_type;
struct simd_immediate_info

/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     machine_mode *, int *,
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
						 const unsigned char *sel);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */

static const struct cpu_addrcost_table cortexa57_addrcost_table =
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */

static const struct cpu_addrcost_table exynosm1_addrcost_table =
  1, /* register_offset */
  1, /* register_sextend */
  2, /* register_zextend */

static const struct cpu_addrcost_table xgene1_addrcost_table =
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */

static const struct cpu_addrcost_table vulcan_addrcost_table =
  2, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
static const struct cpu_regmove_cost generic_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost cortexa57_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost cortexa53_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost exynosm1_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */

static const struct cpu_regmove_cost thunderx_regmove_cost =

static const struct cpu_regmove_cost xgene1_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
  /* Avoid the use of int<->fp moves for spilling.  */

static const struct cpu_regmove_cost vulcan_regmove_cost =
  /* Avoid the use of int<->fp moves for spilling.  */
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
  1, /* scalar_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* vec_stmt_cost */
  2, /* vec_permute_cost */
  1, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* vec_align_load_cost */
  1, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
  1, /* scalar_stmt_cost */
  3, /* scalar_load_cost */
  1, /* scalar_store_cost */
  4, /* vec_stmt_cost */
  4, /* vec_permute_cost */
  2, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  3, /* vec_align_load_cost */
  10, /* vec_unalign_load_cost */
  10, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  3 /* cond_not_taken_branch_cost */

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
  1, /* scalar_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* vec_stmt_cost */
  3, /* vec_permute_cost */
  8, /* vec_to_scalar_cost */
  8, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */

static const struct cpu_vector_cost exynosm1_vector_cost =
  1, /* scalar_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* vec_stmt_cost */
  3, /* vec_permute_cost */
  3, /* vec_to_scalar_cost */
  3, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
  1, /* scalar_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_stmt_cost */
  2, /* vec_permute_cost */
  4, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  10, /* vec_align_load_cost */
  10, /* vec_unalign_load_cost */
  2, /* vec_unalign_store_cost */
  2, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost vulcan_vector_cost =
  6, /* scalar_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  6, /* vec_stmt_cost */
  3, /* vec_permute_cost */
  6, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  8, /* vec_align_load_cost */
  8, /* vec_unalign_load_cost */
  4, /* vec_unalign_store_cost */
  4, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
  2, /* Predictable.  */
  2  /* Unpredictable.  */

/* Branch costs for Cortex-A57.  */
static const struct cpu_branch_cost cortexa57_branch_cost =
  1, /* Predictable.  */
  3  /* Unpredictable.  */

/* Branch costs for Vulcan.  */
static const struct cpu_branch_cost vulcan_branch_cost =
  1, /* Predictable.  */
  3  /* Unpredictable.  */

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_NONE  /* recip_sqrt */

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_ALL,  /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */
static const struct tune_params generic_tunings =
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  8,  /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params cortexa35_tunings =
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  16, /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params cortexa53_tunings =
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  16, /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params cortexa57_tunings =
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  16, /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags.  */

static const struct tune_params cortexa72_tunings =
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  16, /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params cortexa73_tunings =
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  16, /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params exynosm1_tunings =
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
  4,  /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  48, /* max_case_values.  */
  64, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params thunderx_tunings =
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
  8,  /* function_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags.  */

static const struct tune_params xgene1_tunings =
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  16, /* function_align.  */
  16, /* loop_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  0,  /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params qdf24xx_tunings =
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
  16, /* function_align.  */
  16, /* loop_align.  */
  2,  /* int_reassoc_width.  */
  4,  /* fp_reassoc_width.  */
  1,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  64, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */

static const struct tune_params vulcan_tunings =
  &vulcan_addrcost_table,
  &vulcan_regmove_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  AARCH64_FUSE_NOTHING, /* fuseable_ops.  */
  16, /* function_align.  */
  16, /* loop_align.  */
  3,  /* int_reassoc_width.  */
  2,  /* fp_reassoc_width.  */
  2,  /* vec_reassoc_width.  */
  2,  /* min_div_recip_mul_sf.  */
  2,  /* min_div_recip_mul_df.  */
  0,  /* max_case_values.  */
  64, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
  void (*parse_override)(const char*, struct tune_params*);

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}

/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;

typedef enum aarch64_cond_code
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
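/* The condition codes above are laid out in inverse pairs, so flipping the
   low bit yields the inverse condition, e.g. AARCH64_EQ (0) <-> AARCH64_NE (1).  */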
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
/* Generate code to enable conditional branches in functions over 1 MiB.  */
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
  rtx_code_label * tmp_label = gen_label_rtx ();
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
  const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
  if (TARGET_GENERAL_REGS_ONLY)
    error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
    error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
   the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
   used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
   cost (in this case the best class is the lowest cost one).  Using ALL_REGS
   irrespectively of its cost results in bad allocations with many redundant
   int<->FP moves which are expensive on various cores.
   To avoid this we don't allow ALL_REGS as the allocno class, but force a
   decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
   isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
   Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
  enum machine_mode mode;

  if (allocno_class != ALL_REGS)
    return allocno_class;

  if (best_class != ALL_REGS)

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
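/* Return the current tuning's min_div_recip_mul threshold for MODE: the
   single-precision setting for 4-byte units, the double-precision setting
   otherwise.  */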
aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
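/* Return the reassociation width requested by the current tuning for MODE:
   the vector, integer or floating-point setting as appropriate.  */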
aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
			     enum machine_mode mode)
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  if (FLOAT_MODE_P (mode))
    return aarch64_tune_params.fp_reassoc_width;
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
aarch64_dbx_register_number (unsigned regno)
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
/* Return TRUE if MODE is any of the large INT modes.  */
aarch64_vect_struct_mode_p (machine_mode mode)
  return mode == OImode || mode == CImode || mode == XImode;

/* Return TRUE if MODE is any of the vector modes.  */
aarch64_vector_mode_p (machine_mode mode)
  return aarch64_vector_mode_supported_p (mode)
	 || aarch64_vect_struct_mode_p (mode);

/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
/* Implement HARD_REGNO_NREGS.  */
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
  switch (aarch64_regno_regclass (regno))
      return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
      return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
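/* For example, a 16-byte TImode value needs two X registers
   (16 / UNITS_PER_WORD) but only a single 128-bit V register
   (16 / UNITS_PER_VREG).  */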
/* Implement HARD_REGNO_MODE_OK.  */
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))

  if (FP_REGNUM_P (regno))
      if (aarch64_vect_struct_mode_p (mode))
	  (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1061 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
1064 /* Handle modes that fit within single registers. */
1065 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
1067 if (GET_MODE_SIZE (mode
) >= 4)
1072 /* Fall back to generic for multi-reg and very large modes. */
1074 return choose_hard_reg_mode (regno
, nregs
, false);
/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
aarch64_is_long_call_p (rtx sym)
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));

/* Return true if calls to symbol-ref SYM should not go through
aarch64_is_noplt_call_p (rtx sym)
  const_tree decl = SYMBOL_REF_DECL (sym);
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

	 && extract_val < GET_MODE_BITSIZE (mode)
	 && exact_log2 (extract_val & ~7) > 0
	 && (extract_val & 7) <= 4
	 && mult_val == (1 << (extract_val & 7)))
1139 known to be valid. */
1141 emit_set_insn (rtx x
, rtx y
)
1143 return emit_insn (gen_rtx_SET (x
, y
));
1146 /* X and Y are two things to compare using CODE. Emit the compare insn and
1147 return the rtx for register 0 in the proper mode. */
1149 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1151 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1152 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1154 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

aarch64_tls_get_addr (void)
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
  enum tls_model tls_kind = TLS_MODEL_NONE;

  if (GET_CODE (addr) == CONST)
      split_const (addr, &sym, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1190 /* We'll allow lo_sum's in addresses in our legitimate addresses
1191 so that combine would take care of combining addresses where
1192 necessary, but for generation purposes, we'll generate the address
1195 tmp = hi (symbol_ref); adrp x1, foo
1196 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1200 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1201 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1205 Load TLS symbol, depending on TLS mechanism and TLS access model.
1207 Global Dynamic - Traditional TLS:
1208 adrp tmp, :tlsgd:imm
1209 add dest, tmp, #:tlsgd_lo12:imm
1212 Global Dynamic - TLS Descriptors:
1213 adrp dest, :tlsdesc:imm
1214 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1215 add dest, dest, #:tlsdesc_lo12:imm
1222 adrp tmp, :gottprel:imm
1223 ldr dest, [tmp, #:gottprel_lo12:imm]
1228 add t0, tp, #:tprel_hi12:imm, lsl #12
1229 add t0, t0, #:tprel_lo12_nc:imm
1233 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1234 enum aarch64_symbol_type type
)
1238 case SYMBOL_SMALL_ABSOLUTE
:
1240 /* In ILP32, the mode of dest can be either SImode or DImode. */
1242 machine_mode mode
= GET_MODE (dest
);
1244 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1246 if (can_create_pseudo_p ())
1247 tmp_reg
= gen_reg_rtx (mode
);
1249 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1250 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1254 case SYMBOL_TINY_ABSOLUTE
:
1255 emit_insn (gen_rtx_SET (dest
, imm
));
1258 case SYMBOL_SMALL_GOT_28K
:
1260 machine_mode mode
= GET_MODE (dest
);
1261 rtx gp_rtx
= pic_offset_table_rtx
;
1265 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1266 here before rtl expand. Tree IVOPT will generate rtl pattern to
1267 decide rtx costs, in which case pic_offset_table_rtx is not
1268 initialized. For that case no need to generate the first adrp
1269 instruction as the final cost for global variable access is
1273 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1274 using the page base as GOT base, the first page may be wasted,
1275 in the worst scenario, there is only 28K space for GOT).
1277 The generate instruction sequence for accessing global variable
1280 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1282 Only one instruction needed. But we must initialize
1283 pic_offset_table_rtx properly. We generate initialize insn for
1284 every global access, and allow CSE to remove all redundant.
1286 The final instruction sequences will look like the following
1287 for multiply global variables access.
1289 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1291 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1292 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1293 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1296 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1297 crtl
->uses_pic_offset_table
= 1;
1298 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1300 if (mode
!= GET_MODE (gp_rtx
))
1301 gp_rtx
= simplify_gen_subreg (mode
, gp_rtx
, GET_MODE (gp_rtx
), 0);
1304 if (mode
== ptr_mode
)
1307 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1309 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1311 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1315 gcc_assert (mode
== Pmode
);
1317 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1318 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1321 /* The operand is expected to be MEM. Whenever the related insn
1322 pattern changed, above code which calculate mem should be
1324 gcc_assert (GET_CODE (mem
) == MEM
);
1325 MEM_READONLY_P (mem
) = 1;
1326 MEM_NOTRAP_P (mem
) = 1;
1331 case SYMBOL_SMALL_GOT_4G
:
1333 /* In ILP32, the mode of dest can be either SImode or DImode,
1334 while the got entry is always of SImode size. The mode of
1335 dest depends on how dest is used: if dest is assigned to a
1336 pointer (e.g. in the memory), it has SImode; it may have
1337 DImode if dest is dereferenced to access the memeory.
1338 This is why we have to handle three different ldr_got_small
1339 patterns here (two patterns for ILP32). */
1344 machine_mode mode
= GET_MODE (dest
);
1346 if (can_create_pseudo_p ())
1347 tmp_reg
= gen_reg_rtx (mode
);
1349 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1350 if (mode
== ptr_mode
)
1353 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1355 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1357 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1361 gcc_assert (mode
== Pmode
);
1363 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1364 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1367 gcc_assert (GET_CODE (mem
) == MEM
);
1368 MEM_READONLY_P (mem
) = 1;
1369 MEM_NOTRAP_P (mem
) = 1;
1374 case SYMBOL_SMALL_TLSGD
:
1377 rtx result
= gen_rtx_REG (Pmode
, R0_REGNUM
);
1380 aarch64_emit_call_insn (gen_tlsgd_small (result
, imm
));
1381 insns
= get_insns ();
1384 RTL_CONST_CALL_P (insns
) = 1;
1385 emit_libcall_block (insns
, dest
, result
, imm
);
1389 case SYMBOL_SMALL_TLSDESC
:
1391 machine_mode mode
= GET_MODE (dest
);
1392 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1395 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1397 /* In ILP32, the got entry is always of SImode size. Unlike
1398 small GOT, the dest is fixed at reg 0. */
1400 emit_insn (gen_tlsdesc_small_si (imm
));
1402 emit_insn (gen_tlsdesc_small_di (imm
));
1403 tp
= aarch64_load_tp (NULL
);
1406 tp
= gen_lowpart (mode
, tp
);
1408 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1409 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1413 case SYMBOL_SMALL_TLSIE
:
1415 /* In ILP32, the mode of dest can be either SImode or DImode,
1416 while the got entry is always of SImode size. The mode of
1417 dest depends on how dest is used: if dest is assigned to a
1418 pointer (e.g. in the memory), it has SImode; it may have
1419 DImode if dest is dereferenced to access the memeory.
1420 This is why we have to handle three different tlsie_small
1421 patterns here (two patterns for ILP32). */
1422 machine_mode mode
= GET_MODE (dest
);
1423 rtx tmp_reg
= gen_reg_rtx (mode
);
1424 rtx tp
= aarch64_load_tp (NULL
);
1426 if (mode
== ptr_mode
)
1429 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1432 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1433 tp
= gen_lowpart (mode
, tp
);
1438 gcc_assert (mode
== Pmode
);
1439 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1442 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1443 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1447 case SYMBOL_TLSLE12
:
1448 case SYMBOL_TLSLE24
:
1449 case SYMBOL_TLSLE32
:
1450 case SYMBOL_TLSLE48
:
1452 machine_mode mode
= GET_MODE (dest
);
1453 rtx tp
= aarch64_load_tp (NULL
);
1456 tp
= gen_lowpart (mode
, tp
);
1460 case SYMBOL_TLSLE12
:
1461 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1464 case SYMBOL_TLSLE24
:
1465 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1468 case SYMBOL_TLSLE32
:
1469 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1471 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1474 case SYMBOL_TLSLE48
:
1475 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1477 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1484 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1488 case SYMBOL_TINY_GOT
:
1489 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1492 case SYMBOL_TINY_TLSIE
:
1494 machine_mode mode
= GET_MODE (dest
);
1495 rtx tp
= aarch64_load_tp (NULL
);
1497 if (mode
== ptr_mode
)
1500 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1503 tp
= gen_lowpart (mode
, tp
);
1504 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1509 gcc_assert (mode
== Pmode
);
1510 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1513 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1522 /* Emit a move from SRC to DEST. Assume that the move expanders can
1523 handle all moves if !can_create_pseudo_p (). The distinction is
1524 important because, unlike emit_move_insn, the move expanders know
1525 how to force Pmode objects into the constant pool even when the
1526 constant pool address is not itself legitimate. */
1528 aarch64_emit_move (rtx dest
, rtx src
)
1530 return (can_create_pseudo_p ()
1531 ? emit_move_insn (dest
, src
)
1532 : emit_move_insn_1 (dest
, src
));
1535 /* Split a 128-bit move operation into two 64-bit move operations,
1536 taking care to handle partial overlap of register to register
1537 copies. Special cases are needed when moving between GP regs and
1538 FP regs. SRC can be a register, constant or memory; DST a register
1539 or memory. If either operand is memory it must not have any side
1542 aarch64_split_128bit_move (rtx dst
, rtx src
)
1547 machine_mode mode
= GET_MODE (dst
);
1549 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1550 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1551 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1553 if (REG_P (dst
) && REG_P (src
))
1555 int src_regno
= REGNO (src
);
1556 int dst_regno
= REGNO (dst
);
1558 /* Handle FP <-> GP regs. */
1559 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1561 src_lo
= gen_lowpart (word_mode
, src
);
1562 src_hi
= gen_highpart (word_mode
, src
);
1566 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1567 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1571 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1572 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1576 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1578 dst_lo
= gen_lowpart (word_mode
, dst
);
1579 dst_hi
= gen_highpart (word_mode
, dst
);
1583 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1584 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1588 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1589 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1595 dst_lo
= gen_lowpart (word_mode
, dst
);
1596 dst_hi
= gen_highpart (word_mode
, dst
);
1597 src_lo
= gen_lowpart (word_mode
, src
);
1598 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1600 /* At most one pairing may overlap. */
1601 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1603 aarch64_emit_move (dst_hi
, src_hi
);
1604 aarch64_emit_move (dst_lo
, src_lo
);
1608 aarch64_emit_move (dst_lo
, src_lo
);
1609 aarch64_emit_move (dst_hi
, src_hi
);
1614 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1616 return (! REG_P (src
)
1617 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1620 /* Split a complex SIMD combine. */
1623 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1625 machine_mode src_mode
= GET_MODE (src1
);
1626 machine_mode dst_mode
= GET_MODE (dst
);
1628 gcc_assert (VECTOR_MODE_P (dst_mode
));
1630 if (REG_P (dst
) && REG_P (src1
) && REG_P (src2
))
1632 rtx (*gen
) (rtx
, rtx
, rtx
);
1637 gen
= gen_aarch64_simd_combinev8qi
;
1640 gen
= gen_aarch64_simd_combinev4hi
;
1643 gen
= gen_aarch64_simd_combinev2si
;
1646 gen
= gen_aarch64_simd_combinev4hf
;
1649 gen
= gen_aarch64_simd_combinev2sf
;
1652 gen
= gen_aarch64_simd_combinedi
;
1655 gen
= gen_aarch64_simd_combinedf
;
1661 emit_insn (gen (dst
, src1
, src2
));
1666 /* Split a complex SIMD move. */
1669 aarch64_split_simd_move (rtx dst
, rtx src
)
1671 machine_mode src_mode
= GET_MODE (src
);
1672 machine_mode dst_mode
= GET_MODE (dst
);
1674 gcc_assert (VECTOR_MODE_P (dst_mode
));
1676 if (REG_P (dst
) && REG_P (src
))
1678 rtx (*gen
) (rtx
, rtx
);
1680 gcc_assert (VECTOR_MODE_P (src_mode
));
1685 gen
= gen_aarch64_split_simd_movv16qi
;
1688 gen
= gen_aarch64_split_simd_movv8hi
;
1691 gen
= gen_aarch64_split_simd_movv4si
;
1694 gen
= gen_aarch64_split_simd_movv2di
;
1697 gen
= gen_aarch64_split_simd_movv8hf
;
1700 gen
= gen_aarch64_split_simd_movv4sf
;
1703 gen
= gen_aarch64_split_simd_movv2df
;
1709 emit_insn (gen (dst
, src
));
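/* Return true if X (in XMODE) is equal to Y (in YMODE) zero-extended
   to XMODE.  */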
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
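/* Put VALUE into a register and return that register.  Use X as the
   destination when a new pseudo cannot be created.  */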
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
  if (can_create_pseudo_p ())
    return force_reg (mode, value);

    x = aarch64_emit_move (x, value);
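/* Return an expression for REG + OFFSET in MODE, first forcing OFFSET into
   a temporary register (via TEMP) when it is not a valid immediate for an
   add instruction.  */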
aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
  if (!aarch64_plus_immediate (GEN_INT (offset), mode))
      /* Load the full offset into a register.  This
         might be improvable in the future.  */
      high = GEN_INT (offset);
      high = aarch64_force_temporary (mode, temp, high);
      reg = aarch64_force_temporary (mode, temp,
				     gen_rtx_PLUS (mode, high, reg));
  return plus_constant (mode, reg, offset);
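/* Set DEST to the immediate IMM using MOV/MOVK sequences or a bitmask
   immediate as appropriate; insns are only emitted when GENERATE is true.  */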
1755 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1759 unsigned HOST_WIDE_INT val
, val2
, mask
;
1760 int one_match
, zero_match
;
1765 if (aarch64_move_imm (val
, mode
))
1768 emit_insn (gen_rtx_SET (dest
, imm
));
1772 if ((val
>> 32) == 0 || mode
== SImode
)
1776 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
1778 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1779 GEN_INT ((val
>> 16) & 0xffff)));
1781 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
1782 GEN_INT ((val
>> 16) & 0xffff)));
1787 /* Remaining cases are all for DImode. */
1790 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
1791 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
1792 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
1793 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
1795 if (zero_match
!= 2 && one_match
!= 2)
1797 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1798 For a 64-bit bitmask try whether changing 16 bits to all ones or
1799 zeroes creates a valid bitmask. To check any repeated bitmask,
1800 try using 16 bits from the other 32-bit half of val. */
1802 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1805 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1808 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1810 val2
= val2
& ~mask
;
1811 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
1812 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1819 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1820 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1821 GEN_INT ((val
>> i
) & 0xffff)));
1827 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1828 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1829 otherwise skip zero bits. */
1833 val2
= one_match
> zero_match
? ~val
: val
;
1834 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
1837 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
1838 ? (val
| ~(mask
<< i
))
1839 : (val
& (mask
<< i
)))));
1840 for (i
+= 16; i
< 64; i
+= 16)
1842 if ((val2
& (mask
<< i
)) == 0)
1845 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1846 GEN_INT ((val
>> i
) & 0xffff)));
1855 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1857 machine_mode mode
= GET_MODE (dest
);
1859 gcc_assert (mode
== SImode
|| mode
== DImode
);
1861 /* Check on what type of symbol it is. */
1862 if (GET_CODE (imm
) == SYMBOL_REF
1863 || GET_CODE (imm
) == LABEL_REF
1864 || GET_CODE (imm
) == CONST
)
1866 rtx mem
, base
, offset
;
1867 enum aarch64_symbol_type sty
;
1869 /* If we have (const (plus symbol offset)), separate out the offset
1870 before we start classifying the symbol. */
1871 split_const (imm
, &base
, &offset
);
1873 sty
= aarch64_classify_symbol (base
, offset
);
1876 case SYMBOL_FORCE_TO_MEM
:
1877 if (offset
!= const0_rtx
1878 && targetm
.cannot_force_const_mem (mode
, imm
))
1880 gcc_assert (can_create_pseudo_p ());
1881 base
= aarch64_force_temporary (mode
, dest
, base
);
1882 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1883 aarch64_emit_move (dest
, base
);
1887 mem
= force_const_mem (ptr_mode
, imm
);
1890 /* If we aren't generating PC relative literals, then
1891 we need to expand the literal pool access carefully.
1892 This is something that needs to be done in a number
1893 of places, so could well live as a separate function. */
1894 if (!aarch64_pcrelative_literal_loads
)
1896 gcc_assert (can_create_pseudo_p ());
1897 base
= gen_reg_rtx (ptr_mode
);
1898 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
1899 mem
= gen_rtx_MEM (ptr_mode
, base
);
1902 if (mode
!= ptr_mode
)
1903 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
1905 emit_insn (gen_rtx_SET (dest
, mem
));
1909 case SYMBOL_SMALL_TLSGD
:
1910 case SYMBOL_SMALL_TLSDESC
:
1911 case SYMBOL_SMALL_TLSIE
:
1912 case SYMBOL_SMALL_GOT_28K
:
1913 case SYMBOL_SMALL_GOT_4G
:
1914 case SYMBOL_TINY_GOT
:
1915 case SYMBOL_TINY_TLSIE
:
1916 if (offset
!= const0_rtx
)
1918 gcc_assert(can_create_pseudo_p ());
1919 base
= aarch64_force_temporary (mode
, dest
, base
);
1920 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1921 aarch64_emit_move (dest
, base
);
1926 case SYMBOL_SMALL_ABSOLUTE
:
1927 case SYMBOL_TINY_ABSOLUTE
:
1928 case SYMBOL_TLSLE12
:
1929 case SYMBOL_TLSLE24
:
1930 case SYMBOL_TLSLE32
:
1931 case SYMBOL_TLSLE48
:
1932 aarch64_load_symref_appropriately (dest
, imm
, sty
);
1940 if (!CONST_INT_P (imm
))
1942 if (GET_CODE (imm
) == HIGH
)
1943 emit_insn (gen_rtx_SET (dest
, imm
));
1946 rtx mem
= force_const_mem (mode
, imm
);
1948 emit_insn (gen_rtx_SET (dest
, mem
));
1954 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
1957 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1958 temporary value if necessary. FRAME_RELATED_P should be true if
1959 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1960 to the generated instructions. If SCRATCHREG is known to hold
1961 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1964 Since this function may be used to adjust the stack pointer, we must
1965 ensure that it cannot cause transient stack deallocation (for example
1966 by first incrementing SP and then decrementing when adjusting by a
1967 large immediate). */
1970 aarch64_add_constant_internal (machine_mode mode
, int regnum
, int scratchreg
,
1971 HOST_WIDE_INT delta
, bool frame_related_p
,
1974 HOST_WIDE_INT mdelta
= abs_hwi (delta
);
1975 rtx this_rtx
= gen_rtx_REG (mode
, regnum
);
1981 /* Single instruction adjustment. */
1982 if (aarch64_uimm12_shift (mdelta
))
1984 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
)));
1985 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
1989 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
1990 Only do this if mdelta is not a 16-bit move as adjusting using a move
1992 if (mdelta
< 0x1000000 && !aarch64_move_imm (mdelta
, mode
))
1994 HOST_WIDE_INT low_off
= mdelta
& 0xfff;
1996 low_off
= delta
< 0 ? -low_off
: low_off
;
1997 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (low_off
)));
1998 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
1999 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
- low_off
)));
2000 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2004 /* Emit a move immediate if required and an addition/subtraction. */
2005 rtx scratch_rtx
= gen_rtx_REG (mode
, scratchreg
);
2007 aarch64_internal_mov_immediate (scratch_rtx
, GEN_INT (mdelta
), true, mode
);
2008 insn
= emit_insn (delta
< 0 ? gen_sub2_insn (this_rtx
, scratch_rtx
)
2009 : gen_add2_insn (this_rtx
, scratch_rtx
));
2010 if (frame_related_p
)
2012 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2013 rtx adj
= plus_constant (mode
, this_rtx
, delta
);
2014 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (this_rtx
, adj
));
2019 aarch64_add_constant (machine_mode mode
, int regnum
, int scratchreg
,
2020 HOST_WIDE_INT delta
)
2022 aarch64_add_constant_internal (mode
, regnum
, scratchreg
, delta
, false, true);
2026 aarch64_add_sp (int scratchreg
, HOST_WIDE_INT delta
, bool emit_move_imm
)
2028 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, delta
,
2029 true, emit_move_imm
);
2033 aarch64_sub_sp (int scratchreg
, HOST_WIDE_INT delta
, bool frame_related_p
)
2035 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, -delta
,
2036 frame_related_p
, true);
2040 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
2041 tree exp ATTRIBUTE_UNUSED
)
2043 /* Currently, always true. */
2047 /* Implement TARGET_PASS_BY_REFERENCE. */
2050 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
2053 bool named ATTRIBUTE_UNUSED
)
2056 machine_mode dummymode
;
2059 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2060 size
= (mode
== BLKmode
&& type
)
2061 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
2063 /* Aggregates are passed by reference based on their size. */
2064 if (type
&& AGGREGATE_TYPE_P (type
))
2066 size
= int_size_in_bytes (type
);
2069 /* Variable sized arguments are always returned by reference. */
2073 /* Can this be a candidate to be passed in fp/simd register(s)? */
2074 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2079 /* Arguments which are variable sized or larger than 2 registers are
2080 passed by reference unless they are a homogenous floating point
2082 return size
> 2 * UNITS_PER_WORD
;
2085 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2087 aarch64_return_in_msb (const_tree valtype
)
2089 machine_mode dummy_mode
;
2092 /* Never happens in little-endian mode. */
2093 if (!BYTES_BIG_ENDIAN
)
2096 /* Only composite types smaller than or equal to 16 bytes can
2097 be potentially returned in registers. */
2098 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
2099 || int_size_in_bytes (valtype
) <= 0
2100 || int_size_in_bytes (valtype
) > 16)
2103 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2104 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2105 is always passed/returned in the least significant bits of fp/simd
2107 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
2108 &dummy_mode
, &dummy_int
, NULL
))
2114 /* Implement TARGET_FUNCTION_VALUE.
2115 Define how to find the value returned by a function. */
2118 aarch64_function_value (const_tree type
, const_tree func
,
2119 bool outgoing ATTRIBUTE_UNUSED
)
2124 machine_mode ag_mode
;
2126 mode
= TYPE_MODE (type
);
2127 if (INTEGRAL_TYPE_P (type
))
2128 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
2130 if (aarch64_return_in_msb (type
))
2132 HOST_WIDE_INT size
= int_size_in_bytes (type
);
2134 if (size
% UNITS_PER_WORD
!= 0)
2136 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
2137 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
2141 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2142 &ag_mode
, &count
, NULL
))
2144 if (!aarch64_composite_type_p (type
, mode
))
2146 gcc_assert (count
== 1 && mode
== ag_mode
);
2147 return gen_rtx_REG (mode
, V0_REGNUM
);
2154 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
2155 for (i
= 0; i
< count
; i
++)
2157 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
2158 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2159 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
2160 XVECEXP (par
, 0, i
) = tmp
;
2166 return gen_rtx_REG (mode
, R0_REGNUM
);
2169 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2170 Return true if REGNO is the number of a hard register in which the values
2171 of called function may come back. */
2174 aarch64_function_value_regno_p (const unsigned int regno
)
2176 /* Maximum of 16 bytes can be returned in the general registers. Examples
2177 of 16-byte return values are: 128-bit integers and 16-byte small
2178 structures (excluding homogeneous floating-point aggregates). */
2179 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
2182 /* Up to four fp/simd registers can return a function value, e.g. a
2183 homogeneous floating-point aggregate having four members. */
2184 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
2185 return TARGET_FLOAT
;
2190 /* Implement TARGET_RETURN_IN_MEMORY.
2192 If the type T of the result of a function is such that
2194 would require that arg be passed as a value in a register (or set of
2195 registers) according to the parameter passing rules, then the result
2196 is returned in the same registers as would be used for such an
2200 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
2203 machine_mode ag_mode
;
2206 if (!AGGREGATE_TYPE_P (type
)
2207 && TREE_CODE (type
) != COMPLEX_TYPE
2208 && TREE_CODE (type
) != VECTOR_TYPE
)
2209 /* Simple scalar types always returned in registers. */
2212 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
2219 /* Types larger than 2 registers returned in memory. */
2220 size
= int_size_in_bytes (type
);
2221 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
2225 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
2226 const_tree type
, int *nregs
)
2228 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2229 return aarch64_vfp_is_call_or_return_candidate (mode
,
2231 &pcum
->aapcs_vfp_rmode
,
2236 /* Given MODE and TYPE of a function argument, return the alignment in
2237 bits. The idea is to suppress any stronger alignment requested by
2238 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2239 This is a helper function for local use only. */
2242 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
2245 return GET_MODE_ALIGNMENT (mode
);
2246 if (integer_zerop (TYPE_SIZE (type
)))
2249 gcc_assert (TYPE_MODE (type
) == mode
);
2251 if (!AGGREGATE_TYPE_P (type
))
2252 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
2254 if (TREE_CODE (type
) == ARRAY_TYPE
)
2255 return TYPE_ALIGN (TREE_TYPE (type
));
2257 unsigned int alignment
= 0;
2259 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
2260 alignment
= std::max (alignment
, DECL_ALIGN (field
));
2265 /* Layout a function argument according to the AAPCS64 rules. The rule
2266 numbers refer to the rule numbers in the AAPCS64. */
2269 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2271 bool named ATTRIBUTE_UNUSED
)
2273 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2274 int ncrn
, nvrn
, nregs
;
2275 bool allocate_ncrn
, allocate_nvrn
;
2278 /* We need to do this once per argument. */
2279 if (pcum
->aapcs_arg_processed
)
2282 pcum
->aapcs_arg_processed
= true;
2284 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2286 = ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
2289 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
2290 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
2295 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2296 The following code thus handles passing by SIMD/FP registers first. */
2298 nvrn
= pcum
->aapcs_nvrn
;
2300 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2301 and homogenous short-vector aggregates (HVA). */
2305 aarch64_err_no_fpadvsimd (mode
, "argument");
2307 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
2309 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
2310 if (!aarch64_composite_type_p (type
, mode
))
2312 gcc_assert (nregs
== 1);
2313 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
2319 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2320 for (i
= 0; i
< nregs
; i
++)
2322 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
2323 V0_REGNUM
+ nvrn
+ i
);
2324 tmp
= gen_rtx_EXPR_LIST
2326 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
2327 XVECEXP (par
, 0, i
) = tmp
;
2329 pcum
->aapcs_reg
= par
;
2335 /* C.3 NSRN is set to 8. */
2336 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
2341 ncrn
= pcum
->aapcs_ncrn
;
2342 nregs
= size
/ UNITS_PER_WORD
;
  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
2347 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
2349 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
2351 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
2353 /* C.8 if the argument has an alignment of 16 then the NGRN is
2354 rounded up to the next even number. */
2355 if (nregs
== 2 && alignment
== 16 * BITS_PER_UNIT
&& ncrn
% 2)
2358 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
2360 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2361 A reg is still generated for it, but the caller should be smart
2362 enough not to use it. */
2363 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
2365 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
2372 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2373 for (i
= 0; i
< nregs
; i
++)
2375 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
2376 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2377 GEN_INT (i
* UNITS_PER_WORD
));
2378 XVECEXP (par
, 0, i
) = tmp
;
2380 pcum
->aapcs_reg
= par
;
2383 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
2388 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
2390 /* The argument is passed on stack; record the needed number of words for
2391 this argument and align the total size if necessary. */
2393 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
2394 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2395 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
2396 16 / UNITS_PER_WORD
);
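/* Worked example (added for illustration, not part of the original source):
   with NGRN == 1 (x0 already allocated), a 16-byte composite requiring
   16-byte alignment needs nregs == 2, so rule C.8 above rounds NGRN up to
   the next even number and the argument is passed in x2/x3; likewise, when
   such an argument overflows onto the stack, the accumulated stack size
   (counted in words) is rounded up to an even number of words by the final
   ROUND_UP.  */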
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
		      const_tree type, bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);

  if (mode == VOIDmode)
    return NULL_RTX;

  aarch64_layout_arg (pcum_v, mode, type, named);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
			      const_tree fntype ATTRIBUTE_UNUSED,
			      rtx libname ATTRIBUTE_UNUSED,
			      const_tree fndecl ATTRIBUTE_UNUSED,
			      unsigned n_named ATTRIBUTE_UNUSED)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;

  if (!TARGET_FLOAT
      && fndecl && TREE_PUBLIC (fndecl)
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
						   &mode, &nregs, NULL))
	aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
    }
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
			      machine_mode mode,
			      const_tree type,
			      bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64)
    {
      aarch64_layout_arg (pcum_v, mode, type, named);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
		  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);

  if (alignment < PARM_BOUNDARY)
    alignment = PARM_BOUNDARY;
  if (alignment > STACK_BOUNDARY)
    alignment = STACK_BOUNDARY;
  return alignment;
}
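/* Example (added for illustration): on AArch64 PARM_BOUNDARY is 64 and
   STACK_BOUNDARY is 128, so a char argument is still reported as 64-bit
   aligned while a 256-bit over-aligned type is clamped down to 128 bits.  */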
/* For use by FUNCTION_ARG_PADDING (MODE, TYPE).

   Return true if an argument passed on the stack should be padded upwards,
   i.e. if the least-significant byte of the stack slot has useful data.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

bool
aarch64_pad_arg_upward (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return true;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
	 || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return false;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return true;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (and possibly the only)
   element of a block move between registers and memory.  Assuming
   the block is in memory, padding upward means that the last
   element is padded after its most significant byte, while with
   downward padding the last element is padded on its least
   significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */
bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
			bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
			    : GET_MODE_SIZE (mode));
      if (size < 2 * UNITS_PER_WORD)
	return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
2565 aarch64_libgcc_cmp_return_mode (void)
2570 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2572 /* We use the 12-bit shifted immediate arithmetic instructions so values
2573 must be multiple of (1 << 12), i.e. 4096. */
2574 #define ARITH_FACTOR 4096
2576 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2577 #error Cannot use simple address calculation for stack probing
2580 /* The pair of scratch registers used for stack probing. */
2581 #define PROBE_STACK_FIRST_REG 9
2582 #define PROBE_STACK_SECOND_REG 10
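/* Note (added for clarity): hard registers 9 and 10 are x9 and x10, which
   the AAPCS64 treats as caller-saved temporaries, so they are safe to
   clobber while probing in the prologue.  */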
2584 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2585 inclusive. These are offsets from the current stack pointer. */
2588 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, HOST_WIDE_INT size
)
2590 rtx reg1
= gen_rtx_REG (ptr_mode
, PROBE_STACK_FIRST_REG
);
2592 /* See the same assertion on PROBE_INTERVAL above. */
2593 gcc_assert ((first
% ARITH_FACTOR
) == 0);
2595 /* See if we have a constant small number of probes to generate. If so,
2596 that's the easy case. */
2597 if (size
<= PROBE_INTERVAL
)
2599 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
2601 emit_set_insn (reg1
,
2602 plus_constant (ptr_mode
,
2603 stack_pointer_rtx
, -(first
+ base
)));
2604 emit_stack_probe (plus_constant (ptr_mode
, reg1
, base
- size
));
2607 /* The run-time loop is made up of 8 insns in the generic case while the
2608 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2609 else if (size
<= 4 * PROBE_INTERVAL
)
2611 HOST_WIDE_INT i
, rem
;
2613 emit_set_insn (reg1
,
2614 plus_constant (ptr_mode
,
2616 -(first
+ PROBE_INTERVAL
)));
2617 emit_stack_probe (reg1
);
2619 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2620 it exceeds SIZE. If only two probes are needed, this will not
2621 generate any code. Then probe at FIRST + SIZE. */
2622 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
2624 emit_set_insn (reg1
,
2625 plus_constant (ptr_mode
, reg1
, -PROBE_INTERVAL
));
2626 emit_stack_probe (reg1
);
2629 rem
= size
- (i
- PROBE_INTERVAL
);
2632 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2634 emit_set_insn (reg1
, plus_constant (ptr_mode
, reg1
, -base
));
2635 emit_stack_probe (plus_constant (ptr_mode
, reg1
, base
- rem
));
2638 emit_stack_probe (plus_constant (ptr_mode
, reg1
, -rem
));
2641 /* Otherwise, do the same as above, but in a loop. Note that we must be
2642 extra careful with variables wrapping around because we might be at
2643 the very top (or the very bottom) of the address space and we have
2644 to be able to handle this case properly; in particular, we use an
2645 equality test for the loop condition. */
2648 rtx reg2
= gen_rtx_REG (ptr_mode
, PROBE_STACK_SECOND_REG
);
2650 /* Step 1: round SIZE to the previous multiple of the interval. */
2652 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
2655 /* Step 2: compute initial and final value of the loop counter. */
2657 /* TEST_ADDR = SP + FIRST. */
2658 emit_set_insn (reg1
,
2659 plus_constant (ptr_mode
, stack_pointer_rtx
, -first
));
2661 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2662 emit_set_insn (reg2
,
2663 plus_constant (ptr_mode
, stack_pointer_rtx
,
2664 -(first
+ rounded_size
)));
2671 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2674 while (TEST_ADDR != LAST_ADDR)
2676 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2677 until it is equal to ROUNDED_SIZE. */
2679 if (ptr_mode
== DImode
)
2680 emit_insn (gen_probe_stack_range_di (reg1
, reg1
, reg2
));
2682 emit_insn (gen_probe_stack_range_si (reg1
, reg1
, reg2
));
2685 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2686 that SIZE is equal to ROUNDED_SIZE. */
2688 if (size
!= rounded_size
)
2690 HOST_WIDE_INT rem
= size
- rounded_size
;
2694 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2696 emit_set_insn (reg2
, plus_constant (ptr_mode
, reg2
, -base
));
2697 emit_stack_probe (plus_constant (ptr_mode
, reg2
, base
- rem
));
2700 emit_stack_probe (plus_constant (ptr_mode
, reg2
, -rem
));
2704 /* Make sure nothing is scheduled before we are done. */
2705 emit_insn (gen_blockage ());
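/* Worked example (added for illustration, assuming the default 4 KiB
   PROBE_INTERVAL): for first == 0 and size == 10000 the second branch
   above applies (size <= 4 * PROBE_INTERVAL); probes are emitted 4096
   and 8192 bytes below the incoming stack pointer, and a final probe
   covers the remaining 1808 bytes, with ARITH_FACTOR rounding keeping
   every adjustment encodable as a 12-bit shifted immediate.  */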
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  xops[1] = GEN_INT (PROBE_INTERVAL);
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at TEST_ADDR.  */
  output_asm_insn ("str\txzr, [%0]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
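/* The loop emitted above looks roughly like this (illustrative only,
   assuming reg1 is x9, reg2 is x10 and the default 4 KiB interval):

	.LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0
*/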
static bool
aarch64_frame_pointer_required (void)
{
  /* In aarch64_override_options_after_change
     flag_omit_leaf_frame_pointer turns off the frame pointer by
     default.  Turn it back on now if we've not got a leaf
     function.  */
  if (flag_omit_leaf_frame_pointer
      && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
    return true;

  return false;
}
2757 /* Mark the registers that need to be saved by the callee and calculate
2758 the size of the callee-saved registers area and frame record (both FP
2759 and LR may be omitted). */
2761 aarch64_layout_frame (void)
2763 HOST_WIDE_INT offset
= 0;
2764 int regno
, last_fp_reg
= INVALID_REGNUM
;
2766 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2769 #define SLOT_NOT_REQUIRED (-2)
2770 #define SLOT_REQUIRED (-1)
2772 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
2773 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
2775 /* First mark all the registers that really need to be saved... */
2776 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2777 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2779 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2780 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2782 /* ... that includes the eh data registers (if needed)... */
2783 if (crtl
->calls_eh_return
)
2784 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2785 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2788 /* ... and any callee saved register that dataflow says is live. */
2789 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2790 if (df_regs_ever_live_p (regno
)
2791 && (regno
== R30_REGNUM
2792 || !call_used_regs
[regno
]))
2793 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2795 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2796 if (df_regs_ever_live_p (regno
)
2797 && !call_used_regs
[regno
])
2799 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2800 last_fp_reg
= regno
;
2803 if (frame_pointer_needed
)
2805 /* FP and LR are placed in the linkage record. */
2806 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2807 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2808 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2809 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2810 offset
+= 2 * UNITS_PER_WORD
;
2813 /* Now assign stack slots for them. */
2814 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2815 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2817 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2818 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2819 cfun
->machine
->frame
.wb_candidate1
= regno
;
2820 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
2821 cfun
->machine
->frame
.wb_candidate2
= regno
;
2822 offset
+= UNITS_PER_WORD
;
2825 HOST_WIDE_INT max_int_offset
= offset
;
2826 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2827 bool has_align_gap
= offset
!= max_int_offset
;
2829 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2830 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2832 /* If there is an alignment gap between integer and fp callee-saves,
2833 allocate the last fp register to it if possible. */
2834 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
2836 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
2840 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2841 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2842 cfun
->machine
->frame
.wb_candidate1
= regno
;
2843 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
2844 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2845 cfun
->machine
->frame
.wb_candidate2
= regno
;
2846 offset
+= UNITS_PER_WORD
;
2849 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2851 cfun
->machine
->frame
.saved_regs_size
= offset
;
2853 HOST_WIDE_INT varargs_and_saved_regs_size
2854 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
2856 cfun
->machine
->frame
.hard_fp_offset
2857 = ROUND_UP (varargs_and_saved_regs_size
+ get_frame_size (),
2858 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2860 cfun
->machine
->frame
.frame_size
2861 = ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2862 + crtl
->outgoing_args_size
,
2863 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2865 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
2867 cfun
->machine
->frame
.initial_adjust
= 0;
2868 cfun
->machine
->frame
.final_adjust
= 0;
2869 cfun
->machine
->frame
.callee_adjust
= 0;
2870 cfun
->machine
->frame
.callee_offset
= 0;
2872 HOST_WIDE_INT max_push_offset
= 0;
2873 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
2874 max_push_offset
= 512;
2875 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
2876 max_push_offset
= 256;
2878 if (cfun
->machine
->frame
.frame_size
< max_push_offset
2879 && crtl
->outgoing_args_size
== 0)
2881 /* Simple, small frame with no outgoing arguments:
2882 stp reg1, reg2, [sp, -frame_size]!
2883 stp reg3, reg4, [sp, 16] */
2884 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.frame_size
;
2886 else if ((crtl
->outgoing_args_size
2887 + cfun
->machine
->frame
.saved_regs_size
< 512)
2888 && !(cfun
->calls_alloca
2889 && cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
))
2891 /* Frame with small outgoing arguments:
2892 sub sp, sp, frame_size
2893 stp reg1, reg2, [sp, outgoing_args_size]
2894 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2895 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
2896 cfun
->machine
->frame
.callee_offset
2897 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
2899 else if (cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
)
2901 /* Frame with large outgoing arguments but a small local area:
2902 stp reg1, reg2, [sp, -hard_fp_offset]!
2903 stp reg3, reg4, [sp, 16]
2904 sub sp, sp, outgoing_args_size */
2905 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
2906 cfun
->machine
->frame
.final_adjust
2907 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
2909 else if (!frame_pointer_needed
2910 && varargs_and_saved_regs_size
< max_push_offset
)
2912 /* Frame with large local area and outgoing arguments (this pushes the
2913 callee-saves first, followed by the locals and outgoing area):
2914 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2915 stp reg3, reg4, [sp, 16]
2916 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2917 cfun
->machine
->frame
.callee_adjust
= varargs_and_saved_regs_size
;
2918 cfun
->machine
->frame
.final_adjust
2919 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
2920 cfun
->machine
->frame
.hard_fp_offset
= cfun
->machine
->frame
.callee_adjust
;
2921 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.hard_fp_offset
;
2925 /* Frame with large local area and outgoing arguments using frame pointer:
2926 sub sp, sp, hard_fp_offset
2927 stp x29, x30, [sp, 0]
2929 stp reg3, reg4, [sp, 16]
2930 sub sp, sp, outgoing_args_size */
2931 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
2932 cfun
->machine
->frame
.final_adjust
2933 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
2936 cfun
->machine
->frame
.laid_out
= true;
2939 /* Return true if the register REGNO is saved on entry to
2940 the current function. */
2943 aarch64_register_saved_on_entry (int regno
)
2945 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
2959 /* Push the register number REGNO of mode MODE to the stack with write-back
2960 adjusting the stack by ADJUSTMENT. */
2963 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
2964 HOST_WIDE_INT adjustment
)
2966 rtx base_rtx
= stack_pointer_rtx
;
2969 reg
= gen_rtx_REG (mode
, regno
);
2970 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
2971 plus_constant (Pmode
, base_rtx
, -adjustment
));
2972 mem
= gen_rtx_MEM (mode
, mem
);
2974 insn
= emit_move_insn (mem
, reg
);
2975 RTX_FRAME_RELATED_P (insn
) = 1;
2978 /* Generate and return an instruction to store the pair of registers
2979 REG and REG2 of mode MODE to location BASE with write-back adjusting
2980 the stack location BASE by ADJUSTMENT. */
2983 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
2984 HOST_WIDE_INT adjustment
)
2989 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
2990 GEN_INT (-adjustment
),
2991 GEN_INT (UNITS_PER_WORD
- adjustment
));
2993 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
2994 GEN_INT (-adjustment
),
2995 GEN_INT (UNITS_PER_WORD
- adjustment
));
3001 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3002 stack pointer by ADJUSTMENT. */
3005 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
3008 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3010 if (regno2
== INVALID_REGNUM
)
3011 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
3013 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3014 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3016 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
3018 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
3019 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3020 RTX_FRAME_RELATED_P (insn
) = 1;
3023 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3024 adjusting it by ADJUSTMENT afterwards. */
3027 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3028 HOST_WIDE_INT adjustment
)
3033 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3034 GEN_INT (UNITS_PER_WORD
));
3036 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3037 GEN_INT (UNITS_PER_WORD
));
3043 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3044 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3048 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
3051 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3052 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3054 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
3056 if (regno2
== INVALID_REGNUM
)
3058 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
3059 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
3060 emit_move_insn (reg1
, gen_rtx_MEM (mode
, mem
));
3064 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3065 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3066 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
3071 /* Generate and return a store pair instruction of mode MODE to store
3072 register REG1 to MEM1 and register REG2 to MEM2. */
3075 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
3081 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
3084 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
3095 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
3101 return gen_load_pairdi (reg1
, mem1
, reg2
, mem2
);
3104 return gen_load_pairdf (reg1
, mem1
, reg2
, mem2
);
3111 /* Emit code to save the callee-saved registers from register number START
3112 to LIMIT to the stack at the location starting at offset START_OFFSET,
3113 skipping any write-back candidates if SKIP_WB is true. */
3116 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
3117 unsigned start
, unsigned limit
, bool skip_wb
)
3120 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3121 ? gen_frame_mem
: gen_rtx_MEM
);
3125 for (regno
= aarch64_next_callee_save (start
, limit
);
3127 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3130 HOST_WIDE_INT offset
;
3133 && (regno
== cfun
->machine
->frame
.wb_candidate1
3134 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3137 reg
= gen_rtx_REG (mode
, regno
);
3138 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3139 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3142 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3145 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3146 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3149 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3152 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3153 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3155 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts are only
	     frame-related if explicitly marked.  */
3162 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3166 insn
= emit_move_insn (mem
, reg
);
3168 RTX_FRAME_RELATED_P (insn
) = 1;
3172 /* Emit code to restore the callee registers of mode MODE from register
3173 number START up to and including LIMIT. Restore from the stack offset
3174 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3175 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3178 aarch64_restore_callee_saves (machine_mode mode
,
3179 HOST_WIDE_INT start_offset
, unsigned start
,
3180 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
3182 rtx base_rtx
= stack_pointer_rtx
;
3183 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3184 ? gen_frame_mem
: gen_rtx_MEM
);
3187 HOST_WIDE_INT offset
;
3189 for (regno
= aarch64_next_callee_save (start
, limit
);
3191 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3196 && (regno
== cfun
->machine
->frame
.wb_candidate1
3197 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3200 reg
= gen_rtx_REG (mode
, regno
);
3201 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3202 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3204 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3207 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3208 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3210 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3213 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3214 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3215 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3217 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3221 emit_move_insn (reg
, mem
);
3222 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
3262 /* Generate the prologue instructions for entry into a function.
3263 Establish the stack frame by decreasing the stack pointer with a
3264 properly calculated size and, if necessary, create a frame record
3265 filled with the values of LR and previous frame pointer. The
3266 current FP is also set up if it is in use. */
3269 aarch64_expand_prologue (void)
3271 aarch64_layout_frame ();
3273 HOST_WIDE_INT frame_size
= cfun
->machine
->frame
.frame_size
;
3274 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3275 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3276 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3277 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3278 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3279 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3282 if (flag_stack_usage_info
)
3283 current_function_static_stack_size
= frame_size
;
3285 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
3287 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
3289 if (frame_size
> PROBE_INTERVAL
&& frame_size
> STACK_CHECK_PROTECT
)
3290 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
,
3291 frame_size
- STACK_CHECK_PROTECT
);
3293 else if (frame_size
> 0)
3294 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
, frame_size
);
3297 aarch64_sub_sp (IP0_REGNUM
, initial_adjust
, true);
3299 if (callee_adjust
!= 0)
3300 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
3302 if (frame_pointer_needed
)
3304 if (callee_adjust
== 0)
3305 aarch64_save_callee_saves (DImode
, callee_offset
, R29_REGNUM
,
3307 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
3309 GEN_INT (callee_offset
)));
3310 RTX_FRAME_RELATED_P (insn
) = 1;
3311 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
3314 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3315 callee_adjust
!= 0 || frame_pointer_needed
);
3316 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3317 callee_adjust
!= 0 || frame_pointer_needed
);
3318 aarch64_sub_sp (IP1_REGNUM
, final_adjust
, !frame_pointer_needed
);
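/* Illustrative prologue for the "simple, small frame" layout chosen by
   aarch64_layout_frame (register choices and sizes are hypothetical):

	stp	x29, x30, [sp, -32]!	// callee_adjust == frame_size
	mov	x29, sp			// establish the frame record
	stp	x19, x20, [sp, 16]	// remaining callee-saves
*/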
3321 /* Return TRUE if we can use a simple_return insn.
   This function checks whether the callee-saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue pass will
   use this to check whether the shrink-wrapping optimization is feasible.  */
3328 aarch64_use_return_insn_p (void)
3330 if (!reload_completed
)
3336 aarch64_layout_frame ();
3338 return cfun
->machine
->frame
.frame_size
== 0;
3341 /* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
3343 that we need to insert barriers to avoid scheduling loads that read
3344 from a deallocated stack, and we optimize the unwind records by
3345 emitting them all together if possible. */
3347 aarch64_expand_epilogue (bool for_sibcall
)
3349 aarch64_layout_frame ();
3351 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3352 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3353 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3354 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3355 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3356 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
  /* We need to add a memory barrier to prevent reads from the deallocated
     stack.  */
3361 bool need_barrier_p
= (get_frame_size ()
3362 + cfun
->machine
->frame
.saved_varargs_size
) != 0;
3364 /* Emit a barrier to prevent loads from a deallocated stack. */
3365 if (final_adjust
> crtl
->outgoing_args_size
|| cfun
->calls_alloca
)
3367 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3368 need_barrier_p
= false;
3371 /* Restore the stack pointer from the frame pointer if it may not
3372 be the same as the stack pointer. */
3373 if (frame_pointer_needed
&& (final_adjust
|| cfun
->calls_alloca
))
3375 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
3376 hard_frame_pointer_rtx
,
3377 GEN_INT (-callee_offset
)));
3378 /* If writeback is used when restoring callee-saves, the CFA
3379 is restored on the instruction doing the writeback. */
3380 RTX_FRAME_RELATED_P (insn
) = callee_adjust
== 0;
3383 aarch64_add_sp (IP1_REGNUM
, final_adjust
, df_regs_ever_live_p (IP1_REGNUM
));
3385 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3386 callee_adjust
!= 0, &cfi_ops
);
3387 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3388 callee_adjust
!= 0, &cfi_ops
);
3391 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3393 if (callee_adjust
!= 0)
3394 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
3396 if (callee_adjust
!= 0 || initial_adjust
> 65536)
3398 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3399 insn
= get_last_insn ();
3400 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
3401 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
3402 RTX_FRAME_RELATED_P (insn
) = 1;
3406 aarch64_add_sp (IP0_REGNUM
, initial_adjust
, df_regs_ever_live_p (IP0_REGNUM
));
3410 /* Emit delayed restores and reset the CFA to be SP. */
3411 insn
= get_last_insn ();
3412 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
3413 REG_NOTES (insn
) = cfi_ops
;
3414 RTX_FRAME_RELATED_P (insn
) = 1;
3417 /* Stack adjustment for exception handler. */
3418 if (crtl
->calls_eh_return
)
3420 /* We need to unwind the stack by the offset computed by
3421 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3422 to be SP; letting the CFA move during this adjustment
3423 is just as correct as retaining the CFA from the body
3424 of the function. Therefore, do nothing special. */
3425 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
3428 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
3430 emit_jump_insn (ret_rtx
);
3433 /* Return the place to copy the exception unwinding return address to.
   This will probably be a stack slot, but could (in theory) be the
   return register.  */
3437 aarch64_final_eh_return_addr (void)
3439 HOST_WIDE_INT fp_offset
;
3441 aarch64_layout_frame ();
3443 fp_offset
= cfun
->machine
->frame
.frame_size
3444 - cfun
->machine
->frame
.hard_fp_offset
;
3446 if (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] < 0)
3447 return gen_rtx_REG (DImode
, LR_REGNUM
);
3449 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3450 result in a store to save LR introduced by builtin_eh_return () being
3451 incorrectly deleted because the alias is not detected.
3452 So in the calculation of the address to copy the exception unwinding
3453 return address to, we note 2 cases.
3454 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3455 we return a SP-relative location since all the addresses are SP-relative
3456 in this case. This prevents the store from being optimized away.
3457 If the fp_offset is not 0, then the addresses will be FP-relative and
3458 therefore we return a FP-relative location. */
3460 if (frame_pointer_needed
)
3463 return gen_frame_mem (DImode
,
3464 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
3466 return gen_frame_mem (DImode
,
3467 plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
));
3470 /* If FP is not needed, we calculate the location of LR, which would be
3471 at the top of the saved registers block. */
3473 return gen_frame_mem (DImode
,
3474 plus_constant (Pmode
,
3477 + cfun
->machine
->frame
.saved_regs_size
3478 - 2 * UNITS_PER_WORD
));
3481 /* Output code to add DELTA to the first argument, and then jump
3482 to FUNCTION. Used for C++ multiple inheritance. */
3484 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
3485 HOST_WIDE_INT delta
,
3486 HOST_WIDE_INT vcall_offset
,
3489 /* The this pointer is always in x0. Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
3491 to return a pointer to an aggregate. On AArch64 a result value
3492 pointer will be in x8. */
3493 int this_regno
= R0_REGNUM
;
3494 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
3497 reload_completed
= 1;
3498 emit_note (NOTE_INSN_PROLOGUE_END
);
3500 if (vcall_offset
== 0)
3501 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3504 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
3506 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
3507 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
3508 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
3513 if (delta
>= -256 && delta
< 256)
3514 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
3515 plus_constant (Pmode
, this_rtx
, delta
));
3517 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3520 if (Pmode
== ptr_mode
)
3521 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
3523 aarch64_emit_move (temp0
,
3524 gen_rtx_ZERO_EXTEND (Pmode
,
3525 gen_rtx_MEM (ptr_mode
, addr
)));
3527 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
3528 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
3531 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
3533 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
3536 if (Pmode
== ptr_mode
)
3537 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
3539 aarch64_emit_move (temp1
,
3540 gen_rtx_SIGN_EXTEND (Pmode
,
3541 gen_rtx_MEM (ptr_mode
, addr
)));
3543 emit_insn (gen_add2_insn (this_rtx
, temp1
));
3546 /* Generate a tail call to the target function. */
3547 if (!TREE_USED (function
))
3549 assemble_external (function
);
3550 TREE_USED (function
) = 1;
3552 funexp
= XEXP (DECL_RTL (function
), 0);
3553 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
3554 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
3555 SIBLING_CALL_P (insn
) = 1;
3557 insn
= get_insns ();
3558 shorten_branches (insn
);
3559 final_start_function (insn
, file
, 1);
3560 final (insn
, file
, 1);
3561 final_end_function ();
3563 /* Stop pretending to be a post-reload pass. */
3564 reload_completed
= 0;
3568 aarch64_tls_referenced_p (rtx x
)
3570 if (!TARGET_HAVE_TLS
)
3572 subrtx_iterator::array_type array
;
3573 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
3575 const_rtx x
= *iter
;
3576 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
3578 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3579 TLS offsets, not real symbol references. */
3580 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
3581 iter
.skip_subrtxes ();
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}
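/* Examples (added for illustration): 0xabc and 0xabc000 are accepted (the
   set bits fit entirely within bits [11:0] or [23:12]), whereas 0xabc00 is
   rejected because its set bits straddle the two windows.  */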
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return true;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
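/* Examples (added for illustration): for DImode, 0xbeef, 0xbeef0000,
   0xbeef00000000 and 0xbeef000000000000 are all MOVZ-encodable (one
   16-bit chunk at bit 0, 16, 32 or 48), whereas 0x0001000200030004 is
   not and needs a MOVZ/MOVK sequence or a literal load.  */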
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
3630 /* Return true if val is a valid bitmask immediate. */
3633 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
3635 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
3638 /* Check for a single sequence of one bits and return quickly if so.
3639 The special cases of all ones and all zeroes returns false. */
3640 val
= (unsigned HOST_WIDE_INT
) val_in
;
3641 tmp
= val
+ (val
& -val
);
3643 if (tmp
== (tmp
& -tmp
))
3644 return (val
+ 1) > 1;
3646 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3648 val
= (val
<< 32) | (val
& 0xffffffff);
3650 /* Invert if the immediate doesn't start with a zero bit - this means we
3651 only need to search for sequences of one bits. */
3655 /* Find the first set bit and set tmp to val with the first sequence of one
3656 bits removed. Return success if there is a single sequence of ones. */
3657 first_one
= val
& -val
;
3658 tmp
= val
& (val
+ first_one
);
3663 /* Find the next set bit and compute the difference in bit position. */
3664 next_one
= tmp
& -tmp
;
3665 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
3668 /* Check the bit position difference is a power of 2, and that the first
3669 sequence of one bits fits within 'bits' bits. */
3670 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
3673 /* Check the sequence of one bits is repeated 64/bits times. */
3674 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
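/* Examples (added for illustration): 0x5555555555555555 (one set bit
   repeated every 2 bits), 0x00ff00ff00ff00ff (8 ones repeated every 16
   bits) and 0x0000ffffffff0000 (a single run of 32 ones) are all valid
   bitmask immediates, while 0 and all-ones are rejected by the early
   checks above.  */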
3678 /* Return true if val is an immediate that can be loaded into a
3679 register in a single instruction. */
3681 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
3683 if (aarch64_movw_imm (val
, mode
) || aarch64_movw_imm (~val
, mode
))
3685 return aarch64_bitmask_imm (val
, mode
);
3689 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
3693 if (GET_CODE (x
) == HIGH
)
3696 split_const (x
, &base
, &offset
);
3697 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
3699 if (aarch64_classify_symbol (base
, offset
)
3700 != SYMBOL_FORCE_TO_MEM
)
3703 /* Avoid generating a 64-bit relocation in ILP32; leave
3704 to aarch64_expand_mov_immediate to handle it properly. */
3705 return mode
!= ptr_mode
;
3708 return aarch64_tls_referenced_p (x
);
3711 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3712 The expansion for a table switch is quite expensive due to the number
3713 of instructions, the table lookup and hard to predict indirect jump.
3714 When optimizing for speed, and -O3 enabled, use the per-core tuning if
3715 set, otherwise use tables for > 16 cases as a tradeoff between size and
3716 performance. When optimizing for size, use the default setting. */
3719 aarch64_case_values_threshold (void)
3721 /* Use the specified limit for the number of cases before using jump
3722 tables at higher optimization levels. */
3724 && selected_cpu
->tune
->max_case_values
!= 0)
3725 return selected_cpu
->tune
->max_case_values
;
3727 return optimize_size
? default_case_values_threshold () : 17;
3730 /* Return true if register REGNO is a valid index register.
3731 STRICT_P is true if REG_OK_STRICT is in effect. */
3734 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
3736 if (!HARD_REGISTER_NUM_P (regno
))
3744 regno
= reg_renumber
[regno
];
3746 return GP_REGNUM_P (regno
);
3749 /* Return true if register REGNO is a valid base register for mode MODE.
3750 STRICT_P is true if REG_OK_STRICT is in effect. */
3753 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
3755 if (!HARD_REGISTER_NUM_P (regno
))
3763 regno
= reg_renumber
[regno
];
3766 /* The fake registers will be eliminated to either the stack or
3767 hard frame pointer, both of which are usually valid base registers.
3768 Reload deals with the cases where the eliminated form isn't valid. */
3769 return (GP_REGNUM_P (regno
)
3770 || regno
== SP_REGNUM
3771 || regno
== FRAME_POINTER_REGNUM
3772 || regno
== ARG_POINTER_REGNUM
);
3775 /* Return true if X is a valid base register for mode MODE.
3776 STRICT_P is true if REG_OK_STRICT is in effect. */
3779 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
3781 if (!strict_p
&& GET_CODE (x
) == SUBREG
)
3784 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
3787 /* Return true if address offset is a valid index. If it is, fill in INFO
3788 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3791 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
3792 machine_mode mode
, bool strict_p
)
3794 enum aarch64_address_type type
;
3799 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
3800 && GET_MODE (x
) == Pmode
)
3802 type
= ADDRESS_REG_REG
;
3806 /* (sign_extend:DI (reg:SI)) */
3807 else if ((GET_CODE (x
) == SIGN_EXTEND
3808 || GET_CODE (x
) == ZERO_EXTEND
)
3809 && GET_MODE (x
) == DImode
3810 && GET_MODE (XEXP (x
, 0)) == SImode
)
3812 type
= (GET_CODE (x
) == SIGN_EXTEND
)
3813 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3814 index
= XEXP (x
, 0);
3817 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3818 else if (GET_CODE (x
) == MULT
3819 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3820 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3821 && GET_MODE (XEXP (x
, 0)) == DImode
3822 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3823 && CONST_INT_P (XEXP (x
, 1)))
3825 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3826 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3827 index
= XEXP (XEXP (x
, 0), 0);
3828 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3830 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3831 else if (GET_CODE (x
) == ASHIFT
3832 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3833 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3834 && GET_MODE (XEXP (x
, 0)) == DImode
3835 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3836 && CONST_INT_P (XEXP (x
, 1)))
3838 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3839 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3840 index
= XEXP (XEXP (x
, 0), 0);
3841 shift
= INTVAL (XEXP (x
, 1));
3843 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3844 else if ((GET_CODE (x
) == SIGN_EXTRACT
3845 || GET_CODE (x
) == ZERO_EXTRACT
)
3846 && GET_MODE (x
) == DImode
3847 && GET_CODE (XEXP (x
, 0)) == MULT
3848 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3849 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3851 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3852 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3853 index
= XEXP (XEXP (x
, 0), 0);
3854 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3855 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3856 || INTVAL (XEXP (x
, 2)) != 0)
3859 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3860 (const_int 0xffffffff<<shift)) */
3861 else if (GET_CODE (x
) == AND
3862 && GET_MODE (x
) == DImode
3863 && GET_CODE (XEXP (x
, 0)) == MULT
3864 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3865 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3866 && CONST_INT_P (XEXP (x
, 1)))
3868 type
= ADDRESS_REG_UXTW
;
3869 index
= XEXP (XEXP (x
, 0), 0);
3870 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3871 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3874 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3875 else if ((GET_CODE (x
) == SIGN_EXTRACT
3876 || GET_CODE (x
) == ZERO_EXTRACT
)
3877 && GET_MODE (x
) == DImode
3878 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3879 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3880 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3882 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3883 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3884 index
= XEXP (XEXP (x
, 0), 0);
3885 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3886 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3887 || INTVAL (XEXP (x
, 2)) != 0)
3890 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3891 (const_int 0xffffffff<<shift)) */
3892 else if (GET_CODE (x
) == AND
3893 && GET_MODE (x
) == DImode
3894 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3895 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3896 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3897 && CONST_INT_P (XEXP (x
, 1)))
3899 type
= ADDRESS_REG_UXTW
;
3900 index
= XEXP (XEXP (x
, 0), 0);
3901 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3902 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3905 /* (mult:P (reg:P) (const_int scale)) */
3906 else if (GET_CODE (x
) == MULT
3907 && GET_MODE (x
) == Pmode
3908 && GET_MODE (XEXP (x
, 0)) == Pmode
3909 && CONST_INT_P (XEXP (x
, 1)))
3911 type
= ADDRESS_REG_REG
;
3912 index
= XEXP (x
, 0);
3913 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3915 /* (ashift:P (reg:P) (const_int shift)) */
3916 else if (GET_CODE (x
) == ASHIFT
3917 && GET_MODE (x
) == Pmode
3918 && GET_MODE (XEXP (x
, 0)) == Pmode
3919 && CONST_INT_P (XEXP (x
, 1)))
3921 type
= ADDRESS_REG_REG
;
3922 index
= XEXP (x
, 0);
3923 shift
= INTVAL (XEXP (x
, 1));
3928 if (GET_CODE (index
) == SUBREG
)
3929 index
= SUBREG_REG (index
);
3932 (shift
> 0 && shift
<= 3
3933 && (1 << shift
) == GET_MODE_SIZE (mode
)))
3935 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
3938 info
->offset
= index
;
3939 info
->shift
= shift
;
static inline bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= -64 * GET_MODE_SIZE (mode)
	  && offset < 64 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}

static inline bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
			       HOST_WIDE_INT offset)
{
  return offset >= -256 && offset < 256;
}

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= 0
	  && offset < 4096 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}
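/* Worked ranges (added for illustration, for DImode where the mode size is
   8 bytes): the 7-bit signed scaled form accepts multiples of 8 in
   [-512, 504], the 9-bit signed unscaled form accepts [-256, 255], and the
   12-bit unsigned scaled form accepts multiples of 8 in [0, 32760].  */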
3969 /* Return true if MODE is one of the modes for which we
3970 support LDP/STP operations. */
3973 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
3975 return mode
== SImode
|| mode
== DImode
3976 || mode
== SFmode
|| mode
== DFmode
3977 || (aarch64_vector_mode_supported_p (mode
)
3978 && GET_MODE_SIZE (mode
) == 8);
3981 /* Return true if REGNO is a virtual pointer register, or an eliminable
3982 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3983 include stack_pointer or hard_frame_pointer. */
3985 virt_or_elim_regno_p (unsigned regno
)
3987 return ((regno
>= FIRST_VIRTUAL_REGISTER
3988 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
3989 || regno
== FRAME_POINTER_REGNUM
3990 || regno
== ARG_POINTER_REGNUM
);
3993 /* Return true if X is a valid address for machine mode MODE. If it is,
3994 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3995 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3998 aarch64_classify_address (struct aarch64_address_info
*info
,
3999 rtx x
, machine_mode mode
,
4000 RTX_CODE outer_code
, bool strict_p
)
4002 enum rtx_code code
= GET_CODE (x
);
4005 /* On BE, we use load/store pair for all large int mode load/stores. */
4006 bool load_store_pair_p
= (outer_code
== PARALLEL
4007 || (BYTES_BIG_ENDIAN
4008 && aarch64_vect_struct_mode_p (mode
)));
4010 bool allow_reg_index_p
=
4012 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
4013 && !aarch64_vect_struct_mode_p (mode
);
  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
4017 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
4018 && (code
!= POST_INC
&& code
!= REG
))
4025 info
->type
= ADDRESS_REG_IMM
;
4027 info
->offset
= const0_rtx
;
4028 return aarch64_base_register_rtx_p (x
, strict_p
);
4036 && virt_or_elim_regno_p (REGNO (op0
))
4037 && CONST_INT_P (op1
))
4039 info
->type
= ADDRESS_REG_IMM
;
4046 if (GET_MODE_SIZE (mode
) != 0
4047 && CONST_INT_P (op1
)
4048 && aarch64_base_register_rtx_p (op0
, strict_p
))
4050 HOST_WIDE_INT offset
= INTVAL (op1
);
4052 info
->type
= ADDRESS_REG_IMM
;
4056 /* TImode and TFmode values are allowed in both pairs of X
4057 registers and individual Q registers. The available
4059 X,X: 7-bit signed scaled offset
4060 Q: 9-bit signed offset
4061 We conservatively require an offset representable in either mode.
4062 When performing the check for pairs of X registers i.e. LDP/STP
4063 pass down DImode since that is the natural size of the LDP/STP
4064 instruction memory accesses. */
4065 if (mode
== TImode
|| mode
== TFmode
)
4066 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
4067 && offset_9bit_signed_unscaled_p (mode
, offset
));
4069 /* A 7bit offset check because OImode will emit a ldp/stp
4070 instruction (only big endian will get here).
4071 For ldp/stp instructions, the offset is scaled for the size of a
4072 single element of the pair. */
4074 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
4076 /* Three 9/12 bit offsets checks because CImode will emit three
4077 ldr/str instructions (only big endian will get here). */
4079 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4080 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
4081 || offset_12bit_unsigned_scaled_p (V16QImode
,
4084 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4085 instructions (only big endian will get here). */
4087 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4088 && aarch64_offset_7bit_signed_scaled_p (TImode
,
4091 if (load_store_pair_p
)
4092 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4093 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4095 return (offset_9bit_signed_unscaled_p (mode
, offset
)
4096 || offset_12bit_unsigned_scaled_p (mode
, offset
));
4099 if (allow_reg_index_p
)
4101 /* Look for base + (scaled/extended) index register. */
4102 if (aarch64_base_register_rtx_p (op0
, strict_p
)
4103 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
4108 if (aarch64_base_register_rtx_p (op1
, strict_p
)
4109 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
4122 info
->type
= ADDRESS_REG_WB
;
4123 info
->base
= XEXP (x
, 0);
4124 info
->offset
= NULL_RTX
;
4125 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
4129 info
->type
= ADDRESS_REG_WB
;
4130 info
->base
= XEXP (x
, 0);
4131 if (GET_CODE (XEXP (x
, 1)) == PLUS
4132 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
4133 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
4134 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4136 HOST_WIDE_INT offset
;
4137 info
->offset
= XEXP (XEXP (x
, 1), 1);
4138 offset
= INTVAL (info
->offset
);
4140 /* TImode and TFmode values are allowed in both pairs of X
4141 registers and individual Q registers. The available
4143 X,X: 7-bit signed scaled offset
4144 Q: 9-bit signed offset
4145 We conservatively require an offset representable in either mode.
4147 if (mode
== TImode
|| mode
== TFmode
)
4148 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
4149 && offset_9bit_signed_unscaled_p (mode
, offset
));
4151 if (load_store_pair_p
)
4152 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4153 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4155 return offset_9bit_signed_unscaled_p (mode
, offset
);
4162 /* load literal: pc-relative constant pool entry. Only supported
4163 for SI mode or larger. */
4164 info
->type
= ADDRESS_SYMBOLIC
;
4166 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
4170 split_const (x
, &sym
, &addend
);
4171 return ((GET_CODE (sym
) == LABEL_REF
4172 || (GET_CODE (sym
) == SYMBOL_REF
4173 && CONSTANT_POOL_ADDRESS_P (sym
)
4174 && aarch64_pcrelative_literal_loads
)));
4179 info
->type
= ADDRESS_LO_SUM
;
4180 info
->base
= XEXP (x
, 0);
4181 info
->offset
= XEXP (x
, 1);
4182 if (allow_reg_index_p
4183 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4186 split_const (info
->offset
, &sym
, &offs
);
4187 if (GET_CODE (sym
) == SYMBOL_REF
4188 && (aarch64_classify_symbol (sym
, offs
) == SYMBOL_SMALL_ABSOLUTE
))
4190 /* The symbol and offset must be aligned to the access size. */
4192 unsigned int ref_size
;
4194 if (CONSTANT_POOL_ADDRESS_P (sym
))
4195 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
4196 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
4198 tree exp
= SYMBOL_REF_DECL (sym
);
4199 align
= TYPE_ALIGN (TREE_TYPE (exp
));
4200 align
= CONSTANT_ALIGNMENT (exp
, align
);
4202 else if (SYMBOL_REF_DECL (sym
))
4203 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
4204 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
4205 && SYMBOL_REF_BLOCK (sym
) != NULL
)
4206 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
4208 align
= BITS_PER_UNIT
;
4210 ref_size
= GET_MODE_SIZE (mode
);
4212 ref_size
= GET_MODE_SIZE (DImode
);
4214 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
4215 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
4226 aarch64_symbolic_address_p (rtx x
)
4230 split_const (x
, &x
, &offset
);
4231 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
4234 /* Classify the base of symbolic expression X. */
4236 enum aarch64_symbol_type
4237 aarch64_classify_symbolic_expression (rtx x
)
4241 split_const (x
, &x
, &offset
);
4242 return aarch64_classify_symbol (x
, offset
);
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
}

/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
   pair operation.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x,
			      RTX_CODE outer_code, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
}
/* Split an out-of-range address displacement into a base and offset.
   Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
   to increase opportunities for sharing the base address of different sizes.
   For TI/TFmode and unaligned accesses use a 256-byte range.  */
static bool
aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
{
  HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;

  if (mode == TImode || mode == TFmode
      || (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
    mask = 0xff;

  *off = GEN_INT (INTVAL (*disp) & ~mask);
  *disp = GEN_INT (INTVAL (*disp) & mask);
  return true;
}
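/* For example, with the 16KB mask an SImode displacement of 0x12345 splits
   into an anchor of 0x10000 (returned in *OFF) plus an in-range residual
   of 0x2345 (returned in *DISP).  */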
/* Return TRUE if rtx X is immediate constant 0.0 */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}

/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
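/* IP0 (x16) and IP1 (x17) are the intra-procedure-call scratch registers
   that linker-generated veneers and PLT stubs may overwrite, which is why
   every call is recorded above as clobbering them.  */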
4321 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
4323 /* All floating point compares return CCFP if it is an equality
4324 comparison, and CCFPE otherwise. */
4325 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
4352 /* Equality comparisons of short modes against zero can be performed
4353 using the TST instruction with the appropriate bitmask. */
4354 if (y
== const0_rtx
&& REG_P (x
)
4355 && (code
== EQ
|| code
== NE
)
4356 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
4359 /* Similarly, comparisons of zero_extends from shorter modes can
4360 be performed using an ANDS with an immediate mask. */
4361 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
4362 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4363 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
4364 && (code
== EQ
|| code
== NE
))
4367 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4369 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
4370 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
4371 || GET_CODE (x
) == NEG
4372 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
4373 && CONST_INT_P (XEXP (x
, 2)))))
4376 /* A compare with a shifted operand. Because of canonicalization,
4377 the comparison will have to be swapped when we emit the assembly
4379 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4380 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
4381 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
4382 || GET_CODE (x
) == LSHIFTRT
4383 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
4386 /* Similarly for a negated operand, but we can only do this for
4388 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4389 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
4390 && (code
== EQ
|| code
== NE
)
4391 && GET_CODE (x
) == NEG
)
4394 /* A test for unsigned overflow. */
4395 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
4397 && GET_CODE (x
) == PLUS
4398 && GET_CODE (y
) == ZERO_EXTEND
)
4401 /* For everything else, return CCmode. */
4406 aarch64_get_condition_code_1 (enum machine_mode
, enum rtx_code
);
4409 aarch64_get_condition_code (rtx x
)
4411 machine_mode mode
= GET_MODE (XEXP (x
, 0));
4412 enum rtx_code comp_code
= GET_CODE (x
);
4414 if (GET_MODE_CLASS (mode
) != MODE_CC
)
4415 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
4416 return aarch64_get_condition_code_1 (mode
, comp_code
);
4420 aarch64_get_condition_code_1 (enum machine_mode mode
, enum rtx_code comp_code
)
4428 case GE
: return AARCH64_GE
;
4429 case GT
: return AARCH64_GT
;
4430 case LE
: return AARCH64_LS
;
4431 case LT
: return AARCH64_MI
;
4432 case NE
: return AARCH64_NE
;
4433 case EQ
: return AARCH64_EQ
;
4434 case ORDERED
: return AARCH64_VC
;
4435 case UNORDERED
: return AARCH64_VS
;
4436 case UNLT
: return AARCH64_LT
;
4437 case UNLE
: return AARCH64_LE
;
4438 case UNGT
: return AARCH64_HI
;
4439 case UNGE
: return AARCH64_PL
;
4447 case NE
: return AARCH64_NE
;
4448 case EQ
: return AARCH64_EQ
;
4449 case GE
: return AARCH64_GE
;
4450 case GT
: return AARCH64_GT
;
4451 case LE
: return AARCH64_LE
;
4452 case LT
: return AARCH64_LT
;
4453 case GEU
: return AARCH64_CS
;
4454 case GTU
: return AARCH64_HI
;
4455 case LEU
: return AARCH64_LS
;
4456 case LTU
: return AARCH64_CC
;
4464 case NE
: return AARCH64_NE
;
4465 case EQ
: return AARCH64_EQ
;
4466 case GE
: return AARCH64_LE
;
4467 case GT
: return AARCH64_LT
;
4468 case LE
: return AARCH64_GE
;
4469 case LT
: return AARCH64_GT
;
4470 case GEU
: return AARCH64_LS
;
4471 case GTU
: return AARCH64_CC
;
4472 case LEU
: return AARCH64_CS
;
4473 case LTU
: return AARCH64_HI
;
4481 case NE
: return AARCH64_NE
;
4482 case EQ
: return AARCH64_EQ
;
4483 case GE
: return AARCH64_PL
;
4484 case LT
: return AARCH64_MI
;
4492 case NE
: return AARCH64_NE
;
4493 case EQ
: return AARCH64_EQ
;
4501 case NE
: return AARCH64_CS
;
4502 case EQ
: return AARCH64_CC
;
4515 aarch64_const_vec_all_same_in_range_p (rtx x
,
4516 HOST_WIDE_INT minval
,
4517 HOST_WIDE_INT maxval
)
4519 HOST_WIDE_INT firstval
;
4522 if (GET_CODE (x
) != CONST_VECTOR
4523 || GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_INT
)
4526 firstval
= INTVAL (CONST_VECTOR_ELT (x
, 0));
4527 if (firstval
< minval
|| firstval
> maxval
)
4530 count
= CONST_VECTOR_NUNITS (x
);
4531 for (i
= 1; i
< count
; i
++)
4532 if (INTVAL (CONST_VECTOR_ELT (x
, i
)) != firstval
)
4539 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
4541 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N,	/* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V,	/* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z,	/* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
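/* A CCMP instruction such as "ccmp x0, x1, #4, ne" performs the compare only
   when its condition (here NE) holds; otherwise it writes the literal NZCV
   immediate (#4, i.e. the Z flag) straight into the flags.  The table above
   supplies such immediates, one per AArch64 condition code, for emitting
   conditional-compare sequences.  */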
4573 aarch64_print_operand (FILE *f
, rtx x
, int code
)
4577 /* An integer or symbol address without a preceding # sign. */
4579 switch (GET_CODE (x
))
4582 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
4586 output_addr_const (f
, x
);
4590 if (GET_CODE (XEXP (x
, 0)) == PLUS
4591 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
4593 output_addr_const (f
, x
);
4599 output_operand_lossage ("Unsupported operand for code '%c'", code
);
4604 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4608 if (!CONST_INT_P (x
)
4609 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
4611 output_operand_lossage ("invalid operand for '%%%c'", code
);
4627 output_operand_lossage ("invalid operand for '%%%c'", code
);
4637 /* Print N such that 2^N == X. */
4638 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
4640 output_operand_lossage ("invalid operand for '%%%c'", code
);
4644 asm_fprintf (f
, "%d", n
);
4649 /* Print the number of non-zero bits in X (a const_int). */
4650 if (!CONST_INT_P (x
))
4652 output_operand_lossage ("invalid operand for '%%%c'", code
);
4656 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
4660 /* Print the higher numbered register of a pair (TImode) of regs. */
4661 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
4663 output_operand_lossage ("invalid operand for '%%%c'", code
);
4667 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
4674 /* Print a condition (eq, ne, etc) or its inverse. */
4676 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4677 if (x
== const_true_rtx
)
4684 if (!COMPARISON_P (x
))
4686 output_operand_lossage ("invalid operand for '%%%c'", code
);
4690 cond_code
= aarch64_get_condition_code (x
);
4691 gcc_assert (cond_code
>= 0);
4693 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
4694 fputs (aarch64_condition_codes
[cond_code
], f
);
4703 /* Print a scalar FP/SIMD register name. */
4704 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4706 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4709 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
4716 /* Print the first FP/SIMD register name in a list. */
4717 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4719 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4722 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
4726 /* Print a scalar FP/SIMD register name + 1. */
4727 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4729 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4732 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
4736 /* Print bottom 16 bits of integer constant in hex. */
4737 if (!CONST_INT_P (x
))
4739 output_operand_lossage ("invalid operand for '%%%c'", code
);
4742 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
4747 /* Print a general register name or the zero register (32-bit or
4750 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
4752 asm_fprintf (f
, "%czr", code
);
4756 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
4758 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
4762 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
4764 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
4771 /* Print a normal operand, if it's a general register, then we
4775 output_operand_lossage ("missing operand");
4779 switch (GET_CODE (x
))
4782 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
4786 output_address (GET_MODE (x
), XEXP (x
, 0));
4792 output_addr_const (asm_out_file
, x
);
4796 asm_fprintf (f
, "%wd", INTVAL (x
));
4800 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
4803 aarch64_const_vec_all_same_in_range_p (x
,
4805 HOST_WIDE_INT_MAX
));
4806 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
4808 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
4817 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4818 be getting CONST_DOUBLEs holding integers. */
4819 gcc_assert (GET_MODE (x
) != VOIDmode
);
4820 if (aarch64_float_const_zero_rtx_p (x
))
4825 else if (aarch64_float_const_representable_p (x
))
4828 char float_buf
[buf_size
] = {'\0'};
4829 real_to_decimal_for_mode (float_buf
,
4830 CONST_DOUBLE_REAL_VALUE (x
),
4833 asm_fprintf (asm_out_file
, "%s", float_buf
);
4837 output_operand_lossage ("invalid constant");
4840 output_operand_lossage ("invalid operand");
4846 if (GET_CODE (x
) == HIGH
)
4849 switch (aarch64_classify_symbolic_expression (x
))
4851 case SYMBOL_SMALL_GOT_4G
:
4852 asm_fprintf (asm_out_file
, ":got:");
4855 case SYMBOL_SMALL_TLSGD
:
4856 asm_fprintf (asm_out_file
, ":tlsgd:");
4859 case SYMBOL_SMALL_TLSDESC
:
4860 asm_fprintf (asm_out_file
, ":tlsdesc:");
4863 case SYMBOL_SMALL_TLSIE
:
4864 asm_fprintf (asm_out_file
, ":gottprel:");
4867 case SYMBOL_TLSLE24
:
4868 asm_fprintf (asm_out_file
, ":tprel:");
4871 case SYMBOL_TINY_GOT
:
4878 output_addr_const (asm_out_file
, x
);
4882 switch (aarch64_classify_symbolic_expression (x
))
4884 case SYMBOL_SMALL_GOT_4G
:
4885 asm_fprintf (asm_out_file
, ":lo12:");
4888 case SYMBOL_SMALL_TLSGD
:
4889 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
4892 case SYMBOL_SMALL_TLSDESC
:
4893 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
4896 case SYMBOL_SMALL_TLSIE
:
4897 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
4900 case SYMBOL_TLSLE12
:
4901 asm_fprintf (asm_out_file
, ":tprel_lo12:");
4904 case SYMBOL_TLSLE24
:
4905 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
4908 case SYMBOL_TINY_GOT
:
4909 asm_fprintf (asm_out_file
, ":got:");
4912 case SYMBOL_TINY_TLSIE
:
4913 asm_fprintf (asm_out_file
, ":gottprel:");
4919 output_addr_const (asm_out_file
, x
);
4924 switch (aarch64_classify_symbolic_expression (x
))
4926 case SYMBOL_TLSLE24
:
4927 asm_fprintf (asm_out_file
, ":tprel_hi12:");
4932 output_addr_const (asm_out_file
, x
);
4937 HOST_WIDE_INT cond_code
;
4940 if (!CONST_INT_P (x
))
4942 output_operand_lossage ("invalid operand for '%%%c'", code
);
4946 cond_code
= INTVAL (x
);
4947 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
4948 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
4953 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
4959 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
4961 struct aarch64_address_info addr
;
4963 if (aarch64_classify_address (&addr
, x
, mode
, MEM
, true))
4966 case ADDRESS_REG_IMM
:
4967 if (addr
.offset
== const0_rtx
)
4968 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
4970 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
4971 INTVAL (addr
.offset
));
4974 case ADDRESS_REG_REG
:
4975 if (addr
.shift
== 0)
4976 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
4977 reg_names
[REGNO (addr
.offset
)]);
4979 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
4980 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
4983 case ADDRESS_REG_UXTW
:
4984 if (addr
.shift
== 0)
4985 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
4986 REGNO (addr
.offset
) - R0_REGNUM
);
4988 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
4989 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
4992 case ADDRESS_REG_SXTW
:
4993 if (addr
.shift
== 0)
4994 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
4995 REGNO (addr
.offset
) - R0_REGNUM
);
4997 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
4998 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5001 case ADDRESS_REG_WB
:
5002 switch (GET_CODE (x
))
5005 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
5006 GET_MODE_SIZE (mode
));
5009 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
5010 GET_MODE_SIZE (mode
));
5013 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
5014 GET_MODE_SIZE (mode
));
5017 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
5018 GET_MODE_SIZE (mode
));
5021 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
5022 INTVAL (addr
.offset
));
5025 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
5026 INTVAL (addr
.offset
));
5033 case ADDRESS_LO_SUM
:
5034 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
5035 output_addr_const (f
, addr
.offset
);
5036 asm_fprintf (f
, "]");
5039 case ADDRESS_SYMBOLIC
:
5043 output_addr_const (f
, x
);
5047 aarch64_label_mentioned_p (rtx x
)
5052 if (GET_CODE (x
) == LABEL_REF
)
5055 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5056 referencing instruction, but they are constant offsets, not
5058 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5061 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
5062 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
5068 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
5069 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
5072 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;

  return NO_REGS;
}
5101 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
5103 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5104 where mask is selected by alignment and size of the offset.
5105 We try to pick as large a range for the offset as possible to
5106 maximize the chance of a CSE. However, for aligned addresses
5107 we limit the range to 4k so that structures with different sized
5108 elements are likely to use the same base. We need to be careful
5109 not to split a CONST for some forms of address expression, otherwise
5110 it will generate sub-optimal code. */
5112 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
5114 rtx base
= XEXP (x
, 0);
5115 rtx offset_rtx
= XEXP (x
, 1);
5116 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
5118 if (GET_CODE (base
) == PLUS
)
5120 rtx op0
= XEXP (base
, 0);
5121 rtx op1
= XEXP (base
, 1);
5123 /* Force any scaling into a temp for CSE. */
5124 op0
= force_reg (Pmode
, op0
);
5125 op1
= force_reg (Pmode
, op1
);
5127 /* Let the pointer register be in op0. */
5128 if (REG_POINTER (op1
))
5129 std::swap (op0
, op1
);
5131 /* If the pointer is virtual or frame related, then we know that
5132 virtual register instantiation or register elimination is going
5133 to apply a second constant. We want the two constants folded
5134 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5135 if (virt_or_elim_regno_p (REGNO (op0
)))
5137 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
5138 NULL_RTX
, true, OPTAB_DIRECT
);
5139 return gen_rtx_PLUS (Pmode
, base
, op1
);
5142 /* Otherwise, in order to encourage CSE (and thence loop strength
5143 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5144 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
5145 NULL_RTX
, true, OPTAB_DIRECT
);
5146 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
5149 /* Does it look like we'll need a load/store-pair operation? */
5150 HOST_WIDE_INT base_offset
;
5151 if (GET_MODE_SIZE (mode
) > 16
5153 base_offset
= ((offset
+ 64 * GET_MODE_SIZE (mode
))
5154 & ~((128 * GET_MODE_SIZE (mode
)) - 1));
      /* For offsets that aren't a multiple of the access size, the limit is
	 -256...255.  */
      else if (offset & (GET_MODE_SIZE (mode) - 1))
5159 base_offset
= (offset
+ 0x100) & ~0x1ff;
5161 /* BLKmode typically uses LDP of X-registers. */
5162 if (mode
== BLKmode
)
5163 base_offset
= (offset
+ 512) & ~0x3ff;
5165 /* Small negative offsets are supported. */
5166 else if (IN_RANGE (offset
, -256, 0))
5168 /* Use 12-bit offset by access size. */
5170 base_offset
= offset
& (~0xfff * GET_MODE_SIZE (mode
));
5172 if (base_offset
!= 0)
5174 base
= plus_constant (Pmode
, base
, base_offset
);
5175 base
= force_operand (base
, NULL_RTX
);
5176 return plus_constant (Pmode
, base
, offset
- base_offset
);
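      /* For instance, a DImode reference at an unaligned offset of 0x1ff1 is
	 anchored at base + 0x2000 ((0x1ff1 + 0x100) & ~0x1ff) and then
	 accessed with the residual displacement of -0xf, which fits the
	 9-bit signed unscaled range.  */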
5183 /* Return the reload icode required for a constant pool in mode. */
5184 static enum insn_code
5185 aarch64_constant_pool_reload_icode (machine_mode mode
)
5190 return CODE_FOR_aarch64_reload_movcpsfdi
;
5193 return CODE_FOR_aarch64_reload_movcpdfdi
;
5196 return CODE_FOR_aarch64_reload_movcptfdi
;
5199 return CODE_FOR_aarch64_reload_movcpv8qidi
;
5202 return CODE_FOR_aarch64_reload_movcpv16qidi
;
5205 return CODE_FOR_aarch64_reload_movcpv4hidi
;
5208 return CODE_FOR_aarch64_reload_movcpv8hidi
;
5211 return CODE_FOR_aarch64_reload_movcpv2sidi
;
5214 return CODE_FOR_aarch64_reload_movcpv4sidi
;
5217 return CODE_FOR_aarch64_reload_movcpv2didi
;
5220 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
5229 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
5232 secondary_reload_info
*sri
)
5235 /* If we have to disable direct literal pool loads and stores because the
5236 function is too big, then we need a scratch register. */
5237 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
5238 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
5239 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
5240 && !aarch64_pcrelative_literal_loads
)
5242 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
5246 /* Without the TARGET_SIMD instructions we cannot move a Q register
5247 to a Q register directly. We need a scratch. */
5248 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
5249 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
5250 && reg_class_subset_p (rclass
, FP_REGS
))
5253 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
5254 else if (mode
== TImode
)
5255 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
5259 /* A TFmode or TImode memory access should be handled via an FP_REGS
5260 because AArch64 has richer addressing modes for LDR/STR instructions
5261 than LDP/STP instructions. */
5262 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
5263 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
5266 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
5267 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from, const int to)
{
  /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
     HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */

  if (frame_pointer_needed)
    {
      if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
	return true;
      if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
	return false;
      if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
	  && !cfun->calls_alloca)
	return true;
      if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
	return true;

      return false;
    }
  else
    {
      /* If we decided that we didn't need a leaf frame pointer but then used
	 LR in the function, then we'll want a frame pointer after all, so
	 prevent this elimination to ensure a frame pointer is used.  */
      if (to == STACK_POINTER_REGNUM
	  && flag_omit_leaf_frame_pointer
	  && df_regs_ever_live_p (LR_REGNUM))
	return false;
    }

  return true;
}
HOST_WIDE_INT
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
5331 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5335 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
5339 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
5344 aarch64_asm_trampoline_template (FILE *f
)
5348 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
5349 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
5353 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
5354 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
5356 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
5357 assemble_aligned_integer (4, const0_rtx
);
5358 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
5359 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
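  /* The template is therefore 16 bytes of code followed by two pointer-sized
     data slots; aarch64_trampoline_init below copies the code and then
     stores the target function address and the static chain value into
     those slots.  */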
5363 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
5365 rtx fnaddr
, mem
, a_tramp
;
5366 const int tramp_code_sz
= 16;
5368 /* Don't need to copy the trailing D-words, we fill those in below. */
5369 emit_block_move (m_tramp
, assemble_trampoline_template (),
5370 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
5371 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
5372 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
5373 if (GET_MODE (fnaddr
) != ptr_mode
)
5374 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
5375 emit_move_insn (mem
, fnaddr
);
5377 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
5378 emit_move_insn (mem
, chain_value
);
5380 /* XXX We should really define a "clear_cache" pattern and use
5381 gen_clear_cache(). */
5382 a_tramp
= XEXP (m_tramp
, 0);
5383 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
5384 LCT_NORMAL
, VOIDmode
, 2, a_tramp
, ptr_mode
,
5385 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
5389 static unsigned char
5390 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
5394 case CALLER_SAVE_REGS
:
5401 aarch64_vector_mode_p (mode
)
5402 ? (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
5403 : (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
5417 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
5419 if (regclass
== POINTER_REGS
)
5420 return GENERAL_REGS
;
5422 if (regclass
== STACK_REG
)
5425 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
5431 /* If it's an integer immediate that MOVI can't handle, then
5432 FP_REGS is not an option, so we return NO_REGS instead. */
5433 if (CONST_INT_P (x
) && reg_class_subset_p (regclass
, FP_REGS
)
5434 && !aarch64_simd_imm_scalar_p (x
, GET_MODE (x
)))
  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     of hand immediately.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5443 rtx lhs
= XEXP (x
, 0);
5445 /* Look through a possible SUBREG introduced by ILP32. */
5446 if (GET_CODE (lhs
) == SUBREG
)
5447 lhs
= SUBREG_REG (lhs
);
5449 gcc_assert (REG_P (lhs
));
5450 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
5459 aarch64_asm_output_labelref (FILE* f
, const char *name
)
5461 asm_fprintf (f
, "%U%s", name
);
5465 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
5467 if (priority
== DEFAULT_INIT_PRIORITY
)
5468 default_ctor_section_asm_out_constructor (symbol
, priority
);
5473 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
5474 s
= get_section (buf
, SECTION_WRITE
, NULL
);
5475 switch_to_section (s
);
5476 assemble_align (POINTER_SIZE
);
5477 assemble_aligned_integer (POINTER_BYTES
, symbol
);
5482 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
5484 if (priority
== DEFAULT_INIT_PRIORITY
)
5485 default_dtor_section_asm_out_destructor (symbol
, priority
);
5490 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
5491 s
= get_section (buf
, SECTION_WRITE
, NULL
);
5492 switch_to_section (s
);
5493 assemble_align (POINTER_SIZE
);
5494 assemble_aligned_integer (POINTER_BYTES
, symbol
);
5499 aarch64_output_casesi (rtx
*operands
)
5503 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
5505 static const char *const patterns
[4][2] =
5508 "ldrb\t%w3, [%0,%w1,uxtw]",
5509 "add\t%3, %4, %w3, sxtb #2"
5512 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5513 "add\t%3, %4, %w3, sxth #2"
5516 "ldr\t%w3, [%0,%w1,uxtw #2]",
5517 "add\t%3, %4, %w3, sxtw #2"
5519 /* We assume that DImode is only generated when not optimizing and
5520 that we don't really need 64-bit address offsets. That would
5521 imply an object file with 8GB of code in a single function! */
5523 "ldr\t%w3, [%0,%w1,uxtw #2]",
5524 "add\t%3, %4, %w3, sxtw #2"
5528 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
5530 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
5532 gcc_assert (index
>= 0 && index
<= 3);
  /* Need to implement table size reduction, by changing the code below.  */
5535 output_asm_insn (patterns
[index
][0], operands
);
5536 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
5537 snprintf (buf
, sizeof (buf
),
5538 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
5539 output_asm_insn (buf
, operands
);
5540 output_asm_insn (patterns
[index
][1], operands
);
5541 output_asm_insn ("br\t%3", operands
);
5542 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */
int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
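/* For instance, aarch64_uxt_size (1, 0x1fe) returns 8, since 0x1fe is 0xff
   shifted left by one, i.e. a byte-sized operand scaled by 2 as used in an
   "add x0, x1, w2, uxtb #1" style extended-register operand.  */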
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}
static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* Fixme:: In an ideal world this would work similarly
     to the logic in aarch64_select_rtx_section, but this
     breaks bootstrap in gccgo.  For now we work around
     this by returning false here.  */
  return false;
}
/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
			    rtx x,
			    unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}
/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
5642 /* Helper function for rtx cost calculation. Strip an extend
5643 expression from X. Returns the inner operand if successful, or the
5644 original expression on failure. We deal with a number of possible
5645 canonicalization variations here. */
5647 aarch64_strip_extend (rtx x
)
5651 /* Zero and sign extraction of a widened value. */
5652 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
5653 && XEXP (op
, 2) == const0_rtx
5654 && GET_CODE (XEXP (op
, 0)) == MULT
5655 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
5657 return XEXP (XEXP (op
, 0), 0);
5659 /* It can also be represented (for zero-extend) as an AND with an
5661 if (GET_CODE (op
) == AND
5662 && GET_CODE (XEXP (op
, 0)) == MULT
5663 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
5664 && CONST_INT_P (XEXP (op
, 1))
5665 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
5666 INTVAL (XEXP (op
, 1))) != 0)
5667 return XEXP (XEXP (op
, 0), 0);
5669 /* Now handle extended register, as this may also have an optional
5670 left shift by 1..4. */
5671 if (GET_CODE (op
) == ASHIFT
5672 && CONST_INT_P (XEXP (op
, 1))
5673 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
5676 if (GET_CODE (op
) == ZERO_EXTEND
5677 || GET_CODE (op
) == SIGN_EXTEND
)
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */
static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
5695 /* Helper function for rtx cost calculation. Calculate the cost of
5696 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5697 Return the calculated cost of the expression, recursing manually in to
5698 operands where needed. */
5701 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
5704 const struct cpu_cost_table
*extra_cost
5705 = aarch64_tune_params
.insn_extra_cost
;
5707 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
5708 machine_mode mode
= GET_MODE (x
);
5710 gcc_checking_assert (code
== MULT
);
5715 if (VECTOR_MODE_P (mode
))
5716 mode
= GET_MODE_INNER (mode
);
5718 /* Integer multiply/fma. */
5719 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5721 /* The multiply will be canonicalized as a shift, cost it as such. */
5722 if (aarch64_shift_p (GET_CODE (x
))
5723 || (CONST_INT_P (op1
)
5724 && exact_log2 (INTVAL (op1
)) > 0))
5726 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
5727 || GET_CODE (op0
) == SIGN_EXTEND
;
5733 /* ARITH + shift-by-register. */
5734 cost
+= extra_cost
->alu
.arith_shift_reg
;
5736 /* ARITH + extended register. We don't have a cost field
5737 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5738 cost
+= extra_cost
->alu
.extend_arith
;
5740 /* ARITH + shift-by-immediate. */
5741 cost
+= extra_cost
->alu
.arith_shift
;
5744 /* LSL (immediate). */
5745 cost
+= extra_cost
->alu
.shift
;
5748 /* Strip extends as we will have costed them in the case above. */
5750 op0
= aarch64_strip_extend (op0
);
5752 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
5757 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5758 compound and let the below cases handle it. After all, MNEG is a
5759 special-case alias of MSUB. */
5760 if (GET_CODE (op0
) == NEG
)
5762 op0
= XEXP (op0
, 0);
5766 /* Integer multiplies or FMAs have zero/sign extending variants. */
5767 if ((GET_CODE (op0
) == ZERO_EXTEND
5768 && GET_CODE (op1
) == ZERO_EXTEND
)
5769 || (GET_CODE (op0
) == SIGN_EXTEND
5770 && GET_CODE (op1
) == SIGN_EXTEND
))
5772 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
5773 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
5778 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5779 cost
+= extra_cost
->mult
[0].extend_add
;
5781 /* MUL/SMULL/UMULL. */
5782 cost
+= extra_cost
->mult
[0].extend
;
5788 /* This is either an integer multiply or a MADD. In both cases
5789 we want to recurse and cost the operands. */
5790 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
5791 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
5797 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
5800 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
5809 /* Floating-point FMA/FMUL can also support negations of the
5810 operands, unless the rounding mode is upward or downward in
5811 which case FNMUL is different than FMUL with operand negation. */
5812 bool neg0
= GET_CODE (op0
) == NEG
;
5813 bool neg1
= GET_CODE (op1
) == NEG
;
5814 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
5817 op0
= XEXP (op0
, 0);
5819 op1
= XEXP (op1
, 0);
5823 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5824 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
5827 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
5830 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
5831 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
5837 aarch64_address_cost (rtx x
,
5839 addr_space_t as ATTRIBUTE_UNUSED
,
5842 enum rtx_code c
= GET_CODE (x
);
5843 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
5844 struct aarch64_address_info info
;
5848 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
5850 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
5852 /* This is a CONST or SYMBOL ref which will be split
5853 in a different way depending on the code model in use.
5854 Cost it through the generic infrastructure. */
5855 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
5856 /* Divide through by the cost of one instruction to
5857 bring it to the same units as the address costs. */
5858 cost_symbol_ref
/= COSTS_N_INSNS (1);
5859 /* The cost is then the cost of preparing the address,
5860 followed by an immediate (possibly 0) offset. */
5861 return cost_symbol_ref
+ addr_cost
->imm_offset
;
5865 /* This is most likely a jump table from a case
5867 return addr_cost
->register_offset
;
5873 case ADDRESS_LO_SUM
:
5874 case ADDRESS_SYMBOLIC
:
5875 case ADDRESS_REG_IMM
:
5876 cost
+= addr_cost
->imm_offset
;
5879 case ADDRESS_REG_WB
:
5880 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
5881 cost
+= addr_cost
->pre_modify
;
5882 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
5883 cost
+= addr_cost
->post_modify
;
5889 case ADDRESS_REG_REG
:
5890 cost
+= addr_cost
->register_offset
;
5893 case ADDRESS_REG_SXTW
:
5894 cost
+= addr_cost
->register_sextend
;
5897 case ADDRESS_REG_UXTW
:
5898 cost
+= addr_cost
->register_zextend
;
5908 /* For the sake of calculating the cost of the shifted register
5909 component, we can treat same sized modes in the same way. */
5910 switch (GET_MODE_BITSIZE (mode
))
5913 cost
+= addr_cost
->addr_scale_costs
.hi
;
5917 cost
+= addr_cost
->addr_scale_costs
.si
;
5921 cost
+= addr_cost
->addr_scale_costs
.di
;
5924 /* We can't tell, or this is a 128-bit vector. */
5926 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is
   predicted to be taken.  */
static int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
5951 /* Return true if the RTX X in mode MODE is a zero or sign extract
5952 usable in an ADD or SUB (extended register) instruction. */
5954 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
5956 /* Catch add with a sign extract.
5957 This is add_<optab><mode>_multp2. */
5958 if (GET_CODE (x
) == SIGN_EXTRACT
5959 || GET_CODE (x
) == ZERO_EXTRACT
)
5961 rtx op0
= XEXP (x
, 0);
5962 rtx op1
= XEXP (x
, 1);
5963 rtx op2
= XEXP (x
, 2);
5965 if (GET_CODE (op0
) == MULT
5966 && CONST_INT_P (op1
)
5967 && op2
== const0_rtx
5968 && CONST_INT_P (XEXP (op0
, 1))
5969 && aarch64_is_extend_from_extract (mode
,
5976 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5978 else if (GET_CODE (x
) == SIGN_EXTEND
5979 || GET_CODE (x
) == ZERO_EXTEND
)
5980 return REG_P (XEXP (x
, 0));
5986 aarch64_frint_unspec_p (unsigned int u
)
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  machine_mode mode = GET_MODE (x);

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
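/* For example, in DImode (ior (ashift x 48) (lshiftrt y 16)) satisfies the
   check (48 + 16 == 64) and corresponds to "extr x0, x, y, #16", which
   extracts 64 bits from the concatenation x:y starting at bit 16.  */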
6049 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6050 storing it in *COST. Result is true if the total cost of the operation
6051 has now been calculated. */
6053 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
6057 enum rtx_code cmpcode
;
6059 if (COMPARISON_P (op0
))
6061 inner
= XEXP (op0
, 0);
6062 comparator
= XEXP (op0
, 1);
6063 cmpcode
= GET_CODE (op0
);
6068 comparator
= const0_rtx
;
6072 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
6074 /* Conditional branch. */
6075 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6079 if (cmpcode
== NE
|| cmpcode
== EQ
)
6081 if (comparator
== const0_rtx
)
6083 /* TBZ/TBNZ/CBZ/CBNZ. */
6084 if (GET_CODE (inner
) == ZERO_EXTRACT
)
6086 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
6087 ZERO_EXTRACT
, 0, speed
);
6090 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
6095 else if (cmpcode
== LT
|| cmpcode
== GE
)
6098 if (comparator
== const0_rtx
)
6103 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6106 if (GET_CODE (op1
) == COMPARE
)
6108 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6109 if (XEXP (op1
, 1) == const0_rtx
)
6113 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
6114 const struct cpu_cost_table
*extra_cost
6115 = aarch64_tune_params
.insn_extra_cost
;
6117 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6118 *cost
+= extra_cost
->alu
.arith
;
6120 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
6125 /* It's a conditional operation based on the status flags,
6126 so it must be some flavor of CSEL. */
6128 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6129 if (GET_CODE (op1
) == NEG
6130 || GET_CODE (op1
) == NOT
6131 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
6132 op1
= XEXP (op1
, 0);
6133 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
6135 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6136 op1
= XEXP (op1
, 0);
6137 op2
= XEXP (op2
, 0);
6140 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
6141 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
6145 /* We don't know what this is, cost all operands. */
6149 /* Check whether X is a bitfield operation of the form shift + extend that
6150 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6151 operand to which the bitfield operation is applied. Otherwise return
6155 aarch64_extend_bitfield_pattern_p (rtx x
)
6157 rtx_code outer_code
= GET_CODE (x
);
6158 machine_mode outer_mode
= GET_MODE (x
);
6160 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
6161 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
6164 rtx inner
= XEXP (x
, 0);
6165 rtx_code inner_code
= GET_CODE (inner
);
6166 machine_mode inner_mode
= GET_MODE (inner
);
6172 if (CONST_INT_P (XEXP (inner
, 1))
6173 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6174 op
= XEXP (inner
, 0);
6177 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6178 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6179 op
= XEXP (inner
, 0);
6182 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6183 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6184 op
= XEXP (inner
, 0);
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

static bool
aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
}
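/* For example, in SImode a mask of 0x7f8 with a shift amount of 3 is
   accepted (0x7f8 >> 3 == 0xff, and the low three bits are clear), so
   (x << 3) & 0x7f8 can be emitted as "ubfiz w0, w1, #3, #8".  */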
6206 /* Calculate the cost of calculating X, storing it in *COST. Result
6207 is true if the total cost of the operation has now been calculated. */
6209 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
6210 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
6213 const struct cpu_cost_table
*extra_cost
6214 = aarch64_tune_params
.insn_extra_cost
;
6215 int code
= GET_CODE (x
);
6217 /* By default, assume that everything has equivalent cost to the
6218 cheapest instruction. Any additional costs are applied as a delta
6219 above this default. */
6220 *cost
= COSTS_N_INSNS (1);
6225 /* The cost depends entirely on the operands to SET. */
6230 switch (GET_CODE (op0
))
6235 rtx address
= XEXP (op0
, 0);
6236 if (VECTOR_MODE_P (mode
))
6237 *cost
+= extra_cost
->ldst
.storev
;
6238 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6239 *cost
+= extra_cost
->ldst
.store
;
6240 else if (mode
== SFmode
)
6241 *cost
+= extra_cost
->ldst
.storef
;
6242 else if (mode
== DFmode
)
6243 *cost
+= extra_cost
->ldst
.stored
;
6246 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6250 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6254 if (! REG_P (SUBREG_REG (op0
)))
6255 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
6259 /* The cost is one per vector-register copied. */
6260 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
6262 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6263 / GET_MODE_SIZE (V4SImode
);
6264 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6266 /* const0_rtx is in general free, but we will use an
6267 instruction to set a register to 0. */
6268 else if (REG_P (op1
) || op1
== const0_rtx
)
6270 /* The cost is 1 per register copied. */
6271 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6273 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6276 /* Cost is just the cost of the RHS of the set. */
6277 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6282 /* Bit-field insertion. Strip any redundant widening of
6283 the RHS to meet the width of the target. */
6284 if (GET_CODE (op1
) == SUBREG
)
6285 op1
= SUBREG_REG (op1
);
6286 if ((GET_CODE (op1
) == ZERO_EXTEND
6287 || GET_CODE (op1
) == SIGN_EXTEND
)
6288 && CONST_INT_P (XEXP (op0
, 1))
6289 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1
, 0)))
6290 >= INTVAL (XEXP (op0
, 1))))
6291 op1
= XEXP (op1
, 0);
6293 if (CONST_INT_P (op1
))
6295 /* MOV immediate is assumed to always be cheap. */
6296 *cost
= COSTS_N_INSNS (1);
6302 *cost
+= extra_cost
->alu
.bfi
;
6303 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
6309 /* We can't make sense of this, assume default cost. */
6310 *cost
= COSTS_N_INSNS (1);
6316 /* If an instruction can incorporate a constant within the
6317 instruction, the instruction's expression avoids calling
6318 rtx_cost() on the constant. If rtx_cost() is called on a
6319 constant, then it is usually because the constant must be
6320 moved into a register by one or more instructions.
6322 The exception is constant 0, which can be expressed
6323 as XZR/WZR and is therefore free. The exception to this is
6324 if we have (set (reg) (const0_rtx)) in which case we must cost
6325 the move. However, we can catch that when we cost the SET, so
6326 we don't need to consider that here. */
6327 if (x
== const0_rtx
)
6331 /* To an approximation, building any other constant is
6332 proportionally expensive to the number of instructions
6333 required to build that constant. This is true whether we
6334 are compiling for SPEED or otherwise. */
6335 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
6336 (NULL_RTX
, x
, false, mode
));
6343 /* mov[df,sf]_aarch64. */
6344 if (aarch64_float_const_representable_p (x
))
6345 /* FMOV (scalar immediate). */
6346 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
6347 else if (!aarch64_float_const_zero_rtx_p (x
))
6349 /* This will be a load from memory. */
6351 *cost
+= extra_cost
->ldst
.loadd
;
6353 *cost
+= extra_cost
->ldst
.loadf
;
6356 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6357 or MOV v0.s[0], wzr - neither of which are modeled by the
6358 cost tables. Just use the default cost. */
6368 /* For loads we want the base cost of a load, plus an
6369 approximation for the additional cost of the addressing
6371 rtx address
= XEXP (x
, 0);
6372 if (VECTOR_MODE_P (mode
))
6373 *cost
+= extra_cost
->ldst
.loadv
;
6374 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6375 *cost
+= extra_cost
->ldst
.load
;
6376 else if (mode
== SFmode
)
6377 *cost
+= extra_cost
->ldst
.loadf
;
6378 else if (mode
== DFmode
)
6379 *cost
+= extra_cost
->ldst
.loadd
;
6382 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6391 if (VECTOR_MODE_P (mode
))
6396 *cost
+= extra_cost
->vect
.alu
;
6401 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6403 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
6404 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
6407 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
6411 /* Cost this as SUB wzr, X. */
6412 op0
= CONST0_RTX (mode
);
6417 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
6419 /* Support (neg(fma...)) as a single instruction only if
6420 sign of zeros is unimportant. This matches the decision
6421 making in aarch64.md. */
6422 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
6425 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
6428 if (GET_CODE (op0
) == MULT
)
6431 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
6436 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
6446 if (VECTOR_MODE_P (mode
))
6447 *cost
+= extra_cost
->vect
.alu
;
6449 *cost
+= extra_cost
->alu
.clz
;
6458 if (op1
== const0_rtx
6459 && GET_CODE (op0
) == AND
)
6462 mode
= GET_MODE (op0
);
6466 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
6468 /* TODO: A write to the CC flags possibly costs extra, this
6469 needs encoding in the cost tables. */
6471 mode
= GET_MODE (op0
);
6473 if (GET_CODE (op0
) == AND
)
6479 if (GET_CODE (op0
) == PLUS
)
6481 /* ADDS (and CMN alias). */
6486 if (GET_CODE (op0
) == MINUS
)
6493 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
6494 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
6495 && CONST_INT_P (XEXP (op0
, 2)))
6497 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6498 Handle it here directly rather than going to cost_logic
6499 since we know the immediate generated for the TST is valid
6500 so we can avoid creating an intermediate rtx for it only
6501 for costing purposes. */
6503 *cost
+= extra_cost
->alu
.logical
;
6505 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
6506 ZERO_EXTRACT
, 0, speed
);
6510 if (GET_CODE (op1
) == NEG
)
6514 *cost
+= extra_cost
->alu
.arith
;
6516 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
6517 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
6523 Compare can freely swap the order of operands, and
6524 canonicalization puts the more complex operation first.
6525 But the integer MINUS logic expects the shift/extend
6526 operation in op1. */
6528 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
6536 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
6540 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
6542 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
6544 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
6545 /* FCMP supports constant 0.0 for no extra cost. */
6551 if (VECTOR_MODE_P (mode
))
6553 /* Vector compare. */
6555 *cost
+= extra_cost
->vect
.alu
;
6557 if (aarch64_float_const_zero_rtx_p (op1
))
6559 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6573 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
6575 /* Detect valid immediates. */
6576 if ((GET_MODE_CLASS (mode
) == MODE_INT
6577 || (GET_MODE_CLASS (mode
) == MODE_CC
6578 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
6579 && CONST_INT_P (op1
)
6580 && aarch64_uimm12_shift (INTVAL (op1
)))
6583 /* SUB(S) (immediate). */
6584 *cost
+= extra_cost
->alu
.arith
;
6588 /* Look for SUB (extended register). */
6589 if (aarch64_rtx_arith_op_extract_p (op1
, mode
))
6592 *cost
+= extra_cost
->alu
.extend_arith
;
6594 op1
= aarch64_strip_extend (op1
);
6595 *cost
+= rtx_cost (op1
, VOIDmode
,
6596 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
6600 rtx new_op1
= aarch64_strip_extend (op1
);
6602 /* Cost this as an FMA-alike operation. */
6603 if ((GET_CODE (new_op1
) == MULT
6604 || aarch64_shift_p (GET_CODE (new_op1
)))
6607 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
6608 (enum rtx_code
) code
,
6613 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
6617 if (VECTOR_MODE_P (mode
))
6620 *cost
+= extra_cost
->vect
.alu
;
6622 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6625 *cost
+= extra_cost
->alu
.arith
;
6627 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
6630 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
6644 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
6645 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
6648 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
6649 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
6653 if (GET_MODE_CLASS (mode
) == MODE_INT
6654 && CONST_INT_P (op1
)
6655 && aarch64_uimm12_shift (INTVAL (op1
)))
6657 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
	  /* ADD (immediate).  */
	  *cost += extra_cost->alu.arith;

	*cost += rtx_cost (op1, mode, PLUS, 1, speed);

	/* Look for ADD (extended register).  */
	if (aarch64_rtx_arith_op_extract_p (op0, mode))
	  *cost += extra_cost->alu.extend_arith;

	op0 = aarch64_strip_extend (op0);
	*cost += rtx_cost (op0, VOIDmode,
			   (enum rtx_code) GET_CODE (op0), 0, speed);

	/* Strip any extend, leave shifts behind as we will
	   cost them through mult_cost.  */
	new_op0 = aarch64_strip_extend (op0);

	if (GET_CODE (new_op0) == MULT
	    || aarch64_shift_p (GET_CODE (new_op0)))
	  *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
					  speed);

	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);

	if (VECTOR_MODE_P (mode))
	  *cost += extra_cost->vect.alu;
	else if (GET_MODE_CLASS (mode) == MODE_INT)
	  *cost += extra_cost->alu.arith;
	else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	  *cost += extra_cost->fp[mode == DFmode].addsub;

      *cost = COSTS_N_INSNS (1);

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      *cost += extra_cost->alu.rev;

      if (aarch_rev16_p (x))
	*cost = COSTS_N_INSNS (1);

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      *cost += extra_cost->alu.rev;

      if (aarch64_extr_rtx_p (x, &op0, &op1))
	*cost += rtx_cost (op0, mode, IOR, 0, speed);
	*cost += rtx_cost (op1, mode, IOR, 1, speed);
	*cost += extra_cost->alu.shift;

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;

	  && GET_CODE (op0) == MULT
	  && CONST_INT_P (XEXP (op0, 1))
	  && CONST_INT_P (op1)
	  && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
	  /* This is a UBFM/SBFM.  */
	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
	  *cost += extra_cost->alu.bfx;

      if (GET_MODE_CLASS (mode) == MODE_INT)
	if (CONST_INT_P (op1))
	  /* We have a mask + shift version of a UBFIZ
	     i.e. the *andim_ashift<mode>_bfiz pattern.  */
	  if (GET_CODE (op0) == ASHIFT
	      && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
	      *cost += rtx_cost (XEXP (op0, 0), mode,
				 (enum rtx_code) code, 0, speed);
	      *cost += extra_cost->alu.bfx;
	  else if (aarch64_bitmask_imm (INTVAL (op1), mode))
	      /* We possibly get the immediate for free, this is not
		 guaranteed.  */
	      *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
	      *cost += extra_cost->alu.logical;

	  /* Handle ORN, EON, or BIC.  */
	  if (GET_CODE (op0) == NOT)
	    op0 = XEXP (op0, 0);

	  new_op0 = aarch64_strip_shift (op0);

	  /* If we had a shift on op0 then this is a logical-shift-
	     by-register/immediate operation.  Otherwise, this is just
	     a logical operation.  */
	      /* Shift by immediate.  */
	      if (CONST_INT_P (XEXP (op0, 1)))
		*cost += extra_cost->alu.log_shift;
		*cost += extra_cost->alu.log_shift_reg;
	      *cost += extra_cost->alu.logical;

	  /* In both cases we want to cost both operands.  */
	  *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
	  *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);

      op0 = aarch64_strip_shift (x);

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;

	  /* MVN-shifted-reg.  */
	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
	  *cost += extra_cost->alu.log_shift;

      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
	 Handle the second form here taking care that 'a' in the above can
	 be a shift.  */
      else if (GET_CODE (op0) == XOR)
	  rtx newop0 = XEXP (op0, 0);
	  rtx newop1 = XEXP (op0, 1);
	  rtx op0_stripped = aarch64_strip_shift (newop0);

	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);

	  if (op0_stripped != newop0)
	    *cost += extra_cost->alu.log_shift;
	    *cost += extra_cost->alu.logical;

      *cost += extra_cost->alu.logical;

      /* If a value is written in SI mode, then zero extended to DI
	 mode, the operation will in general be free as a write to
	 a 'w' register implicitly zeroes the upper bits of an 'x'
	 register.  However, if this is

	   (set (reg) (zero_extend (reg)))

	 we must cost the explicit register move.  */
	  && GET_MODE (op0) == SImode
	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);

	  /* If OP_COST is non-zero, then the cost of the zero extend
	     is effectively the cost of the inner operation.  Otherwise
	     we have a MOV instruction and we take the cost from the MOV
	     itself.  This is true independently of whether we are
	     optimizing for space or time.  */
      else if (MEM_P (op0))
	  /* All loads can zero extend to any size for free.  */
	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);

      op0 = aarch64_extend_bitfield_pattern_p (x);
	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
	  *cost += extra_cost->alu.bfx;

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
	/* We generate an AND instead of UXTB/UXTH.  */
	*cost += extra_cost->alu.logical;

      if (MEM_P (XEXP (x, 0)))
	  rtx address = XEXP (XEXP (x, 0), 0);
	  *cost += extra_cost->ldst.load_sign_extend;
	    COSTS_N_INSNS (aarch64_address_cost (address, mode,

      op0 = aarch64_extend_bitfield_pattern_p (x);
	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
	  *cost += extra_cost->alu.bfx;

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      *cost += extra_cost->alu.extend;

      if (CONST_INT_P (op1))
	  if (VECTOR_MODE_P (mode))
	      /* Vector shift (immediate).  */
	      *cost += extra_cost->vect.alu;
	      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all ...
	      *cost += extra_cost->alu.shift;

	  /* We can incorporate zero/sign extend for free.  */
	  if (GET_CODE (op0) == ZERO_EXTEND
	      || GET_CODE (op0) == SIGN_EXTEND)
	    op0 = XEXP (op0, 0);

	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);

	  if (VECTOR_MODE_P (mode))
	      /* Vector shift (register).  */
	      *cost += extra_cost->vect.alu;
	      *cost += extra_cost->alu.shift_reg;
      return false;  /* All arguments need to be in registers.  */

      if (CONST_INT_P (op1))
	  /* ASR (immediate) and friends.  */
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	    *cost += extra_cost->alu.shift;

	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);

	  /* ASR (register) and friends.  */
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	    *cost += extra_cost->alu.shift_reg;
      return false;  /* All arguments need to be in registers.  */

      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
	*cost += extra_cost->ldst.load;
      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
	  /* ADRP, followed by ADD.  */
	  *cost += COSTS_N_INSNS (1);
	  *cost += 2 * extra_cost->alu.arith;
      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
	*cost += extra_cost->alu.arith;

	  /* One extra load instruction, after accessing the GOT.  */
	  *cost += COSTS_N_INSNS (1);
	  *cost += extra_cost->ldst.load;

      /* ADRP/ADD (immediate).  */
      *cost += extra_cost->alu.arith;

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
	*cost += extra_cost->alu.bfx;

      /* We can trust that the immediates used will be correct (there
	 are no by-register forms), so we need only cost op0.  */
      *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);

      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
      /* aarch64_rtx_mult_cost always handles recursion to its ...

      /* We can expand signed mod by power of 2 using a NEGS, two parallel
	 ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
	 an unconditional negate.  This case should only ever be reached through
	 the set_smod_pow2_cheap check in expmed.c.  */
      if (CONST_INT_P (XEXP (x, 1))
	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
	  && (mode == SImode || mode == DImode))
	  /* We expand to 4 instructions.  Reset the baseline.  */
	  *cost = COSTS_N_INSNS (4);
	  *cost += 2 * extra_cost->alu.logical
		   + 2 * extra_cost->alu.arith;
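	  /* Added illustrative note (not original GCC text): for SImode
	     x % 4 the expansion described above is roughly
		 negs  w1, w0
		 and   w0, w0, 3
		 and   w1, w1, 3
		 csneg w0, w0, w1, mi
	     i.e. both signs are masked in parallel and the negated result
	     is selected when the input was negative.  */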
      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_INT)
	*cost += (extra_cost->mult[mode == DImode].add
		  + extra_cost->mult[mode == DImode].idiv);
      else if (mode == DFmode)
	*cost += (extra_cost->fp[1].mult
		  + extra_cost->fp[1].div);
      else if (mode == SFmode)
	*cost += (extra_cost->fp[0].mult
		  + extra_cost->fp[0].div);
      return false;  /* All arguments need to be in registers.  */

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_INT)
	/* There is no integer SQRT, so only DIV and UDIV can get
	   here.  */
	*cost += extra_cost->mult[mode == DImode].idiv;
	*cost += extra_cost->fp[mode == DFmode].div;
      return false;  /* All arguments need to be in registers.  */

      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
					 XEXP (x, 2), cost, speed);

      return false;  /* All arguments must be in registers.  */

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
	*cost += extra_cost->fp[mode == DFmode].fma;

      /* FMSUB, FNMADD, and FNMSUB are free.  */
      if (GET_CODE (op0) == NEG)
	op0 = XEXP (op0, 0);

      if (GET_CODE (op2) == NEG)
	op2 = XEXP (op2, 0);

      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
	 and the by-element operand as operand 0.  */
      if (GET_CODE (op1) == NEG)
	op1 = XEXP (op1, 0);

      /* Catch vector-by-element operations.  The by-element operand can
	 either be (vec_duplicate (vec_select (x))) or just
	 (vec_select (x)), depending on whether we are multiplying by
	 a vector or a scalar.

	 Canonicalization is not very good in these cases, FMA4 will put the
	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
      if (GET_CODE (op0) == VEC_DUPLICATE)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_DUPLICATE)
	op1 = XEXP (op1, 0);

      if (GET_CODE (op0) == VEC_SELECT)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_SELECT)
	op1 = XEXP (op1, 0);

      /* If the remaining parameters are not registers,
	 get the cost to put them into registers.  */
      *cost += rtx_cost (op0, mode, FMA, 0, speed);
      *cost += rtx_cost (op1, mode, FMA, 1, speed);
      *cost += rtx_cost (op2, mode, FMA, 2, speed);
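      /* Added illustrative note (not original GCC text): a vector-by-element
	 multiply-accumulate typically looks roughly like
	   (fma (vec_duplicate (vec_select (reg) (parallel [...]))) (reg) (reg))
	 so the stripping above reduces the by-element operand to the
	 underlying register before it is costed.  */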
    case UNSIGNED_FLOAT:
      *cost += extra_cost->fp[mode == DFmode].fromint;

      if (VECTOR_MODE_P (mode))
	  /* Vector truncate.  */
	  *cost += extra_cost->vect.alu;
	*cost += extra_cost->fp[mode == DFmode].widen;

    case FLOAT_TRUNCATE:
      if (VECTOR_MODE_P (mode))
	  /* Vector conversion.  */
	  *cost += extra_cost->vect.alu;
	*cost += extra_cost->fp[mode == DFmode].narrow;

      /* Strip the rounding part.  They will all be implemented
	 by the fcvt* family of instructions anyway.  */
      if (GET_CODE (x) == UNSPEC)
	  unsigned int uns_code = XINT (x, 1);

	  if (uns_code == UNSPEC_FRINTA
	      || uns_code == UNSPEC_FRINTM
	      || uns_code == UNSPEC_FRINTN
	      || uns_code == UNSPEC_FRINTP
	      || uns_code == UNSPEC_FRINTZ)
	    x = XVECEXP (x, 0, 0);

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
	*cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;

      /* We can combine fmul by a power of 2 followed by a fcvt into a single
	 fixed-point fcvt.  */
      if (GET_CODE (x) == MULT
	  && ((VECTOR_MODE_P (mode)
	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,

      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	  /* FABD, which is analogous to FADD.  */
	  if (GET_CODE (op0) == MINUS)
	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
	      *cost += extra_cost->fp[mode == DFmode].addsub;
	  /* Simple FABS is analogous to FNEG.  */
	  *cost += extra_cost->fp[mode == DFmode].neg;
	  /* Integer ABS will either be split to
	     two arithmetic instructions, or will be an ABS
	     (scalar), which we don't model.  */
	  *cost = COSTS_N_INSNS (2);
	  *cost += 2 * extra_cost->alu.arith;

      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
	/* FMAXNM/FMINNM/FMAX/FMIN.
	   TODO: This may not be accurate for all implementations, but
	   we do not model this in the cost tables.  */
	*cost += extra_cost->fp[mode == DFmode].addsub;

      /* The floating point round to integer frint* instructions.  */
      if (aarch64_frint_unspec_p (XINT (x, 1)))
	*cost += extra_cost->fp[mode == DFmode].roundint;

      if (XINT (x, 1) == UNSPEC_RBIT)
	*cost += extra_cost->alu.rev;

      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI  */
	  && GET_MODE (XEXP (x, 0)) == TImode
	  && GET_CODE (XEXP (x, 0)) == LSHIFTRT
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
	  /* (ANY_EXTEND:TI (reg:DI))
	     (ANY_EXTEND:TI (reg:DI)))  */
	  && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
	       && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
	      || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
		  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
	  /* (const_int 64)  */
	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	  && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
	  *cost += extra_cost->mult[mode == DImode].extend;
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
			     mode, MULT, 0, speed);
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
			     mode, MULT, 1, speed);
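      /* Added illustrative note (not original GCC text): taken together, the
	 checks above match the canonical <su>muldi3_highpart shape
	   (truncate:DI
	     (lshiftrt:TI
	       (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
	       (const_int 64)))
	 which corresponds to a single SMULH/UMULH instruction.  */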
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "\nFailed to cost RTX.  Assuming default cost.\n");

/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file && (dump_flags & TDF_DETAILS))
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");

aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;
  if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (GET_MODE_SIZE (mode) == 16)
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;

aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
			  reg_class_t rclass ATTRIBUTE_UNUSED,
			  bool in ATTRIBUTE_UNUSED)
  return aarch64_tune_params.memmov_cost;

/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */
use_rsqrt_p (machine_mode mode)
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));

/* Function to decide when to use the approximate reciprocal square root
   builtin.  */
aarch64_builtin_reciprocal (tree fndecl)
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));

typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate insn depending on machine
   mode.  */
get_rsqrte_type (machine_mode mode)
    case DFmode:   return gen_aarch64_rsqrtedf;
    case SFmode:   return gen_aarch64_rsqrtesf;
    case V2DFmode: return gen_aarch64_rsqrtev2df;
    case V2SFmode: return gen_aarch64_rsqrtev2sf;
    case V4SFmode: return gen_aarch64_rsqrtev4sf;
    default: gcc_unreachable ();

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root series step insn depending on machine mode.  */
get_rsqrts_type (machine_mode mode)
    case DFmode:   return gen_aarch64_rsqrtsdf;
    case SFmode:   return gen_aarch64_rsqrtssf;
    case V2DFmode: return gen_aarch64_rsqrtsv2df;
    case V2SFmode: return gen_aarch64_rsqrtsv2sf;
    case V4SFmode: return gen_aarch64_rsqrtsv4sf;
    default: gcc_unreachable ();

/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)

  machine_mode mmsk = mode_for_vector
		      (int_mode_for_mode (GET_MODE_INNER (mode)),
		       GET_MODE_NUNITS (mode));
  bool use_approx_sqrt_p = (!recp
			    && (flag_mlow_precision_sqrt
				|| (aarch64_tune_params.approx_modes->sqrt
				    & AARCH64_APPROX_MODE (mode))));
  bool use_approx_rsqrt_p = (recp
			     && (flag_mrecip_low_precision_sqrt
				 || (aarch64_tune_params.approx_modes->recip_sqrt
				     & AARCH64_APPROX_MODE (mode))));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || !(use_approx_sqrt_p || use_approx_rsqrt_p)
      || optimize_function_for_size_p (cfun))

  rtx xmsk = gen_reg_rtx (mmsk);

  /* When calculating the approximate square root, compare the argument with
     0.0 and create a mask.  */
  emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
							       CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));

      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));

  /* Qualify the approximate reciprocal square root when the argument is
     0.0 by squashing the intermediary result to 0.0.  */
  rtx xtmp = gen_reg_rtx (mmsk);
  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
				    gen_rtx_SUBREG (mmsk, xdst, 0)));
  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

  /* Calculate the approximate square root.  */
  emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
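/* Added illustrative sketch (not part of the original file): a simplified
   scalar model of what the emitted Newton-Raphson series above computes.
   It assumes the architectural semantics of FRSQRTE (initial estimate of
   1/sqrt(a)) and FRSQRTS (a, b) = (3.0 - a * b) / 2.0; the call to
   __builtin_sqrt is only a stand-in for the FRSQRTE estimate, and the
   final rounding of the real emitted sequence differs slightly.  */
static double ATTRIBUTE_UNUSED
aarch64_rsqrt_sketch (double a, bool recp)
{
  double e = 1.0 / __builtin_sqrt (a);	  /* Stand-in for FRSQRTE.  */
  for (int i = 0; i < 3; i++)		  /* 3 steps for DFmode, 2 for SFmode.  */
    e = e * ((3.0 - (e * e) * a) / 2.0);  /* FRSQRTS refinement step.  */
  /* For the square root itself (!recp) the estimate is multiplied by the
     argument, since a * 1/sqrt(a) == sqrt(a).  */
  return recp ? e : e * a;
}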
typedef rtx (*recpe_type) (rtx, rtx);

/* Select reciprocal initial estimate insn depending on machine mode.  */
get_recpe_type (machine_mode mode)
    case SFmode:   return (gen_aarch64_frecpesf);
    case V2SFmode: return (gen_aarch64_frecpev2sf);
    case V4SFmode: return (gen_aarch64_frecpev4sf);
    case DFmode:   return (gen_aarch64_frecpedf);
    case V2DFmode: return (gen_aarch64_frecpev2df);
    default: gcc_unreachable ();

typedef rtx (*recps_type) (rtx, rtx, rtx);

/* Select reciprocal series step insn depending on machine mode.  */
get_recps_type (machine_mode mode)
    case SFmode:   return (gen_aarch64_frecpssf);
    case V2SFmode: return (gen_aarch64_frecpsv2sf);
    case V4SFmode: return (gen_aarch64_frecpsv4sf);
    case DFmode:   return (gen_aarch64_frecpsdf);
    case V2DFmode: return (gen_aarch64_frecpsv2df);
    default: gcc_unreachable ();

/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));

  if (num != CONST1_RTX (mode))
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
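/* Added illustrative sketch (not part of the original file): a simplified
   scalar model of the reciprocal series above, assuming the architectural
   semantics FRECPE (initial estimate of 1/d) and FRECPS (d, e) = 2.0 - d * e.
   The division num / den is then approximated as num * (1/den).  */
static double ATTRIBUTE_UNUSED
aarch64_recip_div_sketch (double num, double den)
{
  double e = 1.0 / den;		/* Stand-in for the FRECPE estimate.  */
  for (int i = 0; i < 3; i++)	/* 3 steps for DFmode, 2 for SFmode.  */
    e = e * (2.0 - den * e);	/* FRECPS step refines the estimate.  */
  return num * e;
}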
/* Return the number of instructions that can be issued per cycle.  */
aarch64_sched_issue_rate (void)
  return aarch64_tune_params.issue_rate;

aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);

/* Vectorizer cost model target hooks.  */

/* Implement targetm.vectorize.builtin_vectorization_cost.  */
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
  switch (type_of_cost)
	return aarch64_tune_params.vec_costs->scalar_stmt_cost;

	return aarch64_tune_params.vec_costs->scalar_load_cost;

	return aarch64_tune_params.vec_costs->scalar_store_cost;

	return aarch64_tune_params.vec_costs->vec_stmt_cost;

	return aarch64_tune_params.vec_costs->vec_align_load_cost;

	return aarch64_tune_params.vec_costs->vec_store_cost;

	return aarch64_tune_params.vec_costs->vec_to_scalar_cost;

	return aarch64_tune_params.vec_costs->scalar_to_vec_cost;

      case unaligned_load:
	return aarch64_tune_params.vec_costs->vec_unalign_load_cost;

      case unaligned_store:
	return aarch64_tune_params.vec_costs->vec_unalign_store_cost;

      case cond_branch_taken:
	return aarch64_tune_params.vec_costs->cond_taken_branch_cost;

      case cond_branch_not_taken:
	return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;

	return aarch64_tune_params.vec_costs->vec_permute_cost;

      case vec_promote_demote:
	return aarch64_tune_params.vec_costs->vec_stmt_cost;

	elements = TYPE_VECTOR_SUBPARTS (vectype);
	return elements / 2 + 1;
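	/* Added illustrative note: for the last case above, if VECTYPE is a
	   four-element vector then TYPE_VECTOR_SUBPARTS is 4 and the returned
	   cost is 4 / 2 + 1 = 3, i.e. the cost grows with the number of
	   elements involved.  */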
/* Implement targetm.vectorize.add_stmt_cost.  */
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
		       struct _stmt_vec_info *stmt_info, int misalign,
		       enum vect_cost_model_location where)
  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)
      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
      int stmt_cost =
	aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
	 vectorized are weighted more heavily.  The value here is
	 arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
	count *= 50;  /* FIXME  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
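      /* Added illustrative note: with the weighting above, a statement in an
	 inner loop counted twice (COUNT == 2) contributes 2 * 50 * stmt_cost
	 to the vect_body bucket, whereas the same statement outside an inner
	 loop would contribute only 2 * stmt_cost.  */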
static void initialize_aarch64_code_model (struct gcc_options *);

/* Parse the TO_PARSE string and put the architecture struct that it
   selects into RES and the architectural features into ISA_FLAGS.
   Return an aarch64_parse_opt_result describing the parse result.
   If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
		    unsigned long *isa_flags)
  const struct processor *arch;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  ext = strchr (str, '+');

      return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported ARCHes to find a match.  */
  for (arch = all_architectures; arch->name != NULL; arch++)
      if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
	  unsigned long isa_temp = arch->flags;

	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp);

	      if (ext_res != AARCH64_PARSE_OK)

	  /* Extension parsing was successful.  Confirm the result
	     arch and ISA flags.  */
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;

  /* ARCH name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;

/* Parse the TO_PARSE string and put the cpu struct that it selects into RES
   and the architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
   describing the parse result.  If there is an error parsing, RES and
   ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
		   unsigned long *isa_flags)
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  ext = strchr (str, '+');

      return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
	  unsigned long isa_temp = cpu->flags;

	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp);

	      if (ext_res != AARCH64_PARSE_OK)

	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;

/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
      if (strcmp (cpu->name, str) == 0)
	  return AARCH64_PARSE_OK;

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;

/* Parse TOKEN, which has length LENGTH to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

aarch64_parse_one_option_token (const char *token,
				const struct aarch64_flag_desc *flag,
				const char *option_name)
  for (; flag->name != NULL; flag++)
      if (length == strlen (flag->name)
	  && !strncmp (flag->name, token, length))

  error ("unknown flag passed in -moverride=%s (%s)", option_name, token);

/* Parse OPTION which is a '.'-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

aarch64_parse_boolean_options (const char *option,
			       const struct aarch64_flag_desc *flags,
			       unsigned int initial_state,
			       const char *option_name)
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,
							   token_length,
							   flags,
							   option_name);

      /* If we find "none" (or, for simplicity's sake, an error) anywhere
	 in the token stream, reset the supported operations.  So:

	   adrp+add.cmp+branch.none.adrp+add

	 would have the result of turning on only adrp+add fusion.  */
      found_flags |= token_ops;

  /* We ended with a trailing separator, print something.  */
      error ("%s string ill-formed\n", option_name);

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,
						       token_length,
						       flags,
						       option_name);

  found_flags |= token_ops;
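/* Added illustrative note: with the '.' separator above, a string such as
   "adrp+add.cmp+branch" is split into the tokens "adrp+add" and "cmp+branch",
   each looked up in FLAGS and OR-ed into FOUND_FLAGS, while a "none" token
   resets the accumulated state as described in the comment above.  */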
/* Support for overriding instruction fusion.  */

aarch64_parse_fuse_string (const char *fuse_string,
			   struct tune_params *tune)
  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
						     aarch64_fusible_pairs,

/* Support for overriding other tuning flags.  */

aarch64_parse_tune_string (const char *tune_string,
			   struct tune_params *tune)
  tune->extra_tuning_flags
    = aarch64_parse_boolean_options (tune_string,
				     aarch64_tuning_flags,
				     tune->extra_tuning_flags,

/* Parse TOKEN, which has length LENGTH to see if it is a tuning option
   we understand.  If it is, extract the option string and handoff to
   the appropriate function.  */

aarch64_parse_one_override_token (const char* token,
				  struct tune_params *tune)
  const struct aarch64_tuning_override_function *fn
    = aarch64_tuning_override_functions;

  const char *option_part = strchr (token, '=');
      error ("tuning string missing in option (%s)", token);

  /* Get the length of the option name.  */
  length = option_part - token;
  /* Skip the '=' to get to the option string.  */

  for (; fn->name != NULL; fn++)
      if (!strncmp (fn->name, token, length))
	  fn->parse_override (option_part, tune);

  error ("unknown tuning option (%s)", token);

/* A checking mechanism for the implementation of the tls size.  */

initialize_aarch64_tls_size (struct gcc_options *opts)
  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)
      case AARCH64_CMODEL_TINY:
	/* Both the default and maximum TLS size allowed under tiny is 1M which
	   needs two instructions to address, so we clamp the size to 24.  */
	if (aarch64_tls_size > 24)
	  aarch64_tls_size = 24;

      case AARCH64_CMODEL_SMALL:
	/* The maximum TLS size allowed under small is 4G.  */
	if (aarch64_tls_size > 32)
	  aarch64_tls_size = 32;

      case AARCH64_CMODEL_LARGE:
	/* The maximum TLS size allowed under large is 16E.
	   FIXME: 16E should be 64bit, we only support 48bit offset now.  */
	if (aarch64_tls_size > 48)
	  aarch64_tls_size = 48;

/* Parse STRING looking for options in the format:
     string	:: option:string
     option	:: name=substring
     substring	:: defined by option.  */

aarch64_parse_override_string (const char* input_string,
			       struct tune_params* tune)
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      aarch64_parse_one_override_token (string, token_length, tune);

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
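/* Added illustrative note: following the grammar above, an -moverride
   argument is a ':'-separated list of name=value options, each dispatched to
   the matching entry in aarch64_tuning_override_functions.  For example
   (assuming a "fuse" override name is registered in that table),
   -moverride=fuse=adrp+add.cmp+branch is handled as the single option
   "fuse=adrp+add.cmp+branch" and handed to the fusion parser above.  */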
aarch64_override_options_after_change_1 (struct gcc_options *opts)
  /* The logic here is that if we are disabling all frame pointer generation
     then we do not need to disable leaf frame pointer generation as a
     separate operation.  But if we are *only* disabling leaf frame pointer
     generation then we set flag_omit_frame_pointer to true, but in
     aarch64_frame_pointer_required we return false only for leaf functions.

     PR 70044: We have to be careful about being called multiple times for the
     same function.  Once we have decided to set flag_omit_frame_pointer just
     so that we can omit leaf frame pointers, we must then not interpret a
     second call as meaning that all frame pointer generation should be
     omitted.  We do this by setting flag_omit_frame_pointer to a special,
     otherwise unused value (2).  */
  if (opts->x_flag_omit_frame_pointer == 2)
    opts->x_flag_omit_frame_pointer = 0;

  if (opts->x_flag_omit_frame_pointer)
    opts->x_flag_omit_leaf_frame_pointer = false;
  else if (opts->x_flag_omit_leaf_frame_pointer)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
      if (opts->x_align_loops <= 0)
	opts->x_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_align_jumps <= 0)
	opts->x_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_align_functions <= 0)
	opts->x_align_functions = aarch64_tune_params.function_align;

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* This is PR70113.  When building the Linux kernel with
     CONFIG_ARM64_ERRATUM_843419, support for relocations
     R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
     removed from the kernel to avoid loading objects with possibly
     offending sequences.  Without -mpc-relative-literal-loads we would
     generate such relocations, preventing the kernel build from
     succeeding.  */
  if (opts->x_pcrelative_literal_loads == 2
      && TARGET_FIX_ERR_A53_843419)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
/* 'Unpack' the internal tuning structs and update the options
   in OPTS.  The caller must have set up selected_tune and selected_arch
   as all the other target-specific codegen decisions are
   derived from them.  */

aarch64_override_options_internal (struct gcc_options *opts)
  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
				   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
      case tune_params::AUTOPREFETCHER_OFF:

      case tune_params::AUTOPREFETCHER_WEAK:

      case tune_params::AUTOPREFETCHER_STRONG:
	queue_depth = max_insn_queue_index + 1;

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
			 queue_depth,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* Set the L1 cache line size.  */
  if (selected_cpu->tune->cache_line_size != 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
			   selected_cpu->tune->cache_line_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);

  aarch64_override_options_after_change_1 (opts);
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

  const char *hint = candidates_list_and_hint (str, s, candidates);

  inform (input_location, "valid arguments are: %s;"
	  " did you mean %qs?", s, hint);

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

aarch64_print_hint_for_core (const char *str)
  aarch64_print_hint_for_core_or_arch (str, false);

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

aarch64_print_hint_for_arch (const char *str)
  aarch64_print_hint_for_core_or_arch (str, true);

/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

aarch64_validate_mcpu (const char *str, const struct processor **res,
		       unsigned long *isa_flags)
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)

      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in -mcpu=%qs", str);

      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mcpu", str);
	aarch64_print_hint_for_core (str);

      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in -mcpu=%qs", str);

/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

aarch64_validate_march (const char *str, const struct processor **res,
			unsigned long *isa_flags)
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)

      case AARCH64_PARSE_MISSING_ARG:
	error ("missing arch name in -march=%qs", str);

      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -march", str);
	aarch64_print_hint_for_arch (str);

      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in -march=%qs", str);

/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

aarch64_validate_mtune (const char *str, const struct processor **res)
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)

      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in -mtune=%qs", str);

      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mtune", str);
	aarch64_print_hint_for_core (str);
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option.

     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];

/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

aarch64_override_options (void)
  unsigned long cpu_isa = 0;
  unsigned long arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */

      selected_cpu = &all_cores[selected_arch->ident];
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;

      /* Get default configure-time CPU.  */
      selected_cpu = aarch64_get_tune_cpu (aarch64_none);
      aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;

    explicit_tune_core = selected_tune->ident;

  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
      if (selected_arch->arch != selected_cpu->arch)
	  warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);

      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;

      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;

  /* Set the arch as well as we will need it when outputting
     the .arch directive in assembly.  */
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];

    selected_tune = selected_cpu;

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
    error ("Assembler does not support -mabi=ilp32");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
/* Implement targetm.override_options_after_change.  */

aarch64_override_options_after_change (void)
  aarch64_override_options_after_change_1 (&global_options);

static struct machine_function *
aarch64_init_machine_status (void)
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();

aarch64_init_expanders (void)
  init_machine_status = aarch64_init_machine_status;

/* A checking mechanism for the implementation of the various code models.  */

initialize_aarch64_code_model (struct gcc_options *opts)
  if (opts->x_flag_pic)
      switch (opts->x_aarch64_cmodel_var)
	  case AARCH64_CMODEL_TINY:
	    aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;

	  case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	    aarch64_cmodel = (flag_pic == 2
			      ? AARCH64_CMODEL_SMALL_PIC
			      : AARCH64_CMODEL_SMALL_SPIC);
	    aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;

	  case AARCH64_CMODEL_LARGE:
	    sorry ("code model %qs with -f%s", "large",
		   opts->x_flag_pic > 1 ? "PIC" : "pic");

    aarch64_cmodel = opts->x_aarch64_cmodel_var;

/* Implement TARGET_OPTION_SAVE.  */

aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;

/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  aarch64_override_options_internal (opts);

/* Implement TARGET_OPTION_PRINT.  */

aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  unsigned long isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
static GTY(()) tree aarch64_previous_fndecl;

aarch64_reset_previous_fndecl (void)
  aarch64_previous_fndecl = NULL;

/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

aarch64_save_restore_target_globals (tree new_tree)
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);

    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();

/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

aarch64_set_current_function (tree fndecl)
  if (!fndecl || fndecl == aarch64_previous_fndecl)

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
8790 /* Enum describing the various ways we can handle attributes.
8791 In many cases we can reuse the generic option handling machinery. */
8793 enum aarch64_attr_opt_type
8795 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
8796 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
8797 aarch64_attr_enum
, /* Attribute sets an enum variable. */
8798 aarch64_attr_custom
/* Attribute requires a custom handling function. */
8801 /* All the information needed to handle a target attribute.
8802 NAME is the name of the attribute.
8803 ATTR_TYPE specifies the type of behavior of the attribute as described
8804 in the definition of enum aarch64_attr_opt_type.
8805 ALLOW_NEG is true if the attribute supports a "no-" form.
8806 HANDLER is the function that takes the attribute string and whether
8807 it is a pragma or attribute and handles the option. It is needed only
8808 when the ATTR_TYPE is aarch64_attr_custom.
8809 OPT_NUM is the enum specifying the option that the attribute modifies.
8810 This is needed for attributes that mirror the behavior of a command-line
8811 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8812 aarch64_attr_enum. */
8814 struct aarch64_attribute_info
8817 enum aarch64_attr_opt_type attr_type
;
8819 bool (*handler
) (const char *, const char *);
8820 enum opt_code opt_num
;
8823 /* Handle the ARCH_STR argument to the arch= target attribute.
8824 PRAGMA_OR_ATTR is used in potential error messages. */
8827 aarch64_handle_attr_arch (const char *str
, const char *pragma_or_attr
)
8829 const struct processor
*tmp_arch
= NULL
;
8830 enum aarch64_parse_opt_result parse_res
8831 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
8833 if (parse_res
== AARCH64_PARSE_OK
)
8835 gcc_assert (tmp_arch
);
8836 selected_arch
= tmp_arch
;
8837 explicit_arch
= selected_arch
->arch
;
8843 case AARCH64_PARSE_MISSING_ARG
:
8844 error ("missing architecture name in 'arch' target %s", pragma_or_attr
);
8846 case AARCH64_PARSE_INVALID_ARG
:
8847 error ("unknown value %qs for 'arch' target %s", str
, pragma_or_attr
);
8848 aarch64_print_hint_for_arch (str
);
8850 case AARCH64_PARSE_INVALID_FEATURE
:
8851 error ("invalid feature modifier %qs for 'arch' target %s",
8852 str
, pragma_or_attr
);
8861 /* Handle the argument CPU_STR to the cpu= target attribute.
8862 PRAGMA_OR_ATTR is used in potential error messages. */
8865 aarch64_handle_attr_cpu (const char *str
, const char *pragma_or_attr
)
8867 const struct processor
*tmp_cpu
= NULL
;
8868 enum aarch64_parse_opt_result parse_res
8869 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
8871 if (parse_res
== AARCH64_PARSE_OK
)
8873 gcc_assert (tmp_cpu
);
8874 selected_tune
= tmp_cpu
;
8875 explicit_tune_core
= selected_tune
->ident
;
8877 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
8878 explicit_arch
= selected_arch
->arch
;
8884 case AARCH64_PARSE_MISSING_ARG
:
8885 error ("missing cpu name in 'cpu' target %s", pragma_or_attr
);
8887 case AARCH64_PARSE_INVALID_ARG
:
8888 error ("unknown value %qs for 'cpu' target %s", str
, pragma_or_attr
);
8889 aarch64_print_hint_for_core (str
);
8891 case AARCH64_PARSE_INVALID_FEATURE
:
8892 error ("invalid feature modifier %qs for 'cpu' target %s",
8893 str
, pragma_or_attr
);
8902 /* Handle the argument STR to the tune= target attribute.
8903 PRAGMA_OR_ATTR is used in potential error messages. */
8906 aarch64_handle_attr_tune (const char *str
, const char *pragma_or_attr
)
8908 const struct processor
*tmp_tune
= NULL
;
8909 enum aarch64_parse_opt_result parse_res
8910 = aarch64_parse_tune (str
, &tmp_tune
);
8912 if (parse_res
== AARCH64_PARSE_OK
)
8914 gcc_assert (tmp_tune
);
8915 selected_tune
= tmp_tune
;
8916 explicit_tune_core
= selected_tune
->ident
;
8922 case AARCH64_PARSE_INVALID_ARG
:
8923 error ("unknown value %qs for 'tune' target %s", str
, pragma_or_attr
);
8924 aarch64_print_hint_for_core (str
);
8933 /* Parse an architecture extensions target attribute string specified in STR.
8934 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8935 if successful. Update aarch64_isa_flags to reflect the ISA features
8937 PRAGMA_OR_ATTR is used in potential error messages. */
8940 aarch64_handle_attr_isa_flags (char *str
, const char *pragma_or_attr
)
8942 enum aarch64_parse_opt_result parse_res
;
8943 unsigned long isa_flags
= aarch64_isa_flags
;
8945 /* We allow "+nothing" in the beginning to clear out all architectural
8946 features if the user wants to handpick specific features. */
8947 if (strncmp ("+nothing", str
, 8) == 0)
8953 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
8955 if (parse_res
== AARCH64_PARSE_OK
)
8957 aarch64_isa_flags
= isa_flags
;
8963 case AARCH64_PARSE_MISSING_ARG
:
8964 error ("missing feature modifier in target %s %qs",
8965 pragma_or_attr
, str
);
8968 case AARCH64_PARSE_INVALID_FEATURE
:
8969 error ("invalid feature modifier in target %s %qs",
8970 pragma_or_attr
, str
);
8980 /* The target attributes that we support. On top of these we also support just
8981 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8982 handled explicitly in aarch64_process_one_target_attr. */
8984 static const struct aarch64_attribute_info aarch64_attributes
[] =
8986 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
8987 OPT_mgeneral_regs_only
},
8988 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
8989 OPT_mfix_cortex_a53_835769
},
8990 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
8991 OPT_mfix_cortex_a53_843419
},
8992 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
8993 { "strict-align", aarch64_attr_mask
, false, NULL
, OPT_mstrict_align
},
8994 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
8995 OPT_momit_leaf_frame_pointer
},
8996 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
8997 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
8999 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
9000 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
9002 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.
   PRAGMA_OR_ATTR holds the string to use in error messages about whether
   we're processing a target attribute or pragma.  */

static bool
aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed target %s", pragma_or_attr);
      return false;
    }

  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* Skip leading whitespace.  */
  while (*str_to_check == ' ' || *str_to_check == '\t')
    str_to_check++;

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);

  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			      || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  error ("target %s %qs does not accept an argument",
		 pragma_or_attr, str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("target %s %qs does not allow a negated form",
		 pragma_or_attr, str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	  case aarch64_attr_custom:
	    gcc_assert (p_attr->handler);
	    if (!p_attr->handler (arg, pragma_or_attr))
	      return false;
	    break;

	  /* Either set or unset a boolean option.  */
	  case aarch64_attr_bool:
	    {
	      struct cl_decoded_option decoded;

	      generate_option (p_attr->opt_num, NULL, !invert,
			       CL_TARGET, &decoded);
	      aarch64_handle_option (&global_options, &global_options_set,
				     &decoded, input_location);
	      break;
	    }
	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
	     should know what mask to apply given the option number.  */
	  case aarch64_attr_mask:
	    {
	      struct cl_decoded_option decoded;
	      /* We only need to specify the option number.
		 aarch64_handle_option will know which mask to apply.  */
	      decoded.opt_index = p_attr->opt_num;
	      decoded.value = !invert;
	      aarch64_handle_option (&global_options, &global_options_set,
				     &decoded, input_location);
	      break;
	    }
	  /* Use the option setting machinery to set an option to an enum.  */
	  case aarch64_attr_enum:
	    {
	      gcc_assert (arg);
	      bool valid;
	      int value;
	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					     &value, CL_TARGET);
	      if (valid)
		set_option (&global_options, NULL, p_attr->opt_num, value,
			    NULL, DK_UNSPECIFIED, input_location,
			    global_dc);
	      else
		error ("target %s %s=%s is not valid",
		       pragma_or_attr, str_to_check, arg);
	      break;
	    }
	  default:
	    gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return true;
}
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  PRAGMA_OR_ATTR is a string
   to be used in error messages, specifying whether this is processing
   a target attribute or a target pragma.  */

bool
aarch64_process_target_attr (tree args, const char* pragma_or_attr)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head, pragma_or_attr))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }
  /* We expect to find a string to parse.  */
  gcc_assert (TREE_CODE (args) == STRING_CST);

  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed target %s value", pragma_or_attr);
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok (str_to_check, ",");

  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token, pragma_or_attr))
	{
	  error ("target %s %qs is invalid", pragma_or_attr, token);
	  return false;
	}

      token = strtok (NULL, ",");
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed target %s list %qs",
	     pragma_or_attr, TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
9231 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9232 process attribute ((target ("..."))). */
9235 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
9237 struct cl_target_option cur_target
;
9240 tree new_target
, new_optimize
;
9241 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9243 /* If what we're processing is the current pragma string then the
9244 target option node is already stored in target_option_current_node
9245 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9246 having to re-parse the string. This is especially useful to keep
9247 arm_neon.h compile times down since that header contains a lot
9248 of intrinsics enclosed in pragmas. */
9249 if (!existing_target
&& args
== current_target_pragma
)
9251 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
9254 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9256 old_optimize
= build_optimization_node (&global_options
);
9257 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9259 /* If the function changed the optimization levels as well as setting
9260 target options, start with the optimizations specified. */
9261 if (func_optimize
&& func_optimize
!= old_optimize
)
9262 cl_optimization_restore (&global_options
,
9263 TREE_OPTIMIZATION (func_optimize
));
9265 /* Save the current target options to restore at the end. */
9266 cl_target_option_save (&cur_target
, &global_options
);
9268 /* If fndecl already has some target attributes applied to it, unpack
9269 them so that we add this attribute on top of them, rather than
9270 overwriting them. */
9271 if (existing_target
)
9273 struct cl_target_option
*existing_options
9274 = TREE_TARGET_OPTION (existing_target
);
9276 if (existing_options
)
9277 cl_target_option_restore (&global_options
, existing_options
);
9280 cl_target_option_restore (&global_options
,
9281 TREE_TARGET_OPTION (target_option_current_node
));
9284 ret
= aarch64_process_target_attr (args
, "attribute");
9286 /* Set up any additional state. */
9289 aarch64_override_options_internal (&global_options
);
9290 /* Initialize SIMD builtins if we haven't already.
9291 Set current_target_pragma to NULL for the duration so that
9292 the builtin initialization code doesn't try to tag the functions
9293 being built with the attributes specified by any current pragma, thus
9294 going into an infinite recursion. */
9297 tree saved_current_target_pragma
= current_target_pragma
;
9298 current_target_pragma
= NULL
;
9299 aarch64_init_simd_builtins ();
9300 current_target_pragma
= saved_current_target_pragma
;
9302 new_target
= build_target_option_node (&global_options
);
9307 new_optimize
= build_optimization_node (&global_options
);
9311 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
9313 if (old_optimize
!= new_optimize
)
9314 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
9317 cl_target_option_restore (&global_options
, &cur_target
);
9319 if (old_optimize
!= new_optimize
)
9320 cl_optimization_restore (&global_options
,
9321 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
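/* Worked example (added for exposition; not part of the original source):
   aarch64_can_inline_p below passes DONT_CARE == 2 for the Cortex-A53
   errata and leaf-frame-pointer options, so a value of 2 on either side
   always permits inlining.  Otherwise inlining is permitted only when the
   callee either agrees with the caller or still holds the default DEF;
   e.g. caller == 1 with callee == 0 is accepted only when DEF == 0.  */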
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  /* If callee has no option attributes, then it is ok to inline.  */
  if (!callee_tree)
    return true;

  struct cl_target_option *caller_opts
	= TREE_TARGET_OPTION (caller_tree ? caller_tree
					  : target_option_default_node);

  struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
       != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					 DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the method that should be used to access SYMBOL_REF or
   LABEL_REF X.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, rtx offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M and anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here
	     is 4G.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
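/* Example of the resulting classifications (exposition only; not part of the
   original source): under -mcmodel=small a locally-binding "sym + 8" is
   SYMBOL_SMALL_ABSOLUTE and can be materialised with an adrp/add pair,
   while the same symbol with preemptible binding under -fPIC classifies as
   SYMBOL_SMALL_GOT_4G and is loaded through the GOT instead.  */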
/* Return true if X is a constant that is also a valid DImode address.  */
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
	  && GET_CODE (XEXP (x, 0)) == PLUS
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}
/* Return true if X holds either a quarter-precision or
     floating-point +0.0 constant.  */
static bool
aarch64_valid_floating_const (machine_mode mode, rtx x)
{
  if (!CONST_DOUBLE_P (x))
    return false;

  if (aarch64_float_const_zero_rtx_p (x))
    return true;

  /* We only handle moving 0.0 to a TFmode register.  */
  if (!(mode == SFmode || mode == DFmode))
    return false;

  return aarch64_float_const_representable_p (x);
}
static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Do not allow vector struct mode constants.  We could support
     0 and -1 easily, but they need support in aarch64-simd.md.  */
  if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
    return false;

  /* This could probably go away because
     we now decompose CONST_INTs according to expand_mov_immediate.  */
  if ((GET_CODE (x) == CONST_VECTOR
       && aarch64_simd_valid_immediate (x, mode, false, NULL))
      || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
    return !targetm.cannot_force_const_mem (mode, x);

  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  return aarch64_constant_address_p (x);
}
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

       struct  __va_list
       {
	 void *__stack;
	 void *__gr_top;
	 void *__vr_top;
	 int   __gr_offs;
	 int   __vr_offs;
       };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
9687 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
9689 /* Create the type. */
9690 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
9691 /* Give it the required name. */
9692 va_list_name
= build_decl (BUILTINS_LOCATION
,
9694 get_identifier ("__va_list"),
9696 DECL_ARTIFICIAL (va_list_name
) = 1;
9697 TYPE_NAME (va_list_type
) = va_list_name
;
9698 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
9700 /* Create the fields. */
9701 f_stack
= build_decl (BUILTINS_LOCATION
,
9702 FIELD_DECL
, get_identifier ("__stack"),
9704 f_grtop
= build_decl (BUILTINS_LOCATION
,
9705 FIELD_DECL
, get_identifier ("__gr_top"),
9707 f_vrtop
= build_decl (BUILTINS_LOCATION
,
9708 FIELD_DECL
, get_identifier ("__vr_top"),
9710 f_groff
= build_decl (BUILTINS_LOCATION
,
9711 FIELD_DECL
, get_identifier ("__gr_offs"),
9713 f_vroff
= build_decl (BUILTINS_LOCATION
,
9714 FIELD_DECL
, get_identifier ("__vr_offs"),
  /* Tell the tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating the va_list internal
     offset fields through an irregular way.  */
9721 va_list_gpr_counter_field
= f_groff
;
9722 va_list_fpr_counter_field
= f_vroff
;
9724 DECL_ARTIFICIAL (f_stack
) = 1;
9725 DECL_ARTIFICIAL (f_grtop
) = 1;
9726 DECL_ARTIFICIAL (f_vrtop
) = 1;
9727 DECL_ARTIFICIAL (f_groff
) = 1;
9728 DECL_ARTIFICIAL (f_vroff
) = 1;
9730 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
9731 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
9732 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
9733 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
9734 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
9736 TYPE_FIELDS (va_list_type
) = f_stack
;
9737 DECL_CHAIN (f_stack
) = f_grtop
;
9738 DECL_CHAIN (f_grtop
) = f_vrtop
;
9739 DECL_CHAIN (f_vrtop
) = f_groff
;
9740 DECL_CHAIN (f_groff
) = f_vroff
;
9742 /* Compute its layout. */
9743 layout_type (va_list_type
);
9745 return va_list_type
;
9748 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9750 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
9752 const CUMULATIVE_ARGS
*cum
;
9753 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
9754 tree stack
, grtop
, vrtop
, groff
, vroff
;
9756 int gr_save_area_size
= cfun
->va_list_gpr_size
;
9757 int vr_save_area_size
= cfun
->va_list_fpr_size
;
9760 cum
= &crtl
->args
.info
;
9761 if (cfun
->va_list_gpr_size
)
9762 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
9763 cfun
->va_list_gpr_size
);
9764 if (cfun
->va_list_fpr_size
)
9765 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
9766 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
9770 gcc_assert (cum
->aapcs_nvrn
== 0);
9771 vr_save_area_size
= 0;
9774 f_stack
= TYPE_FIELDS (va_list_type_node
);
9775 f_grtop
= DECL_CHAIN (f_stack
);
9776 f_vrtop
= DECL_CHAIN (f_grtop
);
9777 f_groff
= DECL_CHAIN (f_vrtop
);
9778 f_vroff
= DECL_CHAIN (f_groff
);
9780 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
9782 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
9784 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
9786 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
9788 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
9791 /* Emit code to initialize STACK, which points to the next varargs stack
9792 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9793 by named arguments. STACK is 8-byte aligned. */
9794 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
9795 if (cum
->aapcs_stack_size
> 0)
9796 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
9797 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
9798 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
9800 /* Emit code to initialize GRTOP, the top of the GR save area.
9801 virtual_incoming_args_rtx should have been 16 byte aligned. */
9802 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
9803 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
9804 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
9806 /* Emit code to initialize VRTOP, the top of the VR save area.
9807 This address is gr_save_area_bytes below GRTOP, rounded
9808 down to the next 16-byte boundary. */
9809 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
9810 vr_offset
= ROUND_UP (gr_save_area_size
,
9811 STACK_BOUNDARY
/ BITS_PER_UNIT
);
9814 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
9815 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
9816 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
9818 /* Emit code to initialize GROFF, the offset from GRTOP of the
9819 next GPR argument. */
9820 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
9821 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
9822 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
9824 /* Likewise emit code to initialize VROFF, the offset from FTOP
9825 of the next VR argument. */
9826 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
9827 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
9828 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
9831 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9834 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
9835 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
9839 bool is_ha
; /* is HFA or HVA. */
9840 bool dw_align
; /* double-word align. */
9841 machine_mode ag_mode
= VOIDmode
;
9845 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
9846 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
9847 HOST_WIDE_INT size
, rsize
, adjust
, align
;
9848 tree t
, u
, cond1
, cond2
;
9850 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
9852 type
= build_pointer_type (type
);
9854 mode
= TYPE_MODE (type
);
9856 f_stack
= TYPE_FIELDS (va_list_type_node
);
9857 f_grtop
= DECL_CHAIN (f_stack
);
9858 f_vrtop
= DECL_CHAIN (f_grtop
);
9859 f_groff
= DECL_CHAIN (f_vrtop
);
9860 f_vroff
= DECL_CHAIN (f_groff
);
9862 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
9863 f_stack
, NULL_TREE
);
9864 size
= int_size_in_bytes (type
);
9865 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
9869 if (aarch64_vfp_is_call_or_return_candidate (mode
,
9875 /* TYPE passed in fp/simd registers. */
9877 aarch64_err_no_fpadvsimd (mode
, "varargs");
9879 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
9880 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
9881 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
9882 unshare_expr (valist
), f_vroff
, NULL_TREE
);
9884 rsize
= nregs
* UNITS_PER_VREG
;
9888 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
9889 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
9891 else if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
9892 && size
< UNITS_PER_VREG
)
9894 adjust
= UNITS_PER_VREG
- size
;
9899 /* TYPE passed in general registers. */
9900 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
9901 unshare_expr (valist
), f_grtop
, NULL_TREE
);
9902 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
9903 unshare_expr (valist
), f_groff
, NULL_TREE
);
9904 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
9905 nregs
= rsize
/ UNITS_PER_WORD
;
9910 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
9911 && size
< UNITS_PER_WORD
)
9913 adjust
= UNITS_PER_WORD
- size
;
9917 /* Get a local temporary for the field value. */
9918 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
9920 /* Emit code to branch if off >= 0. */
9921 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
9922 build_int_cst (TREE_TYPE (off
), 0));
9923 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
9927 /* Emit: offs = (offs + 15) & -16. */
9928 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
9929 build_int_cst (TREE_TYPE (off
), 15));
9930 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
9931 build_int_cst (TREE_TYPE (off
), -16));
9932 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
9937 /* Update ap.__[g|v]r_offs */
9938 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
9939 build_int_cst (TREE_TYPE (off
), rsize
));
9940 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
9944 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
9946 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9947 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
9948 build_int_cst (TREE_TYPE (f_off
), 0));
9949 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
9951 /* String up: make sure the assignment happens before the use. */
9952 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
9953 COND_EXPR_ELSE (cond1
) = t
;
9955 /* Prepare the trees handling the argument that is passed on the stack;
9956 the top level node will store in ON_STACK. */
9957 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
9960 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9961 t
= fold_convert (intDI_type_node
, arg
);
9962 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
9963 build_int_cst (TREE_TYPE (t
), 15));
9964 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
9965 build_int_cst (TREE_TYPE (t
), -16));
9966 t
= fold_convert (TREE_TYPE (arg
), t
);
9967 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
9971 /* Advance ap.__stack */
9972 t
= fold_convert (intDI_type_node
, arg
);
9973 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
9974 build_int_cst (TREE_TYPE (t
), size
+ 7));
9975 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
9976 build_int_cst (TREE_TYPE (t
), -8));
9977 t
= fold_convert (TREE_TYPE (arg
), t
);
9978 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
9979 /* String up roundup and advance. */
9981 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
9982 /* String up with arg */
9983 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
9984 /* Big-endianness related address adjustment. */
9985 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
9986 && size
< UNITS_PER_WORD
)
9988 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
9989 size_int (UNITS_PER_WORD
- size
));
9990 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
9993 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
9994 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
9996 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9999 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
10000 build_int_cst (TREE_TYPE (off
), adjust
));
10002 t
= fold_convert (sizetype
, t
);
10003 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
10007 /* type ha; // treat as "struct {ftype field[n];}"
10008 ... [computing offs]
10009 for (i = 0; i <nregs; ++i, offs += 16)
10010 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10013 tree tmp_ha
, field_t
, field_ptr_t
;
10015 /* Declare a local variable. */
10016 tmp_ha
= create_tmp_var_raw (type
, "ha");
10017 gimple_add_tmp_var (tmp_ha
);
10019 /* Establish the base type. */
10023 field_t
= float_type_node
;
10024 field_ptr_t
= float_ptr_type_node
;
10027 field_t
= double_type_node
;
10028 field_ptr_t
= double_ptr_type_node
;
10031 field_t
= long_double_type_node
;
10032 field_ptr_t
= long_double_ptr_type_node
;
10035 field_t
= aarch64_fp16_type_node
;
10036 field_ptr_t
= aarch64_fp16_ptr_type_node
;
10041 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
10042 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
10043 field_ptr_t
= build_pointer_type (field_t
);
10050 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10051 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
10053 t
= fold_convert (field_ptr_t
, addr
);
10054 t
= build2 (MODIFY_EXPR
, field_t
,
10055 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
10056 build1 (INDIRECT_REF
, field_t
, t
));
10058 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10059 for (i
= 1; i
< nregs
; ++i
)
10061 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
10062 u
= fold_convert (field_ptr_t
, addr
);
10063 u
= build2 (MODIFY_EXPR
, field_t
,
10064 build2 (MEM_REF
, field_t
, tmp_ha
,
10065 build_int_cst (field_ptr_t
,
10067 int_size_in_bytes (field_t
)))),
10068 build1 (INDIRECT_REF
, field_t
, u
));
10069 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
10072 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
10073 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
10076 COND_EXPR_ELSE (cond2
) = t
;
10077 addr
= fold_convert (build_pointer_type (type
), cond1
);
10078 addr
= build_va_arg_indirect_ref (addr
);
10081 addr
= build_va_arg_indirect_ref (addr
);
10086 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10089 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
10090 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
10093 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
10094 CUMULATIVE_ARGS local_cum
;
10095 int gr_saved
= cfun
->va_list_gpr_size
;
10096 int vr_saved
= cfun
->va_list_fpr_size
;
10098 /* The caller has advanced CUM up to, but not beyond, the last named
10099 argument. Advance a local copy of CUM past the last "real" named
10100 argument, to find out how many registers are left over. */
10102 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
  /* Find out how many registers we need to save.
     Honor tree-stdarg analysis results.  */
10106 if (cfun
->va_list_gpr_size
)
10107 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
10108 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
10109 if (cfun
->va_list_fpr_size
)
10110 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
10111 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
10115 gcc_assert (local_cum
.aapcs_nvrn
== 0);
10125 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10126 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
10127 - gr_saved
* UNITS_PER_WORD
);
10128 mem
= gen_frame_mem (BLKmode
, ptr
);
10129 set_mem_alias_set (mem
, get_varargs_alias_set ());
10131 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
10136 /* We can't use move_block_from_reg, because it will use
10137 the wrong mode, storing D regs only. */
10138 machine_mode mode
= TImode
;
10139 int off
, i
, vr_start
;
10141 /* Set OFF to the offset from virtual_incoming_args_rtx of
10142 the first vector register. The VR save area lies below
10143 the GR one, and is aligned to 16 bytes. */
10144 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10145 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10146 off
-= vr_saved
* UNITS_PER_VREG
;
10148 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
10149 for (i
= 0; i
< vr_saved
; ++i
)
10153 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
10154 mem
= gen_frame_mem (mode
, ptr
);
10155 set_mem_alias_set (mem
, get_varargs_alias_set ());
10156 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
10157 off
+= UNITS_PER_VREG
;
10162 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10163 any complication of having crtl->args.pretend_args_size changed. */
10164 cfun
->machine
->frame
.saved_varargs_size
10165 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10166 STACK_BOUNDARY
/ BITS_PER_UNIT
)
10167 + vr_saved
* UNITS_PER_VREG
);
10171 aarch64_conditional_register_usage (void)
10176 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
10179 call_used_regs
[i
] = 1;
10184 /* Walk down the type tree of TYPE counting consecutive base elements.
10185 If *MODEP is VOIDmode, then set it to the first valid floating point
10186 type. If a non-floating point type is found, or if a floating point
10187 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10188 otherwise return the count in the sub-tree. */
10190 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
10193 HOST_WIDE_INT size
;
10195 switch (TREE_CODE (type
))
10198 mode
= TYPE_MODE (type
);
10199 if (mode
!= DFmode
&& mode
!= SFmode
10200 && mode
!= TFmode
&& mode
!= HFmode
)
10203 if (*modep
== VOIDmode
)
10206 if (*modep
== mode
)
10212 mode
= TYPE_MODE (TREE_TYPE (type
));
10213 if (mode
!= DFmode
&& mode
!= SFmode
10214 && mode
!= TFmode
&& mode
!= HFmode
)
10217 if (*modep
== VOIDmode
)
10220 if (*modep
== mode
)
10226 /* Use V2SImode and V4SImode as representatives of all 64-bit
10227 and 128-bit vector types. */
10228 size
= int_size_in_bytes (type
);
10241 if (*modep
== VOIDmode
)
10244 /* Vector modes are considered to be opaque: two vectors are
10245 equivalent for the purposes of being homogeneous aggregates
10246 if they are the same size. */
10247 if (*modep
== mode
)
10255 tree index
= TYPE_DOMAIN (type
);
10257 /* Can't handle incomplete types nor sizes that are not
10259 if (!COMPLETE_TYPE_P (type
)
10260 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10263 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
10266 || !TYPE_MAX_VALUE (index
)
10267 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
10268 || !TYPE_MIN_VALUE (index
)
10269 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
10273 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
10274 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
10276 /* There must be no padding. */
10277 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10289 /* Can't handle incomplete types nor sizes that are not
10291 if (!COMPLETE_TYPE_P (type
)
10292 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10295 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10297 if (TREE_CODE (field
) != FIELD_DECL
)
10300 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10303 count
+= sub_count
;
10306 /* There must be no padding. */
10307 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10314 case QUAL_UNION_TYPE
:
10316 /* These aren't very interesting except in a degenerate case. */
10321 /* Can't handle incomplete types nor sizes that are not
10323 if (!COMPLETE_TYPE_P (type
)
10324 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10327 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10329 if (TREE_CODE (field
) != FIELD_DECL
)
10332 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10335 count
= count
> sub_count
? count
: sub_count
;
10338 /* There must be no padding. */
10339 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10352 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10353 type as described in AAPCS64 \S 4.1.2.
10355 See the comment above aarch64_composite_type_p for the notes on MODE. */
10358 aarch64_short_vector_p (const_tree type
,
10361 HOST_WIDE_INT size
= -1;
10363 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
10364 size
= int_size_in_bytes (type
);
10365 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
10366 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
10367 size
= GET_MODE_SIZE (mode
);
10369 return (size
== 8 || size
== 16);
10372 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10373 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10374 array types. The C99 floating-point complex types are also considered
10375 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10376 types, which are GCC extensions and out of the scope of AAPCS64, are
10377 treated as composite types here as well.
10379 Note that MODE itself is not sufficient in determining whether a type
10380 is such a composite type or not. This is because
10381 stor-layout.c:compute_record_mode may have already changed the MODE
10382 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10383 structure with only one field may have its MODE set to the mode of the
10384 field. Also an integer mode whose size matches the size of the
10385 RECORD_TYPE type may be used to substitute the original mode
10386 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10387 solely relied on. */
10390 aarch64_composite_type_p (const_tree type
,
10393 if (aarch64_short_vector_p (type
, mode
))
10396 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
10399 if (mode
== BLKmode
10400 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
10401 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
10407 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10408 shall be passed or returned in simd/fp register(s) (providing these
10409 parameter passing registers are available).
10411 Upon successful return, *COUNT returns the number of needed registers,
10412 *BASE_MODE returns the mode of the individual register and when IS_HAF
10413 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10414 floating-point aggregate or a homogeneous short-vector aggregate. */
10417 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
10419 machine_mode
*base_mode
,
10423 machine_mode new_mode
= VOIDmode
;
10424 bool composite_p
= aarch64_composite_type_p (type
, mode
);
10426 if (is_ha
!= NULL
) *is_ha
= false;
10428 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10429 || aarch64_short_vector_p (type
, mode
))
10434 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
10436 if (is_ha
!= NULL
) *is_ha
= true;
10438 new_mode
= GET_MODE_INNER (mode
);
10440 else if (type
&& composite_p
)
10442 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
10444 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
10446 if (is_ha
!= NULL
) *is_ha
= true;
10455 *base_mode
= new_mode
;
10459 /* Implement TARGET_STRUCT_VALUE_RTX. */
10462 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
10463 int incoming ATTRIBUTE_UNUSED
)
10465 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
10468 /* Implements target hook vector_mode_supported_p. */
10470 aarch64_vector_mode_supported_p (machine_mode mode
)
10473 && (mode
== V4SImode
|| mode
== V8HImode
10474 || mode
== V16QImode
|| mode
== V2DImode
10475 || mode
== V2SImode
|| mode
== V4HImode
10476 || mode
== V8QImode
|| mode
== V2SFmode
10477 || mode
== V4SFmode
|| mode
== V2DFmode
10478 || mode
== V4HFmode
|| mode
== V8HFmode
10479 || mode
== V1DFmode
))
10485 /* Return appropriate SIMD container
10486 for MODE within a vector of WIDTH bits. */
10487 static machine_mode
10488 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
10490 gcc_assert (width
== 64 || width
== 128);
10529 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10530 static machine_mode
10531 aarch64_preferred_simd_mode (machine_mode mode
)
10533 return aarch64_simd_container_mode (mode
, 128);
10536 /* Return the bitmask of possible vector sizes for the vectorizer
10537 to iterate over. */
10538 static unsigned int
10539 aarch64_autovectorize_vector_sizes (void)
10544 /* Implement TARGET_MANGLE_TYPE. */
10546 static const char *
10547 aarch64_mangle_type (const_tree type
)
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
10551 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
10552 return "St9__va_list";
10554 /* Half-precision float. */
10555 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
10558 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10560 if (TYPE_NAME (type
) != NULL
)
10561 return aarch64_mangle_builtin_type (type
);
10563 /* Use the default mangling. */
10568 /* Return true if the rtx_insn contains a MEM RTX somewhere
10572 has_memory_op (rtx_insn
*mem_insn
)
10574 subrtx_iterator::array_type array
;
10575 FOR_EACH_SUBRTX (iter
, array
, PATTERN (mem_insn
), ALL
)
10582 /* Find the first rtx_insn before insn that will generate an assembly
10586 aarch64_prev_real_insn (rtx_insn
*insn
)
10593 insn
= prev_real_insn (insn
);
10595 while (insn
&& recog_memoized (insn
) < 0);
10601 is_madd_op (enum attr_type t1
)
10604 /* A number of these may be AArch32 only. */
10605 enum attr_type mlatypes
[] = {
10606 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
10607 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
10608 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
10611 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
10613 if (t1
== mlatypes
[i
])
10620 /* Check if there is a register dependency between a load and the insn
10621 for which we hold recog_data. */
10624 dep_between_memop_and_curr (rtx memop
)
10629 gcc_assert (GET_CODE (memop
) == SET
);
10631 if (!REG_P (SET_DEST (memop
)))
10634 load_reg
= SET_DEST (memop
);
10635 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
10637 rtx operand
= recog_data
.operand
[opno
];
10638 if (REG_P (operand
)
10639 && reg_overlap_mentioned_p (load_reg
, operand
))
10647 /* When working around the Cortex-A53 erratum 835769,
10648 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10649 instruction and has a preceding memory instruction such that a NOP
10650 should be inserted between them. */
10653 aarch64_madd_needs_nop (rtx_insn
* insn
)
10655 enum attr_type attr_type
;
10659 if (!TARGET_FIX_ERR_A53_835769
)
10662 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
10665 attr_type
= get_attr_type (insn
);
10666 if (!is_madd_op (attr_type
))
10669 prev
= aarch64_prev_real_insn (insn
);
10670 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10671 Restore recog state to INSN to avoid state corruption. */
10672 extract_constrain_insn_cached (insn
);
10674 if (!prev
|| !has_memory_op (prev
))
10677 body
= single_set (prev
);
10679 /* If the previous insn is a memory op and there is no dependency between
10680 it and the DImode madd, emit a NOP between them. If body is NULL then we
10681 have a complex memory operation, probably a load/store pair.
10682 Be conservative for now and emit a NOP. */
10683 if (GET_MODE (recog_data
.operand
[0]) == DImode
10684 && (!body
|| !dep_between_memop_and_curr (body
)))
10692 /* Implement FINAL_PRESCAN_INSN. */
10695 aarch64_final_prescan_insn (rtx_insn
*insn
)
10697 if (aarch64_madd_needs_nop (insn
))
10698 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Return true iff x is a uniform vector of floating-point
   constants, and the constant can be represented in
   quarter-precision form.  Note, as aarch64_float_const_representable
   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
static bool
aarch64_vect_float_const_representable_p (rtx x)
{
  rtx elt;
  return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
	  && const_vec_duplicate_p (x, &elt)
	  && aarch64_float_const_representable_p (elt));
}
10729 /* Return true for valid and false for invalid. */
10731 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
10732 struct simd_immediate_info
*info
)
10734 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10736 for (i = 0; i < idx; i += (STRIDE)) \
10741 immtype = (CLASS); \
10742 elsize = (ELSIZE); \
10743 eshift = (SHIFT); \
10748 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
10749 unsigned int innersize
= GET_MODE_UNIT_SIZE (mode
);
10750 unsigned char bytes
[16];
10751 int immtype
= -1, matches
;
10752 unsigned int invmask
= inverse
? 0xff : 0;
10755 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
10757 if (! (aarch64_simd_imm_zero_p (op
, mode
)
10758 || aarch64_vect_float_const_representable_p (op
)))
10763 info
->value
= CONST_VECTOR_ELT (op
, 0);
10764 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
10772 /* Splat vector constant out into a byte vector. */
10773 for (i
= 0; i
< n_elts
; i
++)
10775 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10776 it must be laid out in the vector register in reverse order. */
10777 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
10778 unsigned HOST_WIDE_INT elpart
;
10780 gcc_assert (CONST_INT_P (el
));
10781 elpart
= INTVAL (el
);
10783 for (unsigned int byte
= 0; byte
< innersize
; byte
++)
10785 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
10786 elpart
>>= BITS_PER_UNIT
;
10791 /* Sanity check. */
10792 gcc_assert (idx
== GET_MODE_SIZE (mode
));
10796 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
10797 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
10799 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
10800 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
10802 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
10803 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
10805 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
10806 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
10808 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
10810 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
10812 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
10813 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
10815 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
10816 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
10818 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
10819 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
10821 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
10822 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
10824 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
10826 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
10828 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
10829 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
10831 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
10832 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
10834 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
10835 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
10837 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
10838 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
10840 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
10842 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
10843 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
10852 info
->element_width
= elsize
;
10853 info
->mvn
= emvn
!= 0;
10854 info
->shift
= eshift
;
10856 unsigned HOST_WIDE_INT imm
= 0;
10858 if (immtype
>= 12 && immtype
<= 15)
10861 /* Un-invert bytes of recognized vector, if necessary. */
10863 for (i
= 0; i
< idx
; i
++)
10864 bytes
[i
] ^= invmask
;
10868 /* FIXME: Broken on 32-bit H_W_I hosts. */
10869 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
10871 for (i
= 0; i
< 8; i
++)
10872 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
10873 << (i
* BITS_PER_UNIT
);
10876 info
->value
= GEN_INT (imm
);
10880 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
10881 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
10883 /* Construct 'abcdefgh' because the assembler cannot handle
10884 generic constants. */
10887 imm
= (imm
>> info
->shift
) & 0xff;
10888 info
->value
= GEN_INT (imm
);
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}
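/* Example (exposition only; not part of the original source): for V8HImode
   the element width is 16 bits, so a vector left-shift immediate must be a
   uniform value in [0, 15] while a right-shift immediate must lie in
   [1, 16], matching the SHL versus SSHR/USHR instruction encodings.  */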
/* Return true if X is a uniform vector where all elements
   are either the floating-point constant 0.0 or the
   integer constant 0.  */
bool
aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
{
  return x == CONST0_RTX (mode);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
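/* Example (exposition only; not part of the original source): for a zero
   extract of WIDTH = 8 bits at POS = 16, the mask is ((1 << 8) - 1) << 16,
   i.e. 0xff0000, selecting exactly the byte that the extract reads.  */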
10932 aarch64_simd_imm_scalar_p (rtx x
, machine_mode mode ATTRIBUTE_UNUSED
)
10934 HOST_WIDE_INT imm
= INTVAL (x
);
10937 for (i
= 0; i
< 8; i
++)
10939 unsigned int byte
= imm
& 0xff;
10940 if (byte
!= 0xff && byte
!= 0)
10949 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
10951 if (GET_CODE (x
) == HIGH
10952 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
10955 if (CONST_INT_P (x
))
10958 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
10961 return aarch64_classify_symbolic_expression (x
)
10962 == SYMBOL_TINY_ABSOLUTE
;
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
{
  int nunits = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (nunits);
  int i;

  for (i = 0; i < nunits; i++)
    RTVEC_ELT (v, i) = GEN_INT (val);

  return gen_rtx_CONST_VECTOR (mode, v);
}
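/* Example (exposition only; not part of the original source): calling
   aarch64_simd_gen_const_vector_dup (V4SImode, 0) yields
   (const_vector:V4SI [0 0 0 0]).  aarch64_simd_scalar_immediate_valid_for_move
   below uses exactly this splat to test a scalar immediate by asking whether
   the corresponding vector constant is a valid MOVI operand.  */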
/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
{
  machine_mode vmode;

  gcc_assert (!VECTOR_MODE_P (mode));
  vmode = aarch64_preferred_simd_mode (mode);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
}
10992 /* Construct and return a PARALLEL RTX vector with elements numbering the
10993 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10994 the vector - from the perspective of the architecture. This does not
10995 line up with GCC's perspective on lane numbers, so we end up with
10996 different masks depending on our target endian-ness. The diagram
10997 below may help. We must draw the distinction when building masks
10998 which select one half of the vector. An instruction selecting
10999 architectural low-lanes for a big-endian target, must be described using
11000 a mask selecting GCC high-lanes.
11002 Big-Endian Little-Endian
11004 GCC 0 1 2 3 3 2 1 0
11005 | x | x | x | x | | x | x | x | x |
11006 Architecture 3 2 1 0 3 2 1 0
11008 Low Mask: { 2, 3 } { 0, 1 }
11009 High Mask: { 0, 1 } { 2, 3 }
11013 aarch64_simd_vect_par_cnst_half (machine_mode mode
, bool high
)
11015 int nunits
= GET_MODE_NUNITS (mode
);
11016 rtvec v
= rtvec_alloc (nunits
/ 2);
11017 int high_base
= nunits
/ 2;
11023 if (BYTES_BIG_ENDIAN
)
11024 base
= high
? low_base
: high_base
;
11026 base
= high
? high_base
: low_base
;
11028 for (i
= 0; i
< nunits
/ 2; i
++)
11029 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
11031 t1
= gen_rtx_PARALLEL (mode
, v
);
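/* Example (exposition only; not part of the original source): for V4SImode
   with HIGH == true this returns (parallel [(const_int 2) (const_int 3)])
   on little-endian and (parallel [(const_int 0) (const_int 1)]) on
   big-endian, matching the "High Mask" row of the diagram above.  */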
11035 /* Check OP for validity as a PARALLEL RTX vector with elements
11036 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11037 from the perspective of the architecture. See the diagram above
11038 aarch64_simd_vect_par_cnst_half for more details. */
11041 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
11044 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
11045 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
11046 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
11049 if (!VECTOR_MODE_P (mode
))
11052 if (count_op
!= count_ideal
)
11055 for (i
= 0; i
< count_ideal
; i
++)
11057 rtx elt_op
= XVECEXP (op
, 0, i
);
11058 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
11060 if (!CONST_INT_P (elt_op
)
11061 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
11067 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11068 HIGH (exclusive). */
11070 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
11073 HOST_WIDE_INT lane
;
11074 gcc_assert (CONST_INT_P (operand
));
11075 lane
= INTVAL (operand
);
11077 if (lane
< low
|| lane
>= high
)
11080 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
11082 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
11086 /* Return TRUE if OP is a valid vector addressing mode. */
11088 aarch64_simd_mem_operand_p (rtx op
)
11090 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
11091 || REG_P (XEXP (op
, 0)));
11094 /* Emit a register copy from operand to operand, taking care not to
11095 early-clobber source registers in the process.
11097 COUNT is the number of components into which the copy needs to be
11100 aarch64_simd_emit_reg_reg_move (rtx
*operands
, enum machine_mode mode
,
11101 unsigned int count
)
11104 int rdest
= REGNO (operands
[0]);
11105 int rsrc
= REGNO (operands
[1]);
11107 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
11109 for (i
= 0; i
< count
; i
++)
11110 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
11111 gen_rtx_REG (mode
, rsrc
+ i
));
11113 for (i
= 0; i
< count
; i
++)
11114 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
11115 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (enum machine_mode mode)
{
  return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
}

/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
  return MIN (align, 128);
}
11135 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11137 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
11142 /* We guarantee alignment for vectors up to 128-bits. */
11143 if (tree_int_cst_compare (TYPE_SIZE (type
),
11144 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
11147 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11151 /* If VALS is a vector constant that can be loaded into a register
11152 using DUP, generate instructions to do so and return an RTX to
11153 assign to the register. Otherwise return NULL_RTX. */
11155 aarch64_simd_dup_constant (rtx vals
)
11157 machine_mode mode
= GET_MODE (vals
);
11158 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11161 if (!const_vec_duplicate_p (vals
, &x
))
11164 /* We can load this constant by using DUP and a constant in a
11165 single ARM register. This will be cheaper than a vector
11167 x
= copy_to_mode_reg (inner_mode
, x
);
11168 return gen_rtx_VEC_DUPLICATE (mode
, x
);
/* Generate code to load VALS, which is a PARALLEL containing only
   constants (for vec_init) or CONST_VECTOR, efficiently into a
   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that can not be converted into a CONST_VECTOR.  */
static rtx
aarch64_simd_make_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  rtx const_dup;
  rtx const_vec = NULL_RTX;
  int n_elts = GET_MODE_NUNITS (mode);
  int n_const = 0;
  int i;

  if (GET_CODE (vals) == CONST_VECTOR)
    const_vec = vals;
  else if (GET_CODE (vals) == PARALLEL)
    {
      /* A CONST_VECTOR must contain only CONST_INTs and
         CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
         Only store valid constants in a CONST_VECTOR.  */
      for (i = 0; i < n_elts; ++i)
        {
          rtx x = XVECEXP (vals, 0, i);
          if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
            n_const++;
        }
      if (n_const == n_elts)
        const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
    }
  else
    gcc_unreachable ();

  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;
  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We can not take advantage of single-cycle
       LD1 because we need a PC-relative addressing mode.  */
    return const_vec;
  else
    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We can not construct an initializer.  */
    return NULL_RTX;
}
/* Expand a vector initialisation sequence, such that TARGET is
   initialised to contain VALS.  */

void
aarch64_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  /* The number of vector elements.  */
  int n_elts = GET_MODE_NUNITS (mode);
  /* The number of vector elements which are not constant.  */
  int n_var = 0;
  rtx any_const = NULL_RTX;
  /* The first element of vals.  */
  rtx v0 = XVECEXP (vals, 0, 0);
  bool all_same = true;

  /* Count the number of variable elements to initialise.  */
  for (int i = 0; i < n_elts; ++i)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
        ++n_var;
      else
        any_const = x;

      all_same &= rtx_equal_p (x, v0);
    }

  /* No variable elements, hand off to aarch64_simd_make_constant which knows
     how best to handle this.  */
  if (n_var == 0)
    {
      rtx constant = aarch64_simd_make_constant (vals);
      if (constant != NULL_RTX)
        {
          emit_move_insn (target, constant);
          return;
        }
    }

  /* Splat a single non-constant element if we can.  */
  if (all_same)
    {
      rtx x = copy_to_mode_reg (inner_mode, v0);
      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
      return;
    }

  /* Initialise a vector which is part-variable.  We want to first try
     to build those lanes which are constant in the most efficient way we
     can.  */
  if (n_var != n_elts)
    {
      rtx copy = copy_rtx (vals);

      /* Load constant part of vector.  We really don't care what goes into the
         parts we will overwrite, but we're more likely to be able to load the
         constant efficiently if it has fewer, larger, repeating parts
         (see aarch64_simd_valid_immediate).  */
      for (int i = 0; i < n_elts; i++)
        {
          rtx x = XVECEXP (vals, 0, i);
          if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
            continue;
          rtx subst = any_const;
          for (int bit = n_elts / 2; bit > 0; bit /= 2)
            {
              /* Look in the copied vector, as more elements are const.  */
              rtx test = XVECEXP (copy, 0, i ^ bit);
              if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
                {
                  subst = test;
                  break;
                }
            }
          XVECEXP (copy, 0, i) = subst;
        }
      aarch64_expand_vector_init (target, copy);
    }

  /* Insert the variable lanes directly.  */

  enum insn_code icode = optab_handler (vec_set_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);

  for (int i = 0; i < n_elts; i++)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
        continue;
      x = copy_to_mode_reg (inner_mode, x);
      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
    }
}
/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  return
    (!SHIFT_COUNT_TRUNCATED
     || aarch64_vector_mode_supported_p (mode)
     || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
}
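/* For example, when SHIFT_COUNT_TRUNCATED holds, a DImode scalar shift
   gets a mask of GET_MODE_BITSIZE (DImode) - 1 = 63, so only the low six
   bits of a variable shift amount are significant; vector and vector
   structure modes always get a mask of 0, i.e. no truncation may be
   assumed for Advanced SIMD shifts.  */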
/* Select a format to encode pointers in exception handling data.  */
int
aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
{
  int type;
  switch (aarch64_cmodel)
    {
    case AARCH64_CMODEL_TINY:
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
         for everything.  */
      type = DW_EH_PE_sdata4;
      break;
    default:
      /* No assumptions here.  8-byte relocs required.  */
      type = DW_EH_PE_sdata8;
      break;
    }
  return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
}
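/* For instance, with the small code model a global personality routine
   pointer is encoded as DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4,
   whereas the large model falls back to 8-byte (sdata8) relocations.  */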
11351 /* The last .arch and .tune assembly strings that we printed. */
11352 static std::string aarch64_last_printed_arch_string
;
11353 static std::string aarch64_last_printed_tune_string
;
11355 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11356 by the function fndecl. */
11359 aarch64_declare_function_name (FILE *stream
, const char* name
,
11362 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11364 struct cl_target_option
*targ_options
;
11366 targ_options
= TREE_TARGET_OPTION (target_parts
);
11368 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
11369 gcc_assert (targ_options
);
11371 const struct processor
*this_arch
11372 = aarch64_get_arch (targ_options
->x_explicit_arch
);
11374 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
11375 std::string extension
11376 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
11378 /* Only update the assembler .arch string if it is distinct from the last
11379 such string we printed. */
11380 std::string to_print
= this_arch
->name
+ extension
;
11381 if (to_print
!= aarch64_last_printed_arch_string
)
11383 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
11384 aarch64_last_printed_arch_string
= to_print
;
11387 /* Print the cpu name we're tuning for in the comments, might be
11388 useful to readers of the generated asm. Do it only when it changes
11389 from function to function and verbose assembly is requested. */
11390 const struct processor
*this_tune
11391 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
11393 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
11395 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
11397 aarch64_last_printed_tune_string
= this_tune
->name
;
11400 /* Don't forget the type directive for ELF. */
11401 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
11402 ASM_OUTPUT_LABEL (stream
, name
);
11405 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11408 aarch64_start_file (void)
11410 struct cl_target_option
*default_options
11411 = TREE_TARGET_OPTION (target_option_default_node
);
11413 const struct processor
*default_arch
11414 = aarch64_get_arch (default_options
->x_explicit_arch
);
11415 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
11416 std::string extension
11417 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
11418 default_arch
->flags
);
11420 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
11421 aarch64_last_printed_tune_string
= "";
11422 asm_fprintf (asm_out_file
, "\t.arch %s\n",
11423 aarch64_last_printed_arch_string
.c_str ());
11425 default_file_start ();
}

/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
                             rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_load_exclusiveqi; break;
    case HImode: gen = gen_aarch64_load_exclusivehi; break;
    case SImode: gen = gen_aarch64_load_exclusivesi; break;
    case DImode: gen = gen_aarch64_load_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (rval, mem, model_rtx));
}
/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
                              rtx rval, rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_store_exclusiveqi; break;
    case HImode: gen = gen_aarch64_store_exclusivehi; break;
    case SImode: gen = gen_aarch64_store_exclusivesi; break;
    case DImode: gen = gen_aarch64_store_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (bval, rval, mem, model_rtx));
}
/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  int very_unlikely = REG_BR_PROB_BASE / 100 - 1;

  insn = emit_jump_insn (insn);
  add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
}
11481 /* Expand a compare and swap pattern. */
11484 aarch64_expand_compare_and_swap (rtx operands
[])
11486 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
11487 machine_mode mode
, cmp_mode
;
11488 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
11491 const gen_cas_fn split_cas
[] =
11493 gen_aarch64_compare_and_swapqi
,
11494 gen_aarch64_compare_and_swaphi
,
11495 gen_aarch64_compare_and_swapsi
,
11496 gen_aarch64_compare_and_swapdi
11498 const gen_cas_fn atomic_cas
[] =
11500 gen_aarch64_compare_and_swapqi_lse
,
11501 gen_aarch64_compare_and_swaphi_lse
,
11502 gen_aarch64_compare_and_swapsi_lse
,
11503 gen_aarch64_compare_and_swapdi_lse
11506 bval
= operands
[0];
11507 rval
= operands
[1];
11509 oldval
= operands
[3];
11510 newval
= operands
[4];
11511 is_weak
= operands
[5];
11512 mod_s
= operands
[6];
11513 mod_f
= operands
[7];
11514 mode
= GET_MODE (mem
);
11517 /* Normally the succ memory model must be stronger than fail, but in the
11518 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11519 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11521 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
11522 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
11523 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
11529 /* For short modes, we're going to perform the comparison in SImode,
11530 so do the zero-extension now. */
11532 rval
= gen_reg_rtx (SImode
);
11533 oldval
= convert_modes (SImode
, mode
, oldval
, true);
11534 /* Fall through. */
11538 /* Force the value into a register if needed. */
11539 if (!aarch64_plus_operand (oldval
, mode
))
11540 oldval
= force_reg (cmp_mode
, oldval
);
11544 gcc_unreachable ();
11549 case QImode
: idx
= 0; break;
11550 case HImode
: idx
= 1; break;
11551 case SImode
: idx
= 2; break;
11552 case DImode
: idx
= 3; break;
11554 gcc_unreachable ();
11557 gen
= atomic_cas
[idx
];
11559 gen
= split_cas
[idx
];
11561 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
11563 if (mode
== QImode
|| mode
== HImode
)
11564 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
11566 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
11567 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
11568 emit_insn (gen_rtx_SET (bval
, x
));
}

/* Test whether the target supports using an atomic load-operate instruction.
   CODE is the operation and AFTER is TRUE if the data in memory after the
   operation should be returned and FALSE if the data before the operation
   should be returned.  Returns FALSE if the operation isn't supported by the
   architecture.  */

bool
aarch64_atomic_ldop_supported_p (enum rtx_code code)
{
  if (!TARGET_LSE)
    return false;

  switch (code)
    {
    case SET:
    case AND:
    case IOR:
    case XOR:
    case MINUS:
    case PLUS:
      return true;
    default:
      return false;
    }
}
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
          || base_model == MEMMODEL_ACQ_REL
          || base_model == MEMMODEL_SEQ_CST))
    {
      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
    }
}
/* Emit an atomic compare-and-swap operation.  RVAL is the destination register
   for the data in memory.  EXPECTED is the value expected to be in memory.
   DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
   is the memory ordering to use.  */

void
aarch64_gen_atomic_cas (rtx rval, rtx mem,
                        rtx expected, rtx desired,
                        rtx model)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);
  machine_mode mode;

  mode = GET_MODE (mem);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_atomic_casqi; break;
    case HImode: gen = gen_aarch64_atomic_cashi; break;
    case SImode: gen = gen_aarch64_atomic_cassi; break;
    case DImode: gen = gen_aarch64_atomic_casdi; break;
    default:
      gcc_unreachable ();
    }

  /* Move the expected value into the CAS destination register.  */
  emit_insn (gen_rtx_SET (rval, expected));

  /* Emit the CAS.  */
  emit_insn (gen (rval, mem, desired, model));

  /* Compare the expected value with the value loaded by the CAS, to establish
     whether the swap was made.  */
  aarch64_gen_compare_reg (EQ, rval, expected);
}
11650 /* Split a compare and swap pattern. */
11653 aarch64_split_compare_and_swap (rtx operands
[])
11655 rtx rval
, mem
, oldval
, newval
, scratch
;
11658 rtx_code_label
*label1
, *label2
;
11660 enum memmodel model
;
11663 rval
= operands
[0];
11665 oldval
= operands
[2];
11666 newval
= operands
[3];
11667 is_weak
= (operands
[4] != const0_rtx
);
11668 model_rtx
= operands
[5];
11669 scratch
= operands
[7];
11670 mode
= GET_MODE (mem
);
11671 model
= memmodel_from_int (INTVAL (model_rtx
));
11676 label1
= gen_label_rtx ();
11677 emit_label (label1
);
11679 label2
= gen_label_rtx ();
11681 /* The initial load can be relaxed for a __sync operation since a final
11682 barrier will be emitted to stop code hoisting. */
11683 if (is_mm_sync (model
))
11684 aarch64_emit_load_exclusive (mode
, rval
, mem
,
11685 GEN_INT (MEMMODEL_RELAXED
));
11687 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
11689 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
11690 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
11691 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
11692 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
11693 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
11695 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
11699 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
11700 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
11701 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
11702 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
11706 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
11707 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
11708 emit_insn (gen_rtx_SET (cond
, x
));
11711 emit_label (label2
);
11713 /* Emit any final barrier needed for a __sync operation. */
11714 if (is_mm_sync (model
))
11715 aarch64_emit_post_barrier (model
);
}

/* Emit a BIC instruction.  */

static void
aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
{
  rtx shift_rtx = GEN_INT (shift);
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
    case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, s2, shift_rtx, s1));
}
/* Emit an atomic swap.  */

static void
aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
                          rtx mem, rtx model)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_atomic_swpqi; break;
    case HImode: gen = gen_aarch64_atomic_swphi; break;
    case SImode: gen = gen_aarch64_atomic_swpsi; break;
    case DImode: gen = gen_aarch64_atomic_swpdi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, mem, value, model));
}
/* Operations supported by aarch64_emit_atomic_load_op.  */

enum aarch64_atomic_load_op_code
{
  AARCH64_LDOP_PLUS,  /* A + B.  */
  AARCH64_LDOP_XOR,   /* A ^ B.  */
  AARCH64_LDOP_OR,    /* A | B.  */
  AARCH64_LDOP_BIC    /* A & ~B.  */
};
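/* With LSE these map onto single-instruction atomics: LDADD for
   AARCH64_LDOP_PLUS, LDEOR for AARCH64_LDOP_XOR, LDSET for AARCH64_LDOP_OR
   and LDCLR for AARCH64_LDOP_BIC (see the gen_aarch64_atomic_load*
   expanders selected below).  */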
11768 /* Emit an atomic load-operate. */
11771 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
11772 machine_mode mode
, rtx dst
, rtx src
,
11773 rtx mem
, rtx model
)
11775 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
11776 const aarch64_atomic_load_op_fn plus
[] =
11778 gen_aarch64_atomic_loadaddqi
,
11779 gen_aarch64_atomic_loadaddhi
,
11780 gen_aarch64_atomic_loadaddsi
,
11781 gen_aarch64_atomic_loadadddi
11783 const aarch64_atomic_load_op_fn eor
[] =
11785 gen_aarch64_atomic_loadeorqi
,
11786 gen_aarch64_atomic_loadeorhi
,
11787 gen_aarch64_atomic_loadeorsi
,
11788 gen_aarch64_atomic_loadeordi
11790 const aarch64_atomic_load_op_fn ior
[] =
11792 gen_aarch64_atomic_loadsetqi
,
11793 gen_aarch64_atomic_loadsethi
,
11794 gen_aarch64_atomic_loadsetsi
,
11795 gen_aarch64_atomic_loadsetdi
11797 const aarch64_atomic_load_op_fn bic
[] =
11799 gen_aarch64_atomic_loadclrqi
,
11800 gen_aarch64_atomic_loadclrhi
,
11801 gen_aarch64_atomic_loadclrsi
,
11802 gen_aarch64_atomic_loadclrdi
11804 aarch64_atomic_load_op_fn gen
;
11809 case QImode
: idx
= 0; break;
11810 case HImode
: idx
= 1; break;
11811 case SImode
: idx
= 2; break;
11812 case DImode
: idx
= 3; break;
11814 gcc_unreachable ();
11819 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
11820 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
11821 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
11822 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
11824 gcc_unreachable ();
11827 emit_insn (gen (dst
, mem
, src
, model
));
11830 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11831 location to store the data read from memory. OUT_RESULT is the location to
11832 store the result of the operation. MEM is the memory location to read and
11833 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11834 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11838 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
11839 rtx mem
, rtx value
, rtx model_rtx
)
11841 machine_mode mode
= GET_MODE (mem
);
11842 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
11843 const bool short_mode
= (mode
< SImode
);
11844 aarch64_atomic_load_op_code ldop_code
;
11849 out_data
= gen_lowpart (mode
, out_data
);
11852 out_result
= gen_lowpart (mode
, out_result
);
11854 /* Make sure the value is in a register, putting it into a destination
11855 register if it needs to be manipulated. */
11856 if (!register_operand (value
, mode
)
11857 || code
== AND
|| code
== MINUS
)
11859 src
= out_result
? out_result
: out_data
;
11860 emit_move_insn (src
, gen_lowpart (mode
, value
));
11864 gcc_assert (register_operand (src
, mode
));
11866 /* Preprocess the data for the operation as necessary. If the operation is
11867 a SET then emit a swap instruction and finish. */
11871 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
11875 /* Negate the value and treat it as a PLUS. */
11879 /* Resize the value if necessary. */
11881 src
= gen_lowpart (wmode
, src
);
11883 neg_src
= gen_rtx_NEG (wmode
, src
);
11884 emit_insn (gen_rtx_SET (src
, neg_src
));
11887 src
= gen_lowpart (mode
, src
);
11889 /* Fall-through. */
11891 ldop_code
= AARCH64_LDOP_PLUS
;
11895 ldop_code
= AARCH64_LDOP_OR
;
11899 ldop_code
= AARCH64_LDOP_XOR
;
11906 /* Resize the value if necessary. */
11908 src
= gen_lowpart (wmode
, src
);
11910 not_src
= gen_rtx_NOT (wmode
, src
);
11911 emit_insn (gen_rtx_SET (src
, not_src
));
11914 src
= gen_lowpart (mode
, src
);
11916 ldop_code
= AARCH64_LDOP_BIC
;
11920 /* The operation can't be done with atomic instructions. */
11921 gcc_unreachable ();
11924 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
11926 /* If necessary, calculate the data in memory after the update by redoing the
11927 operation from values in registers. */
11933 src
= gen_lowpart (wmode
, src
);
11934 out_data
= gen_lowpart (wmode
, out_data
);
11935 out_result
= gen_lowpart (wmode
, out_result
);
11944 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
11947 x
= gen_rtx_IOR (wmode
, out_data
, src
);
11950 x
= gen_rtx_XOR (wmode
, out_data
, src
);
11953 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
11956 gcc_unreachable ();
11959 emit_set_insn (out_result
, x
);
11964 /* Split an atomic operation. */
11967 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
11968 rtx value
, rtx model_rtx
, rtx cond
)
11970 machine_mode mode
= GET_MODE (mem
);
11971 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
11972 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
11973 const bool is_sync
= is_mm_sync (model
);
11974 rtx_code_label
*label
;
11977 /* Split the atomic operation into a sequence. */
11978 label
= gen_label_rtx ();
11979 emit_label (label
);
11982 new_out
= gen_lowpart (wmode
, new_out
);
11984 old_out
= gen_lowpart (wmode
, old_out
);
11987 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
11989 /* The initial load can be relaxed for a __sync operation since a final
11990 barrier will be emitted to stop code hoisting. */
11992 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
11993 GEN_INT (MEMMODEL_RELAXED
));
11995 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
12004 x
= gen_rtx_AND (wmode
, old_out
, value
);
12005 emit_insn (gen_rtx_SET (new_out
, x
));
12006 x
= gen_rtx_NOT (wmode
, new_out
);
12007 emit_insn (gen_rtx_SET (new_out
, x
));
12011 if (CONST_INT_P (value
))
12013 value
= GEN_INT (-INTVAL (value
));
12016 /* Fall through. */
12019 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
12020 emit_insn (gen_rtx_SET (new_out
, x
));
12024 aarch64_emit_store_exclusive (mode
, cond
, mem
,
12025 gen_lowpart (mode
, new_out
), model_rtx
);
12027 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12028 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12029 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
12030 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12032 /* Emit any final barrier needed for a __sync operation. */
12034 aarch64_emit_post_barrier (model
);
}

static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
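/* For example, 1.0 is (-1)^0 * (16/16) * 2^0, and the extreme encodable
   values are (16/16) * 2^-3 = 0.125 and (31/16) * 2^4 = 31.0, so only
   constants whose magnitude lies in [0.125, 31.0] and which fit in the
   4-bit mantissa can be represented.  */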
12086 /* Return true iff X can be represented by a quarter-precision
12087 floating point immediate operand X. Note, we cannot represent 0.0. */
12089 aarch64_float_const_representable_p (rtx x
)
12091 /* This represents our current view of how many bits
12092 make up the mantissa. */
12093 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
12095 unsigned HOST_WIDE_INT mantissa
, mask
;
12096 REAL_VALUE_TYPE r
, m
;
12099 if (!CONST_DOUBLE_P (x
))
12102 /* We don't support HFmode constants yet. */
12103 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
12106 r
= *CONST_DOUBLE_REAL_VALUE (x
);
12108 /* We cannot represent infinities, NaNs or +/-zero. We won't
12109 know if we have +zero until we analyse the mantissa, but we
12110 can reject the other invalid values. */
12111 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
12112 || REAL_VALUE_MINUS_ZERO (r
))
12115 /* Extract exponent. */
12116 r
= real_value_abs (&r
);
12117 exponent
= REAL_EXP (&r
);
12119 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12120 highest (sign) bit, with a fixed binary point at bit point_pos.
12121 m1 holds the low part of the mantissa, m2 the high part.
12122 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12123 bits for the mantissa, this can fail (low bits will be lost). */
12124 real_ldexp (&m
, &r
, point_pos
- exponent
);
12125 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
12127 /* If the low part of the mantissa has bits set we cannot represent
12129 if (w
.elt (0) != 0)
12131 /* We have rejected the lower HOST_WIDE_INT, so update our
12132 understanding of how many bits lie in the mantissa and
12133 look only at the high HOST_WIDE_INT. */
12134 mantissa
= w
.elt (1);
12135 point_pos
-= HOST_BITS_PER_WIDE_INT
;
12137 /* We can only represent values with a mantissa of the form 1.xxxx. */
12138 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
12139 if ((mantissa
& mask
) != 0)
12142 /* Having filtered unrepresentable values, we may now remove all
12143 but the highest 5 bits. */
12144 mantissa
>>= point_pos
- 5;
12146 /* We cannot represent the value 0.0, so reject it. This is handled
12151 /* Then, as bit 4 is always set, we can mask it off, leaving
12152 the mantissa in the range [0, 15]. */
12153 mantissa
&= ~(1 << 4);
12154 gcc_assert (mantissa
<= 15);
12156 /* GCC internally does not use IEEE754-like encoding (where normalized
12157 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12158 Our mantissa values are shifted 4 places to the left relative to
12159 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12160 by 5 places to correct for GCC's representation. */
12161 exponent
= 5 - exponent
;
12163 return (exponent
>= 0 && exponent
<= 7);
12167 aarch64_output_simd_mov_immediate (rtx const_vector
,
12172 static char templ
[40];
12173 const char *mnemonic
;
12174 const char *shift_op
;
12175 unsigned int lane_count
= 0;
12178 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
12180 /* This will return true to show const_vector is legal for use as either
12181 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12182 also update INFO to show how the immediate should be generated. */
12183 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
12184 gcc_assert (is_valid
);
12186 element_char
= sizetochar (info
.element_width
);
12187 lane_count
= width
/ info
.element_width
;
12189 mode
= GET_MODE_INNER (mode
);
12190 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12192 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
12193 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12194 move immediate path. */
12195 if (aarch64_float_const_zero_rtx_p (info
.value
))
12196 info
.value
= GEN_INT (0);
12199 const unsigned int buf_size
= 20;
12200 char float_buf
[buf_size
] = {'\0'};
12201 real_to_decimal_for_mode (float_buf
,
12202 CONST_DOUBLE_REAL_VALUE (info
.value
),
12203 buf_size
, buf_size
, 1, mode
);
12205 if (lane_count
== 1)
12206 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
12208 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
12209 lane_count
, element_char
, float_buf
);
12214 mnemonic
= info
.mvn
? "mvni" : "movi";
12215 shift_op
= info
.msl
? "msl" : "lsl";
12217 gcc_assert (CONST_INT_P (info
.value
));
12218 if (lane_count
== 1)
12219 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
12220 mnemonic
, UINTVAL (info
.value
));
12221 else if (info
.shift
)
12222 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12223 ", %s %d", mnemonic
, lane_count
, element_char
,
12224 UINTVAL (info
.value
), shift_op
, info
.shift
);
12226 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
12227 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
  return templ;
}

char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate,
                                          machine_mode mode)
{
  machine_mode vmode;

  gcc_assert (!VECTOR_MODE_P (mode));
  vmode = aarch64_simd_container_mode (mode, 64);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
}
12243 /* Split operands into moves from op[1] + op[2] into op[0]. */
12246 aarch64_split_combinev16qi (rtx operands
[3])
12248 unsigned int dest
= REGNO (operands
[0]);
12249 unsigned int src1
= REGNO (operands
[1]);
12250 unsigned int src2
= REGNO (operands
[2]);
12251 machine_mode halfmode
= GET_MODE (operands
[1]);
12252 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
12253 rtx destlo
, desthi
;
12255 gcc_assert (halfmode
== V16QImode
);
12257 if (src1
== dest
&& src2
== dest
+ halfregs
)
12259 /* No-op move. Can't split to nothing; emit something. */
12260 emit_note (NOTE_INSN_DELETED
);
12264 /* Preserve register attributes for variable tracking. */
12265 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
12266 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
12267 GET_MODE_SIZE (halfmode
));
12269 /* Special case of reversed high/low parts. */
12270 if (reg_overlap_mentioned_p (operands
[2], destlo
)
12271 && reg_overlap_mentioned_p (operands
[1], desthi
))
12273 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12274 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
12275 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12277 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
12279 /* Try to avoid unnecessary moves if part of the result
12280 is in the right place already. */
12282 emit_move_insn (destlo
, operands
[1]);
12283 if (src2
!= dest
+ halfregs
)
12284 emit_move_insn (desthi
, operands
[2]);
12288 if (src2
!= dest
+ halfregs
)
12289 emit_move_insn (desthi
, operands
[2]);
12291 emit_move_insn (destlo
, operands
[1]);
    }
}

/* vec_perm support.  */

#define MAX_VECT_LEN 16

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  unsigned char perm[MAX_VECT_LEN];
  machine_mode vmode;
  unsigned char nelt;
  bool one_vector_p;
  bool testing_p;
};
12312 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
12314 machine_mode vmode
= GET_MODE (target
);
12315 bool one_vector_p
= rtx_equal_p (op0
, op1
);
12317 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
12318 gcc_checking_assert (GET_MODE (op0
) == vmode
);
12319 gcc_checking_assert (GET_MODE (op1
) == vmode
);
12320 gcc_checking_assert (GET_MODE (sel
) == vmode
);
12321 gcc_checking_assert (TARGET_SIMD
);
12325 if (vmode
== V8QImode
)
12327 /* Expand the argument to a V16QI mode by duplicating it. */
12328 rtx pair
= gen_reg_rtx (V16QImode
);
12329 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
12330 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
12334 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
12341 if (vmode
== V8QImode
)
12343 pair
= gen_reg_rtx (V16QImode
);
12344 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
12345 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
12349 pair
= gen_reg_rtx (OImode
);
12350 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
12351 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
12357 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
12359 machine_mode vmode
= GET_MODE (target
);
12360 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
12361 bool one_vector_p
= rtx_equal_p (op0
, op1
);
12364 /* The TBL instruction does not use a modulo index, so we must take care
12365 of that ourselves. */
12366 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
12367 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12368 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
12370 /* For big-endian, we also need to reverse the index within the vector
12371 (but not which vector). */
12372 if (BYTES_BIG_ENDIAN
)
12374 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12376 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
12377 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
12378 NULL
, 0, OPTAB_LIB_WIDEN
);
12380 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
12383 /* Recognize patterns suitable for the TRN instructions. */
12385 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
12387 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
12388 rtx out
, in0
, in1
, x
;
12389 rtx (*gen
) (rtx
, rtx
, rtx
);
12390 machine_mode vmode
= d
->vmode
;
12392 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
12395 /* Note that these are little-endian tests.
12396 We correct for big-endian later. */
12397 if (d
->perm
[0] == 0)
12399 else if (d
->perm
[0] == 1)
12403 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12405 for (i
= 0; i
< nelt
; i
+= 2)
12407 if (d
->perm
[i
] != i
+ odd
)
12409 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
12419 if (BYTES_BIG_ENDIAN
)
12421 x
= in0
, in0
= in1
, in1
= x
;
12430 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
12431 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
12432 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
12433 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
12434 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
12435 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
12436 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
12437 case V4HFmode
: gen
= gen_aarch64_trn2v4hf
; break;
12438 case V8HFmode
: gen
= gen_aarch64_trn2v8hf
; break;
12439 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
12440 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
12441 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
12450 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
12451 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
12452 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
12453 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
12454 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
12455 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
12456 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
12457 case V4HFmode
: gen
= gen_aarch64_trn1v4hf
; break;
12458 case V8HFmode
: gen
= gen_aarch64_trn1v8hf
; break;
12459 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
12460 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
12461 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
12467 emit_insn (gen (out
, in0
, in1
));
12471 /* Recognize patterns suitable for the UZP instructions. */
12473 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
12475 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
12476 rtx out
, in0
, in1
, x
;
12477 rtx (*gen
) (rtx
, rtx
, rtx
);
12478 machine_mode vmode
= d
->vmode
;
12480 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
12483 /* Note that these are little-endian tests.
12484 We correct for big-endian later. */
12485 if (d
->perm
[0] == 0)
12487 else if (d
->perm
[0] == 1)
12491 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12493 for (i
= 0; i
< nelt
; i
++)
12495 unsigned elt
= (i
* 2 + odd
) & mask
;
12496 if (d
->perm
[i
] != elt
)
12506 if (BYTES_BIG_ENDIAN
)
12508 x
= in0
, in0
= in1
, in1
= x
;
12517 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
12518 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
12519 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
12520 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
12521 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
12522 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
12523 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
12524 case V4HFmode
: gen
= gen_aarch64_uzp2v4hf
; break;
12525 case V8HFmode
: gen
= gen_aarch64_uzp2v8hf
; break;
12526 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
12527 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
12528 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
12537 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
12538 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
12539 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
12540 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
12541 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
12542 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
12543 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
12544 case V4HFmode
: gen
= gen_aarch64_uzp1v4hf
; break;
12545 case V8HFmode
: gen
= gen_aarch64_uzp1v8hf
; break;
12546 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
12547 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
12548 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
12554 emit_insn (gen (out
, in0
, in1
));
12558 /* Recognize patterns suitable for the ZIP instructions. */
12560 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
12562 unsigned int i
, high
, mask
, nelt
= d
->nelt
;
12563 rtx out
, in0
, in1
, x
;
12564 rtx (*gen
) (rtx
, rtx
, rtx
);
12565 machine_mode vmode
= d
->vmode
;
12567 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
12570 /* Note that these are little-endian tests.
12571 We correct for big-endian later. */
12573 if (d
->perm
[0] == high
)
12576 else if (d
->perm
[0] == 0)
12580 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12582 for (i
= 0; i
< nelt
/ 2; i
++)
12584 unsigned elt
= (i
+ high
) & mask
;
12585 if (d
->perm
[i
* 2] != elt
)
12587 elt
= (elt
+ nelt
) & mask
;
12588 if (d
->perm
[i
* 2 + 1] != elt
)
12598 if (BYTES_BIG_ENDIAN
)
12600 x
= in0
, in0
= in1
, in1
= x
;
12609 case V16QImode
: gen
= gen_aarch64_zip2v16qi
; break;
12610 case V8QImode
: gen
= gen_aarch64_zip2v8qi
; break;
12611 case V8HImode
: gen
= gen_aarch64_zip2v8hi
; break;
12612 case V4HImode
: gen
= gen_aarch64_zip2v4hi
; break;
12613 case V4SImode
: gen
= gen_aarch64_zip2v4si
; break;
12614 case V2SImode
: gen
= gen_aarch64_zip2v2si
; break;
12615 case V2DImode
: gen
= gen_aarch64_zip2v2di
; break;
12616 case V4HFmode
: gen
= gen_aarch64_zip2v4hf
; break;
12617 case V8HFmode
: gen
= gen_aarch64_zip2v8hf
; break;
12618 case V4SFmode
: gen
= gen_aarch64_zip2v4sf
; break;
12619 case V2SFmode
: gen
= gen_aarch64_zip2v2sf
; break;
12620 case V2DFmode
: gen
= gen_aarch64_zip2v2df
; break;
12629 case V16QImode
: gen
= gen_aarch64_zip1v16qi
; break;
12630 case V8QImode
: gen
= gen_aarch64_zip1v8qi
; break;
12631 case V8HImode
: gen
= gen_aarch64_zip1v8hi
; break;
12632 case V4HImode
: gen
= gen_aarch64_zip1v4hi
; break;
12633 case V4SImode
: gen
= gen_aarch64_zip1v4si
; break;
12634 case V2SImode
: gen
= gen_aarch64_zip1v2si
; break;
12635 case V2DImode
: gen
= gen_aarch64_zip1v2di
; break;
12636 case V4HFmode
: gen
= gen_aarch64_zip1v4hf
; break;
12637 case V8HFmode
: gen
= gen_aarch64_zip1v8hf
; break;
12638 case V4SFmode
: gen
= gen_aarch64_zip1v4sf
; break;
12639 case V2SFmode
: gen
= gen_aarch64_zip1v2sf
; break;
12640 case V2DFmode
: gen
= gen_aarch64_zip1v2df
; break;
12646 emit_insn (gen (out
, in0
, in1
));
12650 /* Recognize patterns for the EXT insn. */
12653 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
12655 unsigned int i
, nelt
= d
->nelt
;
12656 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12659 unsigned int location
= d
->perm
[0]; /* Always < nelt. */
12661 /* Check if the extracted indices are increasing by one. */
12662 for (i
= 1; i
< nelt
; i
++)
12664 unsigned int required
= location
+ i
;
12665 if (d
->one_vector_p
)
12667 /* We'll pass the same vector in twice, so allow indices to wrap. */
12668 required
&= (nelt
- 1);
12670 if (d
->perm
[i
] != required
)
12676 case V16QImode
: gen
= gen_aarch64_extv16qi
; break;
12677 case V8QImode
: gen
= gen_aarch64_extv8qi
; break;
12678 case V4HImode
: gen
= gen_aarch64_extv4hi
; break;
12679 case V8HImode
: gen
= gen_aarch64_extv8hi
; break;
12680 case V2SImode
: gen
= gen_aarch64_extv2si
; break;
12681 case V4SImode
: gen
= gen_aarch64_extv4si
; break;
12682 case V4HFmode
: gen
= gen_aarch64_extv4hf
; break;
12683 case V8HFmode
: gen
= gen_aarch64_extv8hf
; break;
12684 case V2SFmode
: gen
= gen_aarch64_extv2sf
; break;
12685 case V4SFmode
: gen
= gen_aarch64_extv4sf
; break;
12686 case V2DImode
: gen
= gen_aarch64_extv2di
; break;
12687 case V2DFmode
: gen
= gen_aarch64_extv2df
; break;
12696 /* The case where (location == 0) is a no-op for both big- and little-endian,
12697 and is removed by the mid-end at optimization levels -O1 and higher. */
12699 if (BYTES_BIG_ENDIAN
&& (location
!= 0))
12701 /* After setup, we want the high elements of the first vector (stored
12702 at the LSB end of the register), and the low elements of the second
12703 vector (stored at the MSB end of the register). So swap. */
12704 std::swap (d
->op0
, d
->op1
);
12705 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12706 location
= nelt
- location
;
12709 offset
= GEN_INT (location
);
12710 emit_insn (gen (d
->target
, d
->op0
, d
->op1
, offset
));
12714 /* Recognize patterns for the REV insns. */
12717 aarch64_evpc_rev (struct expand_vec_perm_d
*d
)
12719 unsigned int i
, j
, diff
, nelt
= d
->nelt
;
12720 rtx (*gen
) (rtx
, rtx
);
12722 if (!d
->one_vector_p
)
12731 case V16QImode
: gen
= gen_aarch64_rev64v16qi
; break;
12732 case V8QImode
: gen
= gen_aarch64_rev64v8qi
; break;
12740 case V16QImode
: gen
= gen_aarch64_rev32v16qi
; break;
12741 case V8QImode
: gen
= gen_aarch64_rev32v8qi
; break;
12742 case V8HImode
: gen
= gen_aarch64_rev64v8hi
; break;
12743 case V4HImode
: gen
= gen_aarch64_rev64v4hi
; break;
12751 case V16QImode
: gen
= gen_aarch64_rev16v16qi
; break;
12752 case V8QImode
: gen
= gen_aarch64_rev16v8qi
; break;
12753 case V8HImode
: gen
= gen_aarch64_rev32v8hi
; break;
12754 case V4HImode
: gen
= gen_aarch64_rev32v4hi
; break;
12755 case V4SImode
: gen
= gen_aarch64_rev64v4si
; break;
12756 case V2SImode
: gen
= gen_aarch64_rev64v2si
; break;
12757 case V4SFmode
: gen
= gen_aarch64_rev64v4sf
; break;
12758 case V2SFmode
: gen
= gen_aarch64_rev64v2sf
; break;
12759 case V8HFmode
: gen
= gen_aarch64_rev64v8hf
; break;
12760 case V4HFmode
: gen
= gen_aarch64_rev64v4hf
; break;
12769 for (i
= 0; i
< nelt
; i
+= diff
+ 1)
12770 for (j
= 0; j
<= diff
; j
+= 1)
12772 /* This is guaranteed to be true as the value of diff
12773 is 7, 3, 1 and we should have enough elements in the
12774 queue to generate this. Getting a vector mask with a
12775 value of diff other than these values implies that
12776 something is wrong by the time we get here. */
12777 gcc_assert (i
+ j
< nelt
);
12778 if (d
->perm
[i
+ j
] != i
+ diff
- j
)
12786 emit_insn (gen (d
->target
, d
->op0
));
12791 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
12793 rtx (*gen
) (rtx
, rtx
, rtx
);
12794 rtx out
= d
->target
;
12796 machine_mode vmode
= d
->vmode
;
12797 unsigned int i
, elt
, nelt
= d
->nelt
;
12801 for (i
= 1; i
< nelt
; i
++)
12803 if (elt
!= d
->perm
[i
])
12807 /* The generic preparation in aarch64_expand_vec_perm_const_1
12808 swaps the operand order and the permute indices if it finds
12809 d->perm[0] to be in the second operand. Thus, we can always
12810 use d->op0 and need not do any extra arithmetic to get the
12811 correct lane number. */
12813 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
12817 case V16QImode
: gen
= gen_aarch64_dup_lanev16qi
; break;
12818 case V8QImode
: gen
= gen_aarch64_dup_lanev8qi
; break;
12819 case V8HImode
: gen
= gen_aarch64_dup_lanev8hi
; break;
12820 case V4HImode
: gen
= gen_aarch64_dup_lanev4hi
; break;
12821 case V4SImode
: gen
= gen_aarch64_dup_lanev4si
; break;
12822 case V2SImode
: gen
= gen_aarch64_dup_lanev2si
; break;
12823 case V2DImode
: gen
= gen_aarch64_dup_lanev2di
; break;
12824 case V8HFmode
: gen
= gen_aarch64_dup_lanev8hf
; break;
12825 case V4HFmode
: gen
= gen_aarch64_dup_lanev4hf
; break;
12826 case V4SFmode
: gen
= gen_aarch64_dup_lanev4sf
; break;
12827 case V2SFmode
: gen
= gen_aarch64_dup_lanev2sf
; break;
12828 case V2DFmode
: gen
= gen_aarch64_dup_lanev2df
; break;
12833 emit_insn (gen (out
, in0
, lane
));
12838 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
12840 rtx rperm
[MAX_VECT_LEN
], sel
;
12841 machine_mode vmode
= d
->vmode
;
12842 unsigned int i
, nelt
= d
->nelt
;
12847 /* Generic code will try constant permutation twice. Once with the
12848 original mode and again with the elements lowered to QImode.
12849 So wait and don't do the selector expansion ourselves. */
12850 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
12853 for (i
= 0; i
< nelt
; ++i
)
12855 int nunits
= GET_MODE_NUNITS (vmode
);
12857 /* If big-endian and two vectors we end up with a weird mixed-endian
12858 mode on NEON. Reverse the index within each word but not the word
12860 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? d
->perm
[i
] ^ (nunits
- 1)
12863 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
12864 sel
= force_reg (vmode
, sel
);
12866 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
12871 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
12873 /* The pattern matching functions above are written to look for a small
12874 number to begin the sequence (0, 1, N/2). If we begin with an index
12875 from the second operand, we can swap the operands. */
12876 if (d
->perm
[0] >= d
->nelt
)
12878 unsigned i
, nelt
= d
->nelt
;
12880 gcc_assert (nelt
== (nelt
& -nelt
));
12881 for (i
= 0; i
< nelt
; ++i
)
12882 d
->perm
[i
] ^= nelt
; /* Keep the same index, but in the other vector. */
12884 std::swap (d
->op0
, d
->op1
);
12889 if (aarch64_evpc_rev (d
))
12891 else if (aarch64_evpc_ext (d
))
12893 else if (aarch64_evpc_dup (d
))
12895 else if (aarch64_evpc_zip (d
))
12897 else if (aarch64_evpc_uzp (d
))
12899 else if (aarch64_evpc_trn (d
))
12901 return aarch64_evpc_tbl (d
);
12906 /* Expand a vec_perm_const pattern. */
12909 aarch64_expand_vec_perm_const (rtx target
, rtx op0
, rtx op1
, rtx sel
)
12911 struct expand_vec_perm_d d
;
12912 int i
, nelt
, which
;
12918 d
.vmode
= GET_MODE (target
);
12919 gcc_assert (VECTOR_MODE_P (d
.vmode
));
12920 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
12921 d
.testing_p
= false;
12923 for (i
= which
= 0; i
< nelt
; ++i
)
12925 rtx e
= XVECEXP (sel
, 0, i
);
12926 int ei
= INTVAL (e
) & (2 * nelt
- 1);
12927 which
|= (ei
< nelt
? 1 : 2);
12934 gcc_unreachable ();
12937 d
.one_vector_p
= false;
12938 if (!rtx_equal_p (op0
, op1
))
12941 /* The elements of PERM do not suggest that only the first operand
12942 is used, but both operands are identical. Allow easier matching
12943 of the permutation by folding the permutation into the single
12945 /* Fall Through. */
12947 for (i
= 0; i
< nelt
; ++i
)
12948 d
.perm
[i
] &= nelt
- 1;
12950 d
.one_vector_p
= true;
12955 d
.one_vector_p
= true;
12959 return aarch64_expand_vec_perm_const_1 (&d
);
12963 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
12964 const unsigned char *sel
)
12966 struct expand_vec_perm_d d
;
12967 unsigned int i
, nelt
, which
;
12971 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
12972 d
.testing_p
= true;
12973 memcpy (d
.perm
, sel
, nelt
);
12975 /* Calculate whether all elements are in one vector. */
12976 for (i
= which
= 0; i
< nelt
; ++i
)
12978 unsigned char e
= d
.perm
[i
];
12979 gcc_assert (e
< 2 * nelt
);
12980 which
|= (e
< nelt
? 1 : 2);
12983 /* If all elements are from the second vector, reindex as if from the
12986 for (i
= 0; i
< nelt
; ++i
)
12989 /* Check whether the mask can be applied to a single vector. */
12990 d
.one_vector_p
= (which
!= 3);
12992 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
12993 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
12994 if (!d
.one_vector_p
)
12995 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
12998 ret
= aarch64_expand_vec_perm_const_1 (&d
);
  end_sequence ();

  return ret;
}

rtx
aarch64_reverse_mask (enum machine_mode mode)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  int i, j;
  int nunits = GET_MODE_NUNITS (mode);
  int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
/* Implement MODES_TIEABLE_P.  In principle we should always return true.
   However due to issues with register allocation it is preferable to avoid
   tying integer scalar and FP scalar modes.  Executing integer operations
   in general registers is better than treating them as scalar vector
   operations.  This reduces latency and avoids redundant int<->FP moves.
   So tie modes if they are either the same class, or vector modes with
   other vector modes, vector structs or any scalar mode.  */

bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  */
  if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, int amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
                                    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));

  return aarch64_move_pointer (pointer, amount);
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
                                              machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
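/* The movmem expansion below combines these helpers so that, for example,
   a 7-byte copy can be emitted as one 4-byte block followed by a second
   4-byte block repositioned to overlap the first by one byte, avoiding a
   byte-by-byte tail loop.  */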
13097 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13098 we succeed, otherwise return false. */
13101 aarch64_expand_movmem (rtx
*operands
)
13104 rtx dst
= operands
[0];
13105 rtx src
= operands
[1];
13107 bool speed_p
= !optimize_function_for_size_p (cfun
);
13109 /* When optimizing for size, give a better estimate of the length of a
13110 memcpy call, but use the default otherwise. */
13111 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
13113 /* We can't do anything smart if the amount to copy is not constant. */
13114 if (!CONST_INT_P (operands
[2]))
13117 n
= UINTVAL (operands
[2]);
13119 /* Try to keep the number of instructions low. For cases below 16 bytes we
13120 need to make at most two moves. For cases above 16 bytes it will be one
13121 move for each 16 byte chunk, then at most two additional moves. */
13122 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
13125 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
13126 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
13128 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
13129 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
13131 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13137 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
13142 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
13147 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13148 4-byte chunk, partially overlapping with the previously copied chunk. */
13151 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
13157 src
= aarch64_move_pointer (src
, move
);
13158 dst
= aarch64_move_pointer (dst
, move
);
13159 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
13164 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13165 them, then (if applicable) an 8-byte chunk. */
13170 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
13175 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
13180 /* Finish the final bytes of the copy. We can always do this in one
13181 instruction. We either copy the exact amount we need, or partially
13182 overlap with the previous chunk we copied and copy 8-bytes. */
13186 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
13188 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
13190 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
13195 src
= aarch64_move_pointer (src
, -1);
13196 dst
= aarch64_move_pointer (dst
, -1);
13197 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
13203 src
= aarch64_move_pointer (src
, move
);
13204 dst
= aarch64_move_pointer (dst
, move
);
13205 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
    }

  return true;
}

/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  return (HOST_WIDE_INT_1 << 36);
}

static bool
aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
                                        unsigned int align,
                                        enum by_pieces_operation op,
                                        bool speed_p)
{
  /* STORE_BY_PIECES can be used when copying a constant string, but
     in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
     For now we always fail this and let the move_by_pieces code copy
     the string from read-only memory.  */
  if (op == STORE_BY_PIECES)
    return false;

  return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
}
13237 aarch64_gen_ccmp_first (rtx
*prep_seq
, rtx
*gen_seq
,
13238 int code
, tree treeop0
, tree treeop1
)
13240 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
13242 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
13244 struct expand_operand ops
[4];
13247 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
13249 op_mode
= GET_MODE (op0
);
13250 if (op_mode
== VOIDmode
)
13251 op_mode
= GET_MODE (op1
);
13259 icode
= CODE_FOR_cmpsi
;
13264 icode
= CODE_FOR_cmpdi
;
13269 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
13270 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
13275 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
13276 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
13284 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
13285 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
13291 *prep_seq
= get_insns ();
13294 create_fixed_operand (&ops
[0], op0
);
13295 create_fixed_operand (&ops
[1], op1
);
13298 if (!maybe_expand_insn (icode
, 2, ops
))
13303 *gen_seq
= get_insns ();
13306 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
13307 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
static rtx
aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
		       tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  push_to_sequence ((rtx_insn *) *prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case QImode:
    case HImode:
    case SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_ccmpsi;
      break;

    case DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_ccmpdi;
      break;

    case SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
      break;

    case DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
						GET_MODE (XEXP (prev, 0))),
			     VOIDmode, XEXP (prev, 0), const0_rtx);
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  push_to_sequence ((rtx_insn *) *gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
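/* Taken together, aarch64_gen_ccmp_first and aarch64_gen_ccmp_next allow the
   middle end to expand a chain of conditions such as (a < b && c == d) into
   one CMP followed by CCMP instructions, each consuming the flags produced
   by the previous comparison.  */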
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if the target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
   should be kept together during scheduling.  */

static bool
aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  rtx set_dest;
  rtx prev_set = single_set (prev);
  rtx curr_set = single_set (curr);
  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);

  if (!aarch64_macro_fusion_p ())
    return false;

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
    {
      /* We are trying to match:
	 prev (mov)  == (set (reg r0) (const_int imm16))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 16))
			     (const_int imm16_1))  */

      set_dest = SET_DEST (curr_set);

      if (GET_CODE (set_dest) == ZERO_EXTRACT
	  && CONST_INT_P (SET_SRC (curr_set))
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (XEXP (set_dest, 2))
	  && INTVAL (XEXP (set_dest, 2)) == 16
	  && REG_P (XEXP (set_dest, 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r1)
			     (high (symbol_ref ("SYM"))))
	 curr (add)  == (set (reg r0)
			     (lo_sum (reg r1)
				     (symbol_ref ("SYM"))))
	 Note that r0 need not necessarily be the same as r1, especially
	 during pre-regalloc scheduling.  */

      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
	      && REG_P (XEXP (SET_SRC (curr_set), 0))
	      && REGNO (XEXP (SET_SRC (curr_set), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
			      XEXP (SET_SRC (curr_set), 1)))
	    return true;
	}
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
    {
      /* We're trying to match:
	 prev (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 32))
			     (const_int imm16_1))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 48))
			     (const_int imm16_2))  */

      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
	  && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
	  && REG_P (XEXP (SET_DEST (prev_set), 0))
	  && REG_P (XEXP (SET_DEST (curr_set), 0))
	  && REGNO (XEXP (SET_DEST (prev_set), 0))
	     == REGNO (XEXP (SET_DEST (curr_set), 0))
	  && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
	  && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
	  && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
	  && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (SET_SRC (curr_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r0)
			     (high (symbol_ref ("SYM"))))
	 curr (ldr)  == (set (reg r1)
			     (mem (lo_sum (reg r0)
					  (symbol_ref ("SYM")))))
	 or
	 curr (ldr)  == (set (reg r1)
			     (zero_extend (mem
					   (lo_sum (reg r0)
						   (symbol_ref ("SYM"))))))  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  rtx curr_src = SET_SRC (curr_set);

	  if (GET_CODE (curr_src) == ZERO_EXTEND)
	    curr_src = XEXP (curr_src, 0);

	  if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
	      && REG_P (XEXP (XEXP (curr_src, 0), 0))
	      && REGNO (XEXP (XEXP (curr_src, 0), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
			      XEXP (SET_SRC (prev_set), 0)))
	    return true;
	}
    }

  if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
      && aarch_crypto_can_dual_issue (prev, curr))
    return true;

  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
      && any_condjump_p (curr))
    {
      enum attr_type prev_type = get_attr_type (prev);

      /* FIXME: this misses some instructions which are considered simple
	 arithmetic for ThunderX.  Simple shifts are missed here.  */
      if (prev_type == TYPE_ALUS_SREG
	  || prev_type == TYPE_ALUS_IMM
	  || prev_type == TYPE_LOGICS_REG
	  || prev_type == TYPE_LOGICS_IMM)
	return true;
    }

  return false;
}
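/* As an example of the first pattern above, the pair
     mov  x0, #0x1234
     movk x0, #0x5678, lsl #16
   is kept adjacent when AARCH64_FUSE_MOV_MOVK is enabled, so cores that fuse
   MOV/MOVK can issue the two instructions as a single macro-op.  */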
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
/* If MEM is in the form of [base+offset], extract the two parts
   of the address and set them to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
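/* For example, (mem (plus (reg x1) (const_int 8))) yields BASE == x1 and
   OFFSET == 8, a bare (mem (reg x1)) yields OFFSET == const0_rtx, and any
   other addressing form (writeback, register index) is rejected.  */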
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};

/* If INSN is a load or store of an address in the form of [base+offset],
   extract the two parts and set them to BASE and OFFSET.  Return the
   scheduling fusion type of this INSN.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }

  if (GET_CODE (src) == MEM && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}
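/* Note that a store of zero also qualifies: "str wzr, [x0, 16]" has
   const0_rtx as its source, so it is classified as SCHED_FUSION_ST and can
   be paired with a neighbouring store just like a register store.  */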
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr and str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */

static void
aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
			       int *fusion_pri, int *pri)
{
  int tmp, off_val;
  rtx base, offset;
  enum sched_fusion_type fusion;

  gcc_assert (INSN_P (insn));

  tmp = max_pri - 1;
  fusion = fusion_load_store (insn, &base, &offset);
  if (fusion == SCHED_FUSION_NONE)
    {
      *pri = tmp;
      *fusion_pri = tmp;
      return;
    }

  /* Set FUSION_PRI according to fusion type and base register.  */
  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);

  /* Calculate PRI.  */
  tmp /= 2;

  /* INSN with smaller offset goes first.  */
  off_val = (int)(INTVAL (offset));
  if (off_val >= 0)
    tmp -= (off_val & 0xfffff);
  else
    tmp += ((- off_val) & 0xfffff);

  *pri = tmp;
  return;
}
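/* For example, two SImode loads from [x1, #4] and [x1, #8] share a fusion
   type and base register and therefore receive the same FUSION_PRI, while
   the smaller offset gets the larger PRI, so the scheduler tends to place
   them back to back where the ldp peepholes can pick them up.  */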
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
				enum machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
	return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are the same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  msize = GET_MODE_SIZE (mode);
  /* Check if the offsets are consecutive.  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by a load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
	return false;
    }

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of the same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
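/* For loads the first destination must not appear in its own address:
   "ldr x1, [x1]; ldr x2, [x1, 8]" cannot become an ldp because the first
   load clobbers the base register needed by the second.  */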
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given the below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
				       enum machine_mode mode)
{
  enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
  HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
  rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
  rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;

  if (load)
    {
      reg_1 = operands[0];
      mem_1 = operands[1];
      reg_2 = operands[2];
      mem_2 = operands[3];
      reg_3 = operands[4];
      mem_3 = operands[5];
      reg_4 = operands[6];
      mem_4 = operands[7];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2)
		  && REG_P (reg_3) && REG_P (reg_4));
      if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
	return false;
    }
  else
    {
      mem_1 = operands[0];
      reg_1 = operands[1];
      mem_2 = operands[2];
      reg_2 = operands[3];
      mem_3 = operands[4];
      reg_3 = operands[5];
      mem_4 = operands[6];
      reg_4 = operands[7];
    }
  /* Skip if the memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
    return false;

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
      || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
  if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
  if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
    return false;

  /* Check if the bases are the same.  */
  if (!rtx_equal_p (base_1, base_2)
      || !rtx_equal_p (base_2, base_3)
      || !rtx_equal_p (base_3, base_4))
    return false;

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  offval_3 = INTVAL (offset_3);
  offval_4 = INTVAL (offset_4);
  msize = GET_MODE_SIZE (mode);
  /* Check if the offsets are consecutive.  */
  if ((offval_1 != (offval_2 + msize)
       || offval_1 != (offval_3 + msize * 2)
       || offval_1 != (offval_4 + msize * 3))
      && (offval_4 != (offval_3 + msize)
	  || offval_4 != (offval_2 + msize * 2)
	  || offval_4 != (offval_1 + msize * 3)))
    return false;

  /* Check if the addresses are clobbered by a load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1)
	  || reg_mentioned_p (reg_2, mem_2)
	  || reg_mentioned_p (reg_3, mem_3))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
	return false;
    }

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
    rclass_3 = FP_REGS;
  else
    rclass_3 = GENERAL_REGS;

  if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
    rclass_4 = FP_REGS;
  else
    rclass_4 = GENERAL_REGS;

  /* Check if the registers are of the same class.  */
  if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
    return false;

  return true;
}
/* Given OPERANDS of consecutive load/store, this function pairs them
   into ldp/stp after adjusting the offset.  It depends on the fact
   that the addresses of load/store instructions are in increasing order.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands; it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
			     enum machine_mode mode, RTX_CODE code)
{
  rtx base, offset, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      mem_3 = operands[5];
      mem_4 = operands[7];
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      mem_3 = operands[4];
      mem_4 = operands[6];
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset);
  gcc_assert (base != NULL_RTX && offset != NULL_RTX);

  /* Adjust the offset so it can fit in an ldp/stp instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_limit = msize * 0x40;
  off_val = INTVAL (offset);
  abs_off = (off_val < 0) ? -off_val : off_val;
  new_off = abs_off % stp_off_limit;
  adj_off = abs_off - new_off;

  /* Further adjust to make sure all offsets are OK.  */
  if ((new_off + msize * 2) >= stp_off_limit)
    {
      adj_off += stp_off_limit;
      new_off -= stp_off_limit;
    }

  /* Make sure the adjustment can be done with ADD/SUB instructions.  */
  if (adj_off >= 0x1000)
    return false;

  if (off_val < 0)
    {
      adj_off = -adj_off;
      new_off = -new_off;
    }

  /* Create new memory references.  */
  mem_1 = change_address (mem_1, VOIDmode,
			  plus_constant (DImode, operands[8], new_off));

  /* Check if the adjusted address is OK for ldp/stp.  */
  if (!aarch64_mem_pair_operand (mem_1, mode))
    return false;

  msize = GET_MODE_SIZE (mode);
  mem_2 = change_address (mem_2, VOIDmode,
			  plus_constant (DImode,
					 operands[8],
					 new_off + msize));
  mem_3 = change_address (mem_3, VOIDmode,
			  plus_constant (DImode,
					 operands[8],
					 new_off + msize * 2));
  mem_4 = change_address (mem_4, VOIDmode,
			  plus_constant (DImode,
					 operands[8],
					 new_off + msize * 3));

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[1] = mem_1;
      operands[3] = mem_2;
      operands[5] = mem_3;
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[2] = mem_2;
      operands[4] = mem_3;
      operands[6] = mem_4;
    }

  /* Emit the adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));

  return true;
}
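/* The RTL emitted above has exactly the shape shown in the comment before
   aarch64_operands_adjust_ok_for_ldpstp: one SET computing base + adj_off
   into the scratch register, followed by two PARALLELs that the ldp/stp
   patterns recognize.  */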
/* Return true if a pseudo register should be created and used to hold
   the GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static unsigned int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2,
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
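/* For example, 4.0 yields 2 and 32.0 yields 5, while 0.5, 3.0, -2.0, NaN
   and infinity all yield -1.  */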
/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
/* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
    return float_type_node;
  return NULL_TREE;
}
/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"