[AArch64] Remove aarch64_frame_pointer_required
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob f58f192273e3ad313e154f28df0a88188cde36db
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
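/* Illustrative assembly forms for these classifications (register and
   symbol names are arbitrary):

     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!  or  [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, .Lpool_entry  */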
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
145 vec_perm_indices);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
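/* As an illustration, an entry such as AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)
   in aarch64-fusion-pairs.def expands to { "adrp+add", AARCH64_FUSE_ADRP_ADD }
   in this table, making the name usable in -moverride=fuse=...; see the .def
   file for the authoritative list of entries.  */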
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
826 for now. */
827 static const struct tune_params saphira_tunings =
829 &generic_extra_costs,
830 &generic_addrcost_table,
831 &generic_regmove_cost,
832 &generic_vector_cost,
833 &generic_branch_cost,
834 &generic_approx_modes,
835 4, /* memmov_cost */
836 4, /* issue_rate */
837 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 16, /* function_align. */
840 8, /* jump_align. */
841 16, /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
853 static const struct tune_params thunderx2t99_tunings =
855 &thunderx2t99_extra_costs,
856 &thunderx2t99_addrcost_table,
857 &thunderx2t99_regmove_cost,
858 &thunderx2t99_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 4, /* memmov_cost. */
862 4, /* issue_rate. */
863 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
864 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
865 16, /* function_align. */
866 8, /* jump_align. */
867 16, /* loop_align. */
868 3, /* int_reassoc_width. */
869 2, /* fp_reassoc_width. */
870 2, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &thunderx2t99_prefetch_tune
879 /* Support for fine-grained override of the tuning structures. */
880 struct aarch64_tuning_override_function
882 const char* name;
883 void (*parse_override)(const char*, struct tune_params*);
886 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
887 static void aarch64_parse_tune_string (const char*, struct tune_params*);
889 static const struct aarch64_tuning_override_function
890 aarch64_tuning_override_functions[] =
892 { "fuse", aarch64_parse_fuse_string },
893 { "tune", aarch64_parse_tune_string },
894 { NULL, NULL }
897 /* A processor implementing AArch64. */
898 struct processor
900 const char *const name;
901 enum aarch64_processor ident;
902 enum aarch64_processor sched_core;
903 enum aarch64_arch arch;
904 unsigned architecture_version;
905 const unsigned long flags;
906 const struct tune_params *const tune;
909 /* Architectures implementing AArch64. */
910 static const struct processor all_architectures[] =
912 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
913 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
914 #include "aarch64-arches.def"
915 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
918 /* Processor cores implementing AArch64. */
919 static const struct processor all_cores[] =
921 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
922 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
923 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
924 FLAGS, &COSTS##_tunings},
925 #include "aarch64-cores.def"
926 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
927 AARCH64_FL_FOR_ARCH8, &generic_tunings},
928 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
932 /* Target specification. These are populated by the -march, -mtune, -mcpu
933 handling code or by target attributes. */
934 static const struct processor *selected_arch;
935 static const struct processor *selected_cpu;
936 static const struct processor *selected_tune;
938 /* The current tuning set. */
939 struct tune_params aarch64_tune_params = generic_tunings;
941 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
943 /* An ISA extension in the co-processor and main instruction set space. */
944 struct aarch64_option_extension
946 const char *const name;
947 const unsigned long flags_on;
948 const unsigned long flags_off;
951 typedef enum aarch64_cond_code
953 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
954 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
955 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
957 aarch64_cc;
959 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
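/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) yields AARCH64_LT:
   flipping the low bit of the 4-bit condition encoding inverts the condition,
   mirroring the architectural NZCV condition-code encoding (AL/NV excepted).  */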
961 /* The condition codes of the processor, and the inverse function. */
962 static const char * const aarch64_condition_codes[] =
964 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
965 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
968 /* Generate code to enable conditional branches in functions over 1 MiB. */
969 const char *
970 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
971 const char * branch_format)
973 rtx_code_label * tmp_label = gen_label_rtx ();
974 char label_buf[256];
975 char buffer[128];
976 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
977 CODE_LABEL_NUMBER (tmp_label));
978 const char *label_ptr = targetm.strip_name_encoding (label_buf);
979 rtx dest_label = operands[pos_label];
980 operands[pos_label] = tmp_label;
982 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
983 output_asm_insn (buffer, operands);
985 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
986 operands[pos_label] = dest_label;
987 output_asm_insn (buffer, operands);
988 return "";
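/* Illustrative sketch (label and register names are arbitrary): when the
   caller passes an inverted-condition BRANCH_FORMAT, the emitted sequence
   has the form

       b.ne    .Ltmp           // short-range conditional branch (+/- 1 MiB)
       b       .Lreal_target   // unconditional branch (+/- 128 MiB)
     .Ltmp:

   so the conditional branch only needs to reach the local fall-through label.  */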
991 void
992 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
994 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
995 if (TARGET_GENERAL_REGS_ONLY)
996 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
997 else
998 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1001 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1002 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1003 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1004 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1005 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1006 irrespectively of its cost results in bad allocations with many redundant
1007 int<->FP moves which are expensive on various cores.
1008 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1009 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1010 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1011 Otherwise set the allocno class depending on the mode.
1012 The result of this is that it is no longer inefficient to have a higher
1013 memory move cost than the register move cost.
1016 static reg_class_t
1017 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1018 reg_class_t best_class)
1020 machine_mode mode;
1022 if (allocno_class != ALL_REGS)
1023 return allocno_class;
1025 if (best_class != ALL_REGS)
1026 return best_class;
1028 mode = PSEUDO_REGNO_MODE (regno);
1029 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1032 static unsigned int
1033 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1035 if (GET_MODE_UNIT_SIZE (mode) == 4)
1036 return aarch64_tune_params.min_div_recip_mul_sf;
1037 return aarch64_tune_params.min_div_recip_mul_df;
1040 static int
1041 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1042 machine_mode mode)
1044 if (VECTOR_MODE_P (mode))
1045 return aarch64_tune_params.vec_reassoc_width;
1046 if (INTEGRAL_MODE_P (mode))
1047 return aarch64_tune_params.int_reassoc_width;
1048 if (FLOAT_MODE_P (mode))
1049 return aarch64_tune_params.fp_reassoc_width;
1050 return 1;
1053 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1054 unsigned
1055 aarch64_dbx_register_number (unsigned regno)
1057 if (GP_REGNUM_P (regno))
1058 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1059 else if (regno == SP_REGNUM)
1060 return AARCH64_DWARF_SP;
1061 else if (FP_REGNUM_P (regno))
1062 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1064 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1065 equivalent DWARF register. */
1066 return DWARF_FRAME_REGISTERS;
1069 /* Return TRUE if MODE is any of the large INT modes. */
1070 static bool
1071 aarch64_vect_struct_mode_p (machine_mode mode)
1073 return mode == OImode || mode == CImode || mode == XImode;
1076 /* Return TRUE if MODE is any of the vector modes. */
1077 static bool
1078 aarch64_vector_mode_p (machine_mode mode)
1080 return aarch64_vector_mode_supported_p (mode)
1081 || aarch64_vect_struct_mode_p (mode);
1084 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1085 static bool
1086 aarch64_array_mode_supported_p (machine_mode mode,
1087 unsigned HOST_WIDE_INT nelems)
1089 if (TARGET_SIMD
1090 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1091 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1092 && (nelems >= 2 && nelems <= 4))
1093 return true;
1095 return false;
1098 /* Implement TARGET_HARD_REGNO_NREGS. */
1100 static unsigned int
1101 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1103 switch (aarch64_regno_regclass (regno))
1105 case FP_REGS:
1106 case FP_LO_REGS:
1107 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1108 default:
1109 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1111 gcc_unreachable ();
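/* For example, with 128-bit vector registers (UNITS_PER_VREG == 16) and
   64-bit general registers (UNITS_PER_WORD == 8), a 16-byte TImode value
   needs two general registers, a 16-byte V4SImode value needs a single
   FP/SIMD register, and a 32-byte OImode structure needs two FP/SIMD
   registers.  */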
1114 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1116 static bool
1117 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1119 if (GET_MODE_CLASS (mode) == MODE_CC)
1120 return regno == CC_REGNUM;
1122 if (regno == SP_REGNUM)
1123 /* The purpose of comparing with ptr_mode is to support the
1124 global register variable associated with the stack pointer
1125 register via the syntax of asm ("wsp") in ILP32. */
1126 return mode == Pmode || mode == ptr_mode;
1128 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1129 return mode == Pmode;
1131 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1132 return true;
1134 if (FP_REGNUM_P (regno))
1136 if (aarch64_vect_struct_mode_p (mode))
1137 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1138 else
1139 return true;
1142 return false;
1145 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1146 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1147 clobbers the top 64 bits when restoring the bottom 64 bits. */
1149 static bool
1150 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1152 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
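/* For example, although v8-v15 are call-saved under the AAPCS64, only their
   low 64 bits are preserved, so a 128-bit V2DFmode value live in v8 across a
   call must still be spilled, whereas a 64-bit DFmode value in v8 need not be
   (GET_MODE_SIZE (DFmode) == 8).  */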
1155 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1156 machine_mode
1157 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1158 machine_mode mode)
1160 /* Handle modes that fit within single registers. */
1161 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1163 if (GET_MODE_SIZE (mode) >= 4)
1164 return mode;
1165 else
1166 return SImode;
1168 /* Fall back to generic for multi-reg and very large modes. */
1169 else
1170 return choose_hard_reg_mode (regno, nregs, false);
1173 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1174 that strcpy from constants will be faster. */
1176 static HOST_WIDE_INT
1177 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1179 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1180 return MAX (align, BITS_PER_WORD);
1181 return align;
1184 /* Return true if calls to DECL should be treated as
1185 long-calls (ie called via a register). */
1186 static bool
1187 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1189 return false;
1192 /* Return true if calls to symbol-ref SYM should be treated as
1193 long-calls (ie called via a register). */
1194 bool
1195 aarch64_is_long_call_p (rtx sym)
1197 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1200 /* Return true if calls to symbol-ref SYM should not go through
1201 plt stubs. */
1203 bool
1204 aarch64_is_noplt_call_p (rtx sym)
1206 const_tree decl = SYMBOL_REF_DECL (sym);
1208 if (flag_pic
1209 && decl
1210 && (!flag_plt
1211 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1212 && !targetm.binds_local_p (decl))
1213 return true;
1215 return false;
1218 /* Return true if the offsets to a zero/sign-extract operation
1219 represent an expression that matches an extend operation. The
1220    operands represent the parameters from
1222 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1223 bool
1224 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1225 rtx extract_imm)
1227 HOST_WIDE_INT mult_val, extract_val;
1229 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1230 return false;
1232 mult_val = INTVAL (mult_imm);
1233 extract_val = INTVAL (extract_imm);
1235 if (extract_val > 8
1236 && extract_val < GET_MODE_BITSIZE (mode)
1237 && exact_log2 (extract_val & ~7) > 0
1238 && (extract_val & 7) <= 4
1239 && mult_val == (1 << (extract_val & 7)))
1240 return true;
1242 return false;
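/* Worked example (illustrative): for
     (zero_extract:DI (mult:DI (reg:DI x) (const_int 4))
                      (const_int 34) (const_int 0))
   we have extract_val == 34 and mult_val == 4; 34 & ~7 == 32 is a power of
   two, 34 & 7 == 2 <= 4, and 4 == 1 << 2, so this matches the extend form:
   a 32-bit value zero-extended and shifted left by 2, i.e. the operand
   written "uxtw #2" in assembly.  */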
1245 /* Emit an insn that's a simple single-set. Both the operands must be
1246 known to be valid. */
1247 inline static rtx_insn *
1248 emit_set_insn (rtx x, rtx y)
1250 return emit_insn (gen_rtx_SET (x, y));
1253 /* X and Y are two things to compare using CODE. Emit the compare insn and
1254    return the rtx for the CC register in the proper mode.  */
1256 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1258 machine_mode mode = SELECT_CC_MODE (code, x, y);
1259 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1261 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1262 return cc_reg;
1265 /* Build the SYMBOL_REF for __tls_get_addr. */
1267 static GTY(()) rtx tls_get_addr_libfunc;
1270 aarch64_tls_get_addr (void)
1272 if (!tls_get_addr_libfunc)
1273 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1274 return tls_get_addr_libfunc;
1277 /* Return the TLS model to use for ADDR. */
1279 static enum tls_model
1280 tls_symbolic_operand_type (rtx addr)
1282 enum tls_model tls_kind = TLS_MODEL_NONE;
1283 rtx sym, addend;
1285 if (GET_CODE (addr) == CONST)
1287 split_const (addr, &sym, &addend);
1288 if (GET_CODE (sym) == SYMBOL_REF)
1289 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1291 else if (GET_CODE (addr) == SYMBOL_REF)
1292 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1294 return tls_kind;
1297 /* We accept lo_sum expressions in our legitimate addresses
1298    so that combine can take care of combining addresses where
1299    necessary, but for generation purposes we generate the address
1300    as:
1301 RTL Absolute
1302 tmp = hi (symbol_ref); adrp x1, foo
1303 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1306 PIC TLS
1307 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1308 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1309 bl __tls_get_addr
1312 Load TLS symbol, depending on TLS mechanism and TLS access model.
1314 Global Dynamic - Traditional TLS:
1315 adrp tmp, :tlsgd:imm
1316 add dest, tmp, #:tlsgd_lo12:imm
1317 bl __tls_get_addr
1319 Global Dynamic - TLS Descriptors:
1320 adrp dest, :tlsdesc:imm
1321 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1322 add dest, dest, #:tlsdesc_lo12:imm
1323 blr tmp
1324 mrs tp, tpidr_el0
1325 add dest, dest, tp
1327 Initial Exec:
1328 mrs tp, tpidr_el0
1329 adrp tmp, :gottprel:imm
1330 ldr dest, [tmp, #:gottprel_lo12:imm]
1331 add dest, dest, tp
1333 Local Exec:
1334 mrs tp, tpidr_el0
1335 add t0, tp, #:tprel_hi12:imm, lsl #12
1336 add t0, t0, #:tprel_lo12_nc:imm
1339 static void
1340 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1341 enum aarch64_symbol_type type)
1343 switch (type)
1345 case SYMBOL_SMALL_ABSOLUTE:
1347 /* In ILP32, the mode of dest can be either SImode or DImode. */
1348 rtx tmp_reg = dest;
1349 machine_mode mode = GET_MODE (dest);
1351 gcc_assert (mode == Pmode || mode == ptr_mode);
1353 if (can_create_pseudo_p ())
1354 tmp_reg = gen_reg_rtx (mode);
1356 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1357 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1358 return;
1361 case SYMBOL_TINY_ABSOLUTE:
1362 emit_insn (gen_rtx_SET (dest, imm));
1363 return;
1365 case SYMBOL_SMALL_GOT_28K:
1367 machine_mode mode = GET_MODE (dest);
1368 rtx gp_rtx = pic_offset_table_rtx;
1369 rtx insn;
1370 rtx mem;
1372  /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1373     here before RTL expansion.  Tree IVOPTS will generate RTL patterns to
1374     decide rtx costs, in which case pic_offset_table_rtx is not
1375     initialized.  In that case there is no need to generate the first adrp
1376     instruction, as the final cost for global variable access is
1377     one instruction.
1378 if (gp_rtx != NULL)
1380  /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1381     use the page base as the GOT base, the first page may be wasted;
1382     in the worst case there is only 28K of space for the GOT).
1384     The generated instruction sequence for accessing a global variable is:
1387 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1389     Only one instruction is needed.  But we must initialize
1390     pic_offset_table_rtx properly.  We generate an initialization insn for
1391     every global access, and allow CSE to remove all redundant ones.
1393     The final instruction sequence will look like the following
1394     for multiple global variable accesses.
1396 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1398 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1399 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1400 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1401 ... */
1403 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1404 crtl->uses_pic_offset_table = 1;
1405 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1407 if (mode != GET_MODE (gp_rtx))
1408 gp_rtx = gen_lowpart (mode, gp_rtx);
1412 if (mode == ptr_mode)
1414 if (mode == DImode)
1415 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1416 else
1417 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1419 mem = XVECEXP (SET_SRC (insn), 0, 0);
1421 else
1423 gcc_assert (mode == Pmode);
1425 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1426 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1429  /* The operand is expected to be a MEM.  Whenever the related insn
1430     pattern changes, the code above which calculates mem should be
1431     updated.  */
1432 gcc_assert (GET_CODE (mem) == MEM);
1433 MEM_READONLY_P (mem) = 1;
1434 MEM_NOTRAP_P (mem) = 1;
1435 emit_insn (insn);
1436 return;
1439 case SYMBOL_SMALL_GOT_4G:
1441 /* In ILP32, the mode of dest can be either SImode or DImode,
1442 while the got entry is always of SImode size. The mode of
1443 dest depends on how dest is used: if dest is assigned to a
1444 pointer (e.g. in the memory), it has SImode; it may have
1445     DImode if dest is dereferenced to access the memory.
1446 This is why we have to handle three different ldr_got_small
1447 patterns here (two patterns for ILP32). */
1449 rtx insn;
1450 rtx mem;
1451 rtx tmp_reg = dest;
1452 machine_mode mode = GET_MODE (dest);
1454 if (can_create_pseudo_p ())
1455 tmp_reg = gen_reg_rtx (mode);
1457 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1458 if (mode == ptr_mode)
1460 if (mode == DImode)
1461 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1462 else
1463 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1465 mem = XVECEXP (SET_SRC (insn), 0, 0);
1467 else
1469 gcc_assert (mode == Pmode);
1471 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1472 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1475 gcc_assert (GET_CODE (mem) == MEM);
1476 MEM_READONLY_P (mem) = 1;
1477 MEM_NOTRAP_P (mem) = 1;
1478 emit_insn (insn);
1479 return;
1482 case SYMBOL_SMALL_TLSGD:
1484 rtx_insn *insns;
1485 machine_mode mode = GET_MODE (dest);
1486 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1488 start_sequence ();
1489 if (TARGET_ILP32)
1490 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1491 else
1492 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1493 insns = get_insns ();
1494 end_sequence ();
1496 RTL_CONST_CALL_P (insns) = 1;
1497 emit_libcall_block (insns, dest, result, imm);
1498 return;
1501 case SYMBOL_SMALL_TLSDESC:
1503 machine_mode mode = GET_MODE (dest);
1504 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1505 rtx tp;
1507 gcc_assert (mode == Pmode || mode == ptr_mode);
1509 /* In ILP32, the got entry is always of SImode size. Unlike
1510 small GOT, the dest is fixed at reg 0. */
1511 if (TARGET_ILP32)
1512 emit_insn (gen_tlsdesc_small_si (imm));
1513 else
1514 emit_insn (gen_tlsdesc_small_di (imm));
1515 tp = aarch64_load_tp (NULL);
1517 if (mode != Pmode)
1518 tp = gen_lowpart (mode, tp);
1520 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1521 if (REG_P (dest))
1522 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1523 return;
1526 case SYMBOL_SMALL_TLSIE:
1528 /* In ILP32, the mode of dest can be either SImode or DImode,
1529 while the got entry is always of SImode size. The mode of
1530 dest depends on how dest is used: if dest is assigned to a
1531 pointer (e.g. in the memory), it has SImode; it may have
1532     DImode if dest is dereferenced to access the memory.
1533 This is why we have to handle three different tlsie_small
1534 patterns here (two patterns for ILP32). */
1535 machine_mode mode = GET_MODE (dest);
1536 rtx tmp_reg = gen_reg_rtx (mode);
1537 rtx tp = aarch64_load_tp (NULL);
1539 if (mode == ptr_mode)
1541 if (mode == DImode)
1542 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1543 else
1545 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1546 tp = gen_lowpart (mode, tp);
1549 else
1551 gcc_assert (mode == Pmode);
1552 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1555 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1556 if (REG_P (dest))
1557 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1558 return;
1561 case SYMBOL_TLSLE12:
1562 case SYMBOL_TLSLE24:
1563 case SYMBOL_TLSLE32:
1564 case SYMBOL_TLSLE48:
1566 machine_mode mode = GET_MODE (dest);
1567 rtx tp = aarch64_load_tp (NULL);
1569 if (mode != Pmode)
1570 tp = gen_lowpart (mode, tp);
1572 switch (type)
1574 case SYMBOL_TLSLE12:
1575 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1576 (dest, tp, imm));
1577 break;
1578 case SYMBOL_TLSLE24:
1579 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1580 (dest, tp, imm));
1581 break;
1582 case SYMBOL_TLSLE32:
1583 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1584 (dest, imm));
1585 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1586 (dest, dest, tp));
1587 break;
1588 case SYMBOL_TLSLE48:
1589 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1590 (dest, imm));
1591 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1592 (dest, dest, tp));
1593 break;
1594 default:
1595 gcc_unreachable ();
1598 if (REG_P (dest))
1599 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1600 return;
1603 case SYMBOL_TINY_GOT:
1604 emit_insn (gen_ldr_got_tiny (dest, imm));
1605 return;
1607 case SYMBOL_TINY_TLSIE:
1609 machine_mode mode = GET_MODE (dest);
1610 rtx tp = aarch64_load_tp (NULL);
1612 if (mode == ptr_mode)
1614 if (mode == DImode)
1615 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1616 else
1618 tp = gen_lowpart (mode, tp);
1619 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1622 else
1624 gcc_assert (mode == Pmode);
1625 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1628 if (REG_P (dest))
1629 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1630 return;
1633 default:
1634 gcc_unreachable ();
1638 /* Emit a move from SRC to DEST. Assume that the move expanders can
1639 handle all moves if !can_create_pseudo_p (). The distinction is
1640 important because, unlike emit_move_insn, the move expanders know
1641 how to force Pmode objects into the constant pool even when the
1642 constant pool address is not itself legitimate. */
1643 static rtx
1644 aarch64_emit_move (rtx dest, rtx src)
1646 return (can_create_pseudo_p ()
1647 ? emit_move_insn (dest, src)
1648 : emit_move_insn_1 (dest, src));
1651 /* Split a 128-bit move operation into two 64-bit move operations,
1652 taking care to handle partial overlap of register to register
1653 copies. Special cases are needed when moving between GP regs and
1654 FP regs. SRC can be a register, constant or memory; DST a register
1655 or memory. If either operand is memory it must not have any side
1656 effects. */
1657 void
1658 aarch64_split_128bit_move (rtx dst, rtx src)
1660 rtx dst_lo, dst_hi;
1661 rtx src_lo, src_hi;
1663 machine_mode mode = GET_MODE (dst);
1665 gcc_assert (mode == TImode || mode == TFmode);
1666 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1667 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1669 if (REG_P (dst) && REG_P (src))
1671 int src_regno = REGNO (src);
1672 int dst_regno = REGNO (dst);
1674 /* Handle FP <-> GP regs. */
1675 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1677 src_lo = gen_lowpart (word_mode, src);
1678 src_hi = gen_highpart (word_mode, src);
1680 if (mode == TImode)
1682 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1683 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1685 else
1687 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1688 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1690 return;
1692 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1694 dst_lo = gen_lowpart (word_mode, dst);
1695 dst_hi = gen_highpart (word_mode, dst);
1697 if (mode == TImode)
1699 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1700 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1702 else
1704 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1705 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1707 return;
1711 dst_lo = gen_lowpart (word_mode, dst);
1712 dst_hi = gen_highpart (word_mode, dst);
1713 src_lo = gen_lowpart (word_mode, src);
1714 src_hi = gen_highpart_mode (word_mode, mode, src);
1716 /* At most one pairing may overlap. */
1717 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1719 aarch64_emit_move (dst_hi, src_hi);
1720 aarch64_emit_move (dst_lo, src_lo);
1722 else
1724 aarch64_emit_move (dst_lo, src_lo);
1725 aarch64_emit_move (dst_hi, src_hi);
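/* Worked example (illustrative, little-endian register pairing): copying a
   TImode value from the pair {x0, x1} to the pair {x1, x2} gives dst_lo == x1,
   which overlaps src_hi == x1, so the high halves are moved first
   (x2 <- x1, then x1 <- x0); in the opposite direction there is no such
   overlap and the low halves are moved first.  */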
1729 bool
1730 aarch64_split_128bit_move_p (rtx dst, rtx src)
1732 return (! REG_P (src)
1733 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1736 /* Split a complex SIMD combine. */
1738 void
1739 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1741 machine_mode src_mode = GET_MODE (src1);
1742 machine_mode dst_mode = GET_MODE (dst);
1744 gcc_assert (VECTOR_MODE_P (dst_mode));
1745 gcc_assert (register_operand (dst, dst_mode)
1746 && register_operand (src1, src_mode)
1747 && register_operand (src2, src_mode));
1749 rtx (*gen) (rtx, rtx, rtx);
1751 switch (src_mode)
1753 case E_V8QImode:
1754 gen = gen_aarch64_simd_combinev8qi;
1755 break;
1756 case E_V4HImode:
1757 gen = gen_aarch64_simd_combinev4hi;
1758 break;
1759 case E_V2SImode:
1760 gen = gen_aarch64_simd_combinev2si;
1761 break;
1762 case E_V4HFmode:
1763 gen = gen_aarch64_simd_combinev4hf;
1764 break;
1765 case E_V2SFmode:
1766 gen = gen_aarch64_simd_combinev2sf;
1767 break;
1768 case E_DImode:
1769 gen = gen_aarch64_simd_combinedi;
1770 break;
1771 case E_DFmode:
1772 gen = gen_aarch64_simd_combinedf;
1773 break;
1774 default:
1775 gcc_unreachable ();
1778 emit_insn (gen (dst, src1, src2));
1779 return;
1782 /* Split a complex SIMD move. */
1784 void
1785 aarch64_split_simd_move (rtx dst, rtx src)
1787 machine_mode src_mode = GET_MODE (src);
1788 machine_mode dst_mode = GET_MODE (dst);
1790 gcc_assert (VECTOR_MODE_P (dst_mode));
1792 if (REG_P (dst) && REG_P (src))
1794 rtx (*gen) (rtx, rtx);
1796 gcc_assert (VECTOR_MODE_P (src_mode));
1798 switch (src_mode)
1800 case E_V16QImode:
1801 gen = gen_aarch64_split_simd_movv16qi;
1802 break;
1803 case E_V8HImode:
1804 gen = gen_aarch64_split_simd_movv8hi;
1805 break;
1806 case E_V4SImode:
1807 gen = gen_aarch64_split_simd_movv4si;
1808 break;
1809 case E_V2DImode:
1810 gen = gen_aarch64_split_simd_movv2di;
1811 break;
1812 case E_V8HFmode:
1813 gen = gen_aarch64_split_simd_movv8hf;
1814 break;
1815 case E_V4SFmode:
1816 gen = gen_aarch64_split_simd_movv4sf;
1817 break;
1818 case E_V2DFmode:
1819 gen = gen_aarch64_split_simd_movv2df;
1820 break;
1821 default:
1822 gcc_unreachable ();
1825 emit_insn (gen (dst, src));
1826 return;
1830 bool
1831 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1832 machine_mode ymode, rtx y)
1834 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1835 gcc_assert (r != NULL);
1836 return rtx_equal_p (x, r);
1840 static rtx
1841 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1843 if (can_create_pseudo_p ())
1844 return force_reg (mode, value);
1845 else
1847 x = aarch64_emit_move (x, value);
1848 return x;
1853 static rtx
1854 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1855 HOST_WIDE_INT offset)
1857 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1859 rtx high;
1860 /* Load the full offset into a register. This
1861 might be improvable in the future. */
1862 high = GEN_INT (offset);
1863 offset = 0;
1864 high = aarch64_force_temporary (mode, temp, high);
1865 reg = aarch64_force_temporary (mode, temp,
1866 gen_rtx_PLUS (mode, high, reg));
1868 return plus_constant (mode, reg, offset);
1871 static int
1872 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1873 scalar_int_mode mode)
1875 int i;
1876 unsigned HOST_WIDE_INT val, val2, mask;
1877 int one_match, zero_match;
1878 int num_insns;
1880 val = INTVAL (imm);
1882 if (aarch64_move_imm (val, mode))
1884 if (generate)
1885 emit_insn (gen_rtx_SET (dest, imm));
1886 return 1;
1889 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1890 (with XXXX non-zero). In that case check to see if the move can be done in
1891 a smaller mode. */
1892 val2 = val & 0xffffffff;
1893 if (mode == DImode
1894 && aarch64_move_imm (val2, SImode)
1895 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1897 if (generate)
1898 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1900       /* Check whether we have to emit a second instruction: see whether any
1901          of the upper 32 bits of the original DImode value are set.  */
1902 if (val == val2)
1903 return 1;
1905 i = (val >> 48) ? 48 : 32;
1907 if (generate)
1908 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1909 GEN_INT ((val >> i) & 0xffff)));
1911 return 2;
1914 if ((val >> 32) == 0 || mode == SImode)
1916 if (generate)
1918 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1919 if (mode == SImode)
1920 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1921 GEN_INT ((val >> 16) & 0xffff)));
1922 else
1923 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1924 GEN_INT ((val >> 16) & 0xffff)));
1926 return 2;
1929 /* Remaining cases are all for DImode. */
1931 mask = 0xffff;
1932 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1933 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1934 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1935 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1937 if (zero_match != 2 && one_match != 2)
1939 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1940 For a 64-bit bitmask try whether changing 16 bits to all ones or
1941 zeroes creates a valid bitmask. To check any repeated bitmask,
1942 try using 16 bits from the other 32-bit half of val. */
1944 for (i = 0; i < 64; i += 16, mask <<= 16)
1946 val2 = val & ~mask;
1947 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1948 break;
1949 val2 = val | mask;
1950 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1951 break;
1952 val2 = val2 & ~mask;
1953 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1954 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1955 break;
1957 if (i != 64)
1959 if (generate)
1961 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1962 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1963 GEN_INT ((val >> i) & 0xffff)));
1965 return 2;
1969   /* Generate 2-4 instructions, skipping any 16-bit chunks of all zeroes or all
1970      ones that are already covered by the initial mov.  If one_match > zero_match,
1971      skip set bits, otherwise skip zero bits.  */
1973 num_insns = 1;
1974 mask = 0xffff;
1975 val2 = one_match > zero_match ? ~val : val;
1976 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1978 if (generate)
1979 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1980 ? (val | ~(mask << i))
1981 : (val & (mask << i)))));
1982 for (i += 16; i < 64; i += 16)
1984 if ((val2 & (mask << i)) == 0)
1985 continue;
1986 if (generate)
1987 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1988 GEN_INT ((val >> i) & 0xffff)));
1989 num_insns ++;
1992 return num_insns;
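/* Worked example (illustrative): for val == 0x0000123400005678 in DImode,
   the low 32 bits (0x5678) are a valid SImode move immediate and bits 48-63
   are clear, so the early exit above emits

       mov   x0, #0x5678
       movk  x0, #0x1234, lsl #32

   and returns 2 (the register name is arbitrary).  */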
1995 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1996 temporary value if necessary. FRAME_RELATED_P should be true if
1997 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1998 to the generated instructions. If SCRATCHREG is known to hold
1999 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2000 immediate again.
2002 Since this function may be used to adjust the stack pointer, we must
2003 ensure that it cannot cause transient stack deallocation (for example
2004 by first incrementing SP and then decrementing when adjusting by a
2005 large immediate). */
2007 static void
2008 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2009 int scratchreg, HOST_WIDE_INT delta,
2010 bool frame_related_p, bool emit_move_imm)
2012 HOST_WIDE_INT mdelta = abs_hwi (delta);
2013 rtx this_rtx = gen_rtx_REG (mode, regnum);
2014 rtx_insn *insn;
2016 if (!mdelta)
2017 return;
2019 /* Single instruction adjustment. */
2020 if (aarch64_uimm12_shift (mdelta))
2022 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2023 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024 return;
2027 /* Emit 2 additions/subtractions if the adjustment fits in 24 bits.
2028 Only do this if mdelta cannot be loaded with a single move immediate,
2029 as adjusting using a move is better in that case. */
2030 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2032 HOST_WIDE_INT low_off = mdelta & 0xfff;
2034 low_off = delta < 0 ? -low_off : low_off;
2035 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2036 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2037 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2038 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2039 return;
2042 /* Emit a move immediate if required and an addition/subtraction. */
2043 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2044 if (emit_move_imm)
2045 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2046 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2047 : gen_add2_insn (this_rtx, scratch_rtx));
2048 if (frame_related_p)
2050 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2051 rtx adj = plus_constant (mode, this_rtx, delta);
2052 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
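/* Editor's note: illustrative examples of the three paths above, as used by
   aarch64_sub_sp with SP and x16 (IP0) as the scratch register; these are
   sketches, not verbatim compiler output.

       sub  sp, sp, 16               // delta fits a 12-bit immediate

       sub  sp, sp, 0x345            // delta 0x12345: two-instruction path,
       sub  sp, sp, 0x12000          // low 12 bits then the shifted rest

       mov  x16, 0x2000000           // delta >= 1 << 24: move immediate to
       sub  sp, sp, x16              // the scratch register, then subtract  */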
2056 static inline void
2057 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2058 HOST_WIDE_INT delta)
2060 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2063 static inline void
2064 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2066 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2067 true, emit_move_imm);
2070 static inline void
2071 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2073 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2074 frame_related_p, true);
2077 void
2078 aarch64_expand_mov_immediate (rtx dest, rtx imm)
2080 machine_mode mode = GET_MODE (dest);
2082 gcc_assert (mode == SImode || mode == DImode);
2084 /* Check on what type of symbol it is. */
2085 scalar_int_mode int_mode;
2086 if ((GET_CODE (imm) == SYMBOL_REF
2087 || GET_CODE (imm) == LABEL_REF
2088 || GET_CODE (imm) == CONST)
2089 && is_a <scalar_int_mode> (mode, &int_mode))
2091 rtx mem, base, offset;
2092 enum aarch64_symbol_type sty;
2094 /* If we have (const (plus symbol offset)), separate out the offset
2095 before we start classifying the symbol. */
2096 split_const (imm, &base, &offset);
2098 sty = aarch64_classify_symbol (base, offset);
2099 switch (sty)
2101 case SYMBOL_FORCE_TO_MEM:
2102 if (offset != const0_rtx
2103 && targetm.cannot_force_const_mem (int_mode, imm))
2105 gcc_assert (can_create_pseudo_p ());
2106 base = aarch64_force_temporary (int_mode, dest, base);
2107 base = aarch64_add_offset (int_mode, NULL, base,
2108 INTVAL (offset));
2109 aarch64_emit_move (dest, base);
2110 return;
2113 mem = force_const_mem (ptr_mode, imm);
2114 gcc_assert (mem);
2116 /* If we aren't generating PC relative literals, then
2117 we need to expand the literal pool access carefully.
2118 This is something that needs to be done in a number
2119 of places, so could well live as a separate function. */
2120 if (!aarch64_pcrelative_literal_loads)
2122 gcc_assert (can_create_pseudo_p ());
2123 base = gen_reg_rtx (ptr_mode);
2124 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2125 if (ptr_mode != Pmode)
2126 base = convert_memory_address (Pmode, base);
2127 mem = gen_rtx_MEM (ptr_mode, base);
2130 if (int_mode != ptr_mode)
2131 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2133 emit_insn (gen_rtx_SET (dest, mem));
2135 return;
2137 case SYMBOL_SMALL_TLSGD:
2138 case SYMBOL_SMALL_TLSDESC:
2139 case SYMBOL_SMALL_TLSIE:
2140 case SYMBOL_SMALL_GOT_28K:
2141 case SYMBOL_SMALL_GOT_4G:
2142 case SYMBOL_TINY_GOT:
2143 case SYMBOL_TINY_TLSIE:
2144 if (offset != const0_rtx)
2146 gcc_assert (can_create_pseudo_p ());
2147 base = aarch64_force_temporary (int_mode, dest, base);
2148 base = aarch64_add_offset (int_mode, NULL, base,
2149 INTVAL (offset));
2150 aarch64_emit_move (dest, base);
2151 return;
2153 /* FALLTHRU */
2155 case SYMBOL_SMALL_ABSOLUTE:
2156 case SYMBOL_TINY_ABSOLUTE:
2157 case SYMBOL_TLSLE12:
2158 case SYMBOL_TLSLE24:
2159 case SYMBOL_TLSLE32:
2160 case SYMBOL_TLSLE48:
2161 aarch64_load_symref_appropriately (dest, imm, sty);
2162 return;
2164 default:
2165 gcc_unreachable ();
2169 if (!CONST_INT_P (imm))
2171 if (GET_CODE (imm) == HIGH)
2172 emit_insn (gen_rtx_SET (dest, imm));
2173 else
2175 rtx mem = force_const_mem (mode, imm);
2176 gcc_assert (mem);
2177 emit_insn (gen_rtx_SET (dest, mem));
2180 return;
2183 aarch64_internal_mov_immediate (dest, imm, true,
2184 as_a <scalar_int_mode> (mode));
2187 static bool
2188 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2189 tree exp ATTRIBUTE_UNUSED)
2191 /* Currently, always true. */
2192 return true;
2195 /* Implement TARGET_PASS_BY_REFERENCE. */
2197 static bool
2198 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2199 machine_mode mode,
2200 const_tree type,
2201 bool named ATTRIBUTE_UNUSED)
2203 HOST_WIDE_INT size;
2204 machine_mode dummymode;
2205 int nregs;
2207 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2208 size = (mode == BLKmode && type)
2209 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2211 /* Aggregates are passed by reference based on their size. */
2212 if (type && AGGREGATE_TYPE_P (type))
2214 size = int_size_in_bytes (type);
2217 /* Variable sized arguments are always passed by reference. */
2218 if (size < 0)
2219 return true;
2221 /* Can this be a candidate to be passed in fp/simd register(s)? */
2222 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2223 &dummymode, &nregs,
2224 NULL))
2225 return false;
2227 /* Arguments which are variable sized or larger than 2 registers are
2228 passed by reference unless they are a homogeneous floating-point
2229 aggregate. */
2230 return size > 2 * UNITS_PER_WORD;
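/* Editor's note: illustrative AAPCS64 examples (not from the original
   source).  A 12-byte struct of three ints is passed by value, since it
   fits in two GPRs; a 24-byte plain struct is passed by reference, i.e.
   the caller makes a copy and passes its address; a 32-byte struct of four
   doubles is an HFA and is still passed by value in d0-d3.  */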
2233 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2234 static bool
2235 aarch64_return_in_msb (const_tree valtype)
2237 machine_mode dummy_mode;
2238 int dummy_int;
2240 /* Never happens in little-endian mode. */
2241 if (!BYTES_BIG_ENDIAN)
2242 return false;
2244 /* Only composite types of 16 bytes or less can potentially be
2245 returned in registers. */
2246 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2247 || int_size_in_bytes (valtype) <= 0
2248 || int_size_in_bytes (valtype) > 16)
2249 return false;
2251 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2252 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2253 is always passed/returned in the least significant bits of fp/simd
2254 register(s). */
2255 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2256 &dummy_mode, &dummy_int, NULL))
2257 return false;
2259 return true;
2262 /* Implement TARGET_FUNCTION_VALUE.
2263 Define how to find the value returned by a function. */
2265 static rtx
2266 aarch64_function_value (const_tree type, const_tree func,
2267 bool outgoing ATTRIBUTE_UNUSED)
2269 machine_mode mode;
2270 int unsignedp;
2271 int count;
2272 machine_mode ag_mode;
2274 mode = TYPE_MODE (type);
2275 if (INTEGRAL_TYPE_P (type))
2276 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2278 if (aarch64_return_in_msb (type))
2280 HOST_WIDE_INT size = int_size_in_bytes (type);
2282 if (size % UNITS_PER_WORD != 0)
2284 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2285 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2289 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2290 &ag_mode, &count, NULL))
2292 if (!aarch64_composite_type_p (type, mode))
2294 gcc_assert (count == 1 && mode == ag_mode);
2295 return gen_rtx_REG (mode, V0_REGNUM);
2297 else
2299 int i;
2300 rtx par;
2302 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2303 for (i = 0; i < count; i++)
2305 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2306 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2307 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2308 XVECEXP (par, 0, i) = tmp;
2310 return par;
2313 else
2314 return gen_rtx_REG (mode, R0_REGNUM);
2317 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2318 Return true if REGNO is the number of a hard register in which the values
2319 of called function may come back. */
2321 static bool
2322 aarch64_function_value_regno_p (const unsigned int regno)
2324 /* Maximum of 16 bytes can be returned in the general registers. Examples
2325 of 16-byte return values are: 128-bit integers and 16-byte small
2326 structures (excluding homogeneous floating-point aggregates). */
2327 if (regno == R0_REGNUM || regno == R1_REGNUM)
2328 return true;
2330 /* Up to four fp/simd registers can return a function value, e.g. a
2331 homogeneous floating-point aggregate having four members. */
2332 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2333 return TARGET_FLOAT;
2335 return false;
2338 /* Implement TARGET_RETURN_IN_MEMORY.
2340 If the type T of the result of a function is such that
2341 void func (T arg)
2342 would require that arg be passed as a value in a register (or set of
2343 registers) according to the parameter passing rules, then the result
2344 is returned in the same registers as would be used for such an
2345 argument. */
2347 static bool
2348 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2350 HOST_WIDE_INT size;
2351 machine_mode ag_mode;
2352 int count;
2354 if (!AGGREGATE_TYPE_P (type)
2355 && TREE_CODE (type) != COMPLEX_TYPE
2356 && TREE_CODE (type) != VECTOR_TYPE)
2357 /* Simple scalar types are always returned in registers. */
2358 return false;
2360 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2361 type,
2362 &ag_mode,
2363 &count,
2364 NULL))
2365 return false;
2367 /* Types larger than 2 registers are returned in memory. */
2368 size = int_size_in_bytes (type);
2369 return (size < 0 || size > 2 * UNITS_PER_WORD);
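/* Editor's note: illustrative examples (not from the original source).
   __int128 and a 16-byte plain struct are returned in x0/x1; a 24-byte
   struct is returned in memory, with the caller passing the result address
   in x8 as required by AAPCS64; an HFA of four doubles is returned in
   d0-d3.  */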
2372 static bool
2373 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2374 const_tree type, int *nregs)
2376 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2377 return aarch64_vfp_is_call_or_return_candidate (mode,
2378 type,
2379 &pcum->aapcs_vfp_rmode,
2380 nregs,
2381 NULL);
2384 /* Given MODE and TYPE of a function argument, return the alignment in
2385 bits. The idea is to suppress any stronger alignment requested by
2386 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2387 This is a helper function for local use only. */
2389 static unsigned int
2390 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2392 if (!type)
2393 return GET_MODE_ALIGNMENT (mode);
2395 if (integer_zerop (TYPE_SIZE (type)))
2396 return 0;
2398 gcc_assert (TYPE_MODE (type) == mode);
2400 if (!AGGREGATE_TYPE_P (type))
2401 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2403 if (TREE_CODE (type) == ARRAY_TYPE)
2404 return TYPE_ALIGN (TREE_TYPE (type));
2406 unsigned int alignment = 0;
2407 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2408 if (TREE_CODE (field) == FIELD_DECL)
2409 alignment = std::max (alignment, DECL_ALIGN (field));
2411 return alignment;
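/* Editor's note: illustrative examples (not from the original source).
   For __int128 this returns 128 bits; for struct { int a; int b; } it
   returns 32 bits (the largest field alignment); for
   struct { double d; int i; } it returns 64 bits.  */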
2414 /* Layout a function argument according to the AAPCS64 rules. The rule
2415 numbers refer to the rule numbers in the AAPCS64. */
2417 static void
2418 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2419 const_tree type,
2420 bool named ATTRIBUTE_UNUSED)
2422 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2423 int ncrn, nvrn, nregs;
2424 bool allocate_ncrn, allocate_nvrn;
2425 HOST_WIDE_INT size;
2427 /* We need to do this once per argument. */
2428 if (pcum->aapcs_arg_processed)
2429 return;
2431 pcum->aapcs_arg_processed = true;
2433 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2434 size
2435 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2436 UNITS_PER_WORD);
2438 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2439 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2440 mode,
2441 type,
2442 &nregs);
2444 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2445 The following code thus handles passing by SIMD/FP registers first. */
2447 nvrn = pcum->aapcs_nvrn;
2449 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2450 and homogeneous short-vector aggregates (HVA). */
2451 if (allocate_nvrn)
2453 if (!TARGET_FLOAT)
2454 aarch64_err_no_fpadvsimd (mode, "argument");
2456 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2458 pcum->aapcs_nextnvrn = nvrn + nregs;
2459 if (!aarch64_composite_type_p (type, mode))
2461 gcc_assert (nregs == 1);
2462 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2464 else
2466 rtx par;
2467 int i;
2468 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2469 for (i = 0; i < nregs; i++)
2471 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2472 V0_REGNUM + nvrn + i);
2473 tmp = gen_rtx_EXPR_LIST
2474 (VOIDmode, tmp,
2475 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2476 XVECEXP (par, 0, i) = tmp;
2478 pcum->aapcs_reg = par;
2480 return;
2482 else
2484 /* C.3 NSRN is set to 8. */
2485 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2486 goto on_stack;
2490 ncrn = pcum->aapcs_ncrn;
2491 nregs = size / UNITS_PER_WORD;
2493 /* C6 - C9, though the sign and zero extension semantics are
2494 handled elsewhere. This is the case where the argument fits
2495 entirely in general registers. */
2496 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2499 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2501 /* C.8 if the argument has an alignment of 16 then the NGRN is
2502 rounded up to the next even number. */
2503 if (nregs == 2
2504 && ncrn % 2
2505 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2506 comparison is there because for > 16 * BITS_PER_UNIT
2507 alignment nregs should be > 2 and therefore it should be
2508 passed by reference rather than value. */
2509 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2511 ++ncrn;
2512 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2515 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2516 A reg is still generated for it, but the caller should be smart
2517 enough not to use it. */
2518 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2519 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2520 else
2522 rtx par;
2523 int i;
2525 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2526 for (i = 0; i < nregs; i++)
2528 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2529 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2530 GEN_INT (i * UNITS_PER_WORD));
2531 XVECEXP (par, 0, i) = tmp;
2533 pcum->aapcs_reg = par;
2536 pcum->aapcs_nextncrn = ncrn + nregs;
2537 return;
2540 /* C.11 */
2541 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2543 /* The argument is passed on stack; record the needed number of words for
2544 this argument and align the total size if necessary. */
2545 on_stack:
2546 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2548 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2549 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2550 16 / UNITS_PER_WORD);
2551 return;
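/* Editor's note: an illustrative application of the rules above (not from
   the original source).  For void f (int a, double b, __int128 c, int d):
   a goes in w0, b in d0; c needs two GPRs and has 16-byte alignment, so
   rule C.8 rounds the NGRN from 1 up to 2 and c is passed in x2/x3; d then
   goes in w4.  */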
2554 /* Implement TARGET_FUNCTION_ARG. */
2556 static rtx
2557 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2558 const_tree type, bool named)
2560 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2561 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2563 if (mode == VOIDmode)
2564 return NULL_RTX;
2566 aarch64_layout_arg (pcum_v, mode, type, named);
2567 return pcum->aapcs_reg;
2570 void
2571 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2572 const_tree fntype ATTRIBUTE_UNUSED,
2573 rtx libname ATTRIBUTE_UNUSED,
2574 const_tree fndecl ATTRIBUTE_UNUSED,
2575 unsigned n_named ATTRIBUTE_UNUSED)
2577 pcum->aapcs_ncrn = 0;
2578 pcum->aapcs_nvrn = 0;
2579 pcum->aapcs_nextncrn = 0;
2580 pcum->aapcs_nextnvrn = 0;
2581 pcum->pcs_variant = ARM_PCS_AAPCS64;
2582 pcum->aapcs_reg = NULL_RTX;
2583 pcum->aapcs_arg_processed = false;
2584 pcum->aapcs_stack_words = 0;
2585 pcum->aapcs_stack_size = 0;
2587 if (!TARGET_FLOAT
2588 && fndecl && TREE_PUBLIC (fndecl)
2589 && fntype && fntype != error_mark_node)
2591 const_tree type = TREE_TYPE (fntype);
2592 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2593 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2594 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2595 &mode, &nregs, NULL))
2596 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2598 return;
2601 static void
2602 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2603 machine_mode mode,
2604 const_tree type,
2605 bool named)
2607 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2608 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2610 aarch64_layout_arg (pcum_v, mode, type, named);
2611 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2612 != (pcum->aapcs_stack_words != 0));
2613 pcum->aapcs_arg_processed = false;
2614 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2615 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2616 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2617 pcum->aapcs_stack_words = 0;
2618 pcum->aapcs_reg = NULL_RTX;
2622 bool
2623 aarch64_function_arg_regno_p (unsigned regno)
2625 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2626 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2629 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2630 PARM_BOUNDARY bits of alignment, but will be given anything up
2631 to STACK_BOUNDARY bits if the type requires it. This makes sure
2632 that both before and after the layout of each argument, the Next
2633 Stacked Argument Address (NSAA) will have a minimum alignment of
2634 8 bytes. */
2636 static unsigned int
2637 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2639 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2640 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2643 /* Implement TARGET_FUNCTION_ARG_PADDING.
2645 Small aggregate types are placed in the lowest memory address.
2647 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2649 static pad_direction
2650 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2652 /* On little-endian targets, the least significant byte of every stack
2653 argument is passed at the lowest byte address of the stack slot. */
2654 if (!BYTES_BIG_ENDIAN)
2655 return PAD_UPWARD;
2657 /* Otherwise, integral, floating-point and pointer types are padded downward:
2658 the least significant byte of a stack argument is passed at the highest
2659 byte address of the stack slot. */
2660 if (type
2661 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2662 || POINTER_TYPE_P (type))
2663 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2664 return PAD_DOWNWARD;
2666 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2667 return PAD_UPWARD;
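/* Editor's note: illustrative examples (not from the original source).
   On big-endian (aarch64_be) a short passed on the stack is padded
   downward, so its two data bytes occupy the highest-addressed bytes of
   the 8-byte slot, while a two-byte struct is padded upward and sits at
   the lowest address of its slot.  On little-endian everything is
   PAD_UPWARD.  */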
2670 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2672 It specifies padding for the last (possibly the only)
2673 element of a block move between registers and memory.
2674 Assuming the block is in memory, padding upward means that
2675 the last element is padded after its most significant byte,
2676 while with downward padding the last element is padded on
2677 its least significant byte side.
2679 Small aggregates and small complex types are always padded
2680 upwards.
2682 We don't need to worry about homogeneous floating-point or
2683 short-vector aggregates; their move is not affected by the
2684 padding direction determined here. Regardless of endianness,
2685 each element of such an aggregate is put in the least
2686 significant bits of a fp/simd register.
2688 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2689 register has useful data, and return the opposite if the most
2690 significant byte does. */
2692 bool
2693 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2694 bool first ATTRIBUTE_UNUSED)
2697 /* Small composite types are always padded upward. */
2698 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2700 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2701 : GET_MODE_SIZE (mode));
2702 if (size < 2 * UNITS_PER_WORD)
2703 return true;
2706 /* Otherwise, use the default padding. */
2707 return !BYTES_BIG_ENDIAN;
2710 static scalar_int_mode
2711 aarch64_libgcc_cmp_return_mode (void)
2713 return SImode;
2716 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2718 /* We use the 12-bit shifted immediate arithmetic instructions so values
2719 must be multiple of (1 << 12), i.e. 4096. */
2720 #define ARITH_FACTOR 4096
2722 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2723 #error Cannot use simple address calculation for stack probing
2724 #endif
2726 /* The pair of scratch registers used for stack probing. */
2727 #define PROBE_STACK_FIRST_REG 9
2728 #define PROBE_STACK_SECOND_REG 10
2730 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2731 inclusive. These are offsets from the current stack pointer. */
2733 static void
2734 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2736 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2738 /* See the same assertion on PROBE_INTERVAL above. */
2739 gcc_assert ((first % ARITH_FACTOR) == 0);
2741 /* See if we have a constant small number of probes to generate. If so,
2742 that's the easy case. */
2743 if (size <= PROBE_INTERVAL)
2745 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2747 emit_set_insn (reg1,
2748 plus_constant (Pmode,
2749 stack_pointer_rtx, -(first + base)));
2750 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2753 /* The run-time loop is made up of 8 insns in the generic case while the
2754 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2755 else if (size <= 4 * PROBE_INTERVAL)
2757 HOST_WIDE_INT i, rem;
2759 emit_set_insn (reg1,
2760 plus_constant (Pmode,
2761 stack_pointer_rtx,
2762 -(first + PROBE_INTERVAL)));
2763 emit_stack_probe (reg1);
2765 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2766 it exceeds SIZE. If only two probes are needed, this will not
2767 generate any code. Then probe at FIRST + SIZE. */
2768 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2770 emit_set_insn (reg1,
2771 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2772 emit_stack_probe (reg1);
2775 rem = size - (i - PROBE_INTERVAL);
2776 if (rem > 256)
2778 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2780 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2781 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2783 else
2784 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2787 /* Otherwise, do the same as above, but in a loop. Note that we must be
2788 extra careful with variables wrapping around because we might be at
2789 the very top (or the very bottom) of the address space and we have
2790 to be able to handle this case properly; in particular, we use an
2791 equality test for the loop condition. */
2792 else
2794 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2796 /* Step 1: round SIZE to the previous multiple of the interval. */
2798 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2801 /* Step 2: compute initial and final value of the loop counter. */
2803 /* TEST_ADDR = SP + FIRST. */
2804 emit_set_insn (reg1,
2805 plus_constant (Pmode, stack_pointer_rtx, -first));
2807 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2808 HOST_WIDE_INT adjustment = - (first + rounded_size);
2809 if (! aarch64_uimm12_shift (adjustment))
2811 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2812 true, Pmode);
2813 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2815 else
2817 emit_set_insn (reg2,
2818 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2821 /* Step 3: the loop
2825 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2826 probe at TEST_ADDR
2828 while (TEST_ADDR != LAST_ADDR)
2830 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2831 until it is equal to ROUNDED_SIZE. */
2833 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2836 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2837 that SIZE is equal to ROUNDED_SIZE. */
2839 if (size != rounded_size)
2841 HOST_WIDE_INT rem = size - rounded_size;
2843 if (rem > 256)
2845 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2847 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2848 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2850 else
2851 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2855 /* Make sure nothing is scheduled before we are done. */
2856 emit_insn (gen_blockage ());
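/* Editor's note: an illustrative example (not from the original source),
   assuming the default PROBE_INTERVAL of 4096.  A call with FIRST == 16384
   and SIZE == 8192 takes the second branch above and emits two probes,
   stores of xzr at SP - 20480 and SP - 24576, using x9 as the scratch
   register.  */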
2859 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2860 absolute addresses. */
2862 const char *
2863 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2865 static int labelno = 0;
2866 char loop_lab[32];
2867 rtx xops[2];
2869 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2871 /* Loop. */
2872 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2874 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2875 xops[0] = reg1;
2876 xops[1] = GEN_INT (PROBE_INTERVAL);
2877 output_asm_insn ("sub\t%0, %0, %1", xops);
2879 /* Probe at TEST_ADDR. */
2880 output_asm_insn ("str\txzr, [%0]", xops);
2882 /* Test if TEST_ADDR == LAST_ADDR. */
2883 xops[1] = reg2;
2884 output_asm_insn ("cmp\t%0, %1", xops);
2886 /* Branch. */
2887 fputs ("\tb.ne\t", asm_out_file);
2888 assemble_name_raw (asm_out_file, loop_lab);
2889 fputc ('\n', asm_out_file);
2891 return "";
2894 /* Mark the registers that need to be saved by the callee and calculate
2895 the size of the callee-saved registers area and frame record (both FP
2896 and LR may be omitted). */
2897 static void
2898 aarch64_layout_frame (void)
2900 HOST_WIDE_INT offset = 0;
2901 int regno, last_fp_reg = INVALID_REGNUM;
2903 if (reload_completed && cfun->machine->frame.laid_out)
2904 return;
2906 /* Force a frame chain for EH returns so the return address is at FP+8. */
2907 cfun->machine->frame.emit_frame_chain
2908 = frame_pointer_needed || crtl->calls_eh_return;
2910 /* Emit a frame chain if the frame pointer is enabled.
2911 If -momit-leaf-frame-pointer is used, do not use a frame chain
2912 in leaf functions which do not use LR. */
2913 if (flag_omit_frame_pointer == 2
2914 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
2915 && !df_regs_ever_live_p (LR_REGNUM)))
2916 cfun->machine->frame.emit_frame_chain = true;
2918 #define SLOT_NOT_REQUIRED (-2)
2919 #define SLOT_REQUIRED (-1)
2921 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2922 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2924 /* First mark all the registers that really need to be saved... */
2925 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2926 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2928 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2929 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2931 /* ... that includes the eh data registers (if needed)... */
2932 if (crtl->calls_eh_return)
2933 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2934 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2935 = SLOT_REQUIRED;
2937 /* ... and any callee saved register that dataflow says is live. */
2938 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2939 if (df_regs_ever_live_p (regno)
2940 && (regno == R30_REGNUM
2941 || !call_used_regs[regno]))
2942 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2944 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2945 if (df_regs_ever_live_p (regno)
2946 && !call_used_regs[regno])
2948 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2949 last_fp_reg = regno;
2952 if (cfun->machine->frame.emit_frame_chain)
2954 /* FP and LR are placed in the linkage record. */
2955 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2956 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2957 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2958 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2959 offset = 2 * UNITS_PER_WORD;
2962 /* Now assign stack slots for them. */
2963 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2964 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2966 cfun->machine->frame.reg_offset[regno] = offset;
2967 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2968 cfun->machine->frame.wb_candidate1 = regno;
2969 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2970 cfun->machine->frame.wb_candidate2 = regno;
2971 offset += UNITS_PER_WORD;
2974 HOST_WIDE_INT max_int_offset = offset;
2975 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2976 bool has_align_gap = offset != max_int_offset;
2978 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2979 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2981 /* If there is an alignment gap between integer and fp callee-saves,
2982 allocate the last fp register to it if possible. */
2983 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2985 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2986 break;
2989 cfun->machine->frame.reg_offset[regno] = offset;
2990 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2991 cfun->machine->frame.wb_candidate1 = regno;
2992 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2993 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2994 cfun->machine->frame.wb_candidate2 = regno;
2995 offset += UNITS_PER_WORD;
2998 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3000 cfun->machine->frame.saved_regs_size = offset;
3002 HOST_WIDE_INT varargs_and_saved_regs_size
3003 = offset + cfun->machine->frame.saved_varargs_size;
3005 cfun->machine->frame.hard_fp_offset
3006 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
3007 STACK_BOUNDARY / BITS_PER_UNIT);
3009 cfun->machine->frame.frame_size
3010 = ROUND_UP (cfun->machine->frame.hard_fp_offset
3011 + crtl->outgoing_args_size,
3012 STACK_BOUNDARY / BITS_PER_UNIT);
3014 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3016 cfun->machine->frame.initial_adjust = 0;
3017 cfun->machine->frame.final_adjust = 0;
3018 cfun->machine->frame.callee_adjust = 0;
3019 cfun->machine->frame.callee_offset = 0;
3021 HOST_WIDE_INT max_push_offset = 0;
3022 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3023 max_push_offset = 512;
3024 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3025 max_push_offset = 256;
3027 if (cfun->machine->frame.frame_size < max_push_offset
3028 && crtl->outgoing_args_size == 0)
3030 /* Simple, small frame with no outgoing arguments:
3031 stp reg1, reg2, [sp, -frame_size]!
3032 stp reg3, reg4, [sp, 16] */
3033 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3035 else if ((crtl->outgoing_args_size
3036 + cfun->machine->frame.saved_regs_size < 512)
3037 && !(cfun->calls_alloca
3038 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3040 /* Frame with small outgoing arguments:
3041 sub sp, sp, frame_size
3042 stp reg1, reg2, [sp, outgoing_args_size]
3043 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3044 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3045 cfun->machine->frame.callee_offset
3046 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3048 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3050 /* Frame with large outgoing arguments but a small local area:
3051 stp reg1, reg2, [sp, -hard_fp_offset]!
3052 stp reg3, reg4, [sp, 16]
3053 sub sp, sp, outgoing_args_size */
3054 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3055 cfun->machine->frame.final_adjust
3056 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3058 else
3060 /* Frame with large local area and outgoing arguments using frame pointer:
3061 sub sp, sp, hard_fp_offset
3062 stp x29, x30, [sp, 0]
3063 add x29, sp, 0
3064 stp reg3, reg4, [sp, 16]
3065 sub sp, sp, outgoing_args_size */
3066 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3067 cfun->machine->frame.final_adjust
3068 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3071 cfun->machine->frame.laid_out = true;
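/* Editor's note: an illustrative example (not from the original source).
   A function that needs a frame chain, saves only x29, x30 and x19, and
   has 16 bytes of locals and no outgoing arguments gets
   saved_regs_size == 32 and hard_fp_offset == frame_size == 48.  Since 48
   is below max_push_offset (512) and there are no outgoing arguments, the
   first case applies: callee_adjust == 48 and the prologue can allocate
   the whole frame with a single "stp x29, x30, [sp, -48]!".  */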
3074 /* Return true if the register REGNO is saved on entry to
3075 the current function. */
3077 static bool
3078 aarch64_register_saved_on_entry (int regno)
3080 return cfun->machine->frame.reg_offset[regno] >= 0;
3083 /* Return the next register, from REGNO up to LIMIT, that the callee
3084 needs to save. */
3086 static unsigned
3087 aarch64_next_callee_save (unsigned regno, unsigned limit)
3089 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3090 regno ++;
3091 return regno;
3094 /* Push the register number REGNO of mode MODE to the stack with write-back
3095 adjusting the stack by ADJUSTMENT. */
3097 static void
3098 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3099 HOST_WIDE_INT adjustment)
3101 rtx base_rtx = stack_pointer_rtx;
3102 rtx insn, reg, mem;
3104 reg = gen_rtx_REG (mode, regno);
3105 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3106 plus_constant (Pmode, base_rtx, -adjustment));
3107 mem = gen_frame_mem (mode, mem);
3109 insn = emit_move_insn (mem, reg);
3110 RTX_FRAME_RELATED_P (insn) = 1;
3113 /* Generate and return an instruction to store the pair of registers
3114 REG and REG2 of mode MODE to location BASE with write-back adjusting
3115 the stack location BASE by ADJUSTMENT. */
3117 static rtx
3118 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3119 HOST_WIDE_INT adjustment)
3121 switch (mode)
3123 case E_DImode:
3124 return gen_storewb_pairdi_di (base, base, reg, reg2,
3125 GEN_INT (-adjustment),
3126 GEN_INT (UNITS_PER_WORD - adjustment));
3127 case E_DFmode:
3128 return gen_storewb_pairdf_di (base, base, reg, reg2,
3129 GEN_INT (-adjustment),
3130 GEN_INT (UNITS_PER_WORD - adjustment));
3131 default:
3132 gcc_unreachable ();
3136 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3137 stack pointer by ADJUSTMENT. */
3139 static void
3140 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3142 rtx_insn *insn;
3143 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3145 if (regno2 == INVALID_REGNUM)
3146 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3148 rtx reg1 = gen_rtx_REG (mode, regno1);
3149 rtx reg2 = gen_rtx_REG (mode, regno2);
3151 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3152 reg2, adjustment));
3153 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3154 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3155 RTX_FRAME_RELATED_P (insn) = 1;
3158 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
3159 adjusting it by ADJUSTMENT afterwards. */
3161 static rtx
3162 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3163 HOST_WIDE_INT adjustment)
3165 switch (mode)
3167 case E_DImode:
3168 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3169 GEN_INT (UNITS_PER_WORD));
3170 case E_DFmode:
3171 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3172 GEN_INT (UNITS_PER_WORD));
3173 default:
3174 gcc_unreachable ();
3178 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3179 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3180 into CFI_OPS. */
3182 static void
3183 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3184 rtx *cfi_ops)
3186 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3187 rtx reg1 = gen_rtx_REG (mode, regno1);
3189 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3191 if (regno2 == INVALID_REGNUM)
3193 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3194 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3195 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3197 else
3199 rtx reg2 = gen_rtx_REG (mode, regno2);
3200 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3201 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3202 reg2, adjustment));
3206 /* Generate and return a store pair instruction of mode MODE to store
3207 register REG1 to MEM1 and register REG2 to MEM2. */
3209 static rtx
3210 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3211 rtx reg2)
3213 switch (mode)
3215 case E_DImode:
3216 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3218 case E_DFmode:
3219 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3221 default:
3222 gcc_unreachable ();
3226 /* Generate and return a load pair instruction of mode MODE to load register
3227 REG1 from MEM1 and register REG2 from MEM2. */
3229 static rtx
3230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3231 rtx mem2)
3233 switch (mode)
3235 case E_DImode:
3236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3238 case E_DFmode:
3239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3241 default:
3242 gcc_unreachable ();
3246 /* Return TRUE if return address signing should be enabled for the current
3247 function, otherwise return FALSE. */
3249 bool
3250 aarch64_return_address_signing_enabled (void)
3252 /* This function should only be called after the frame is laid out. */
3253 gcc_assert (cfun->machine->frame.laid_out);
3255 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3256 if its LR is pushed onto the stack. */
3257 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3258 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3259 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3262 /* Emit code to save the callee-saved registers from register number START
3263 to LIMIT to the stack at the location starting at offset START_OFFSET,
3264 skipping any write-back candidates if SKIP_WB is true. */
3266 static void
3267 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3268 unsigned start, unsigned limit, bool skip_wb)
3270 rtx_insn *insn;
3271 unsigned regno;
3272 unsigned regno2;
3274 for (regno = aarch64_next_callee_save (start, limit);
3275 regno <= limit;
3276 regno = aarch64_next_callee_save (regno + 1, limit))
3278 rtx reg, mem;
3279 HOST_WIDE_INT offset;
3281 if (skip_wb
3282 && (regno == cfun->machine->frame.wb_candidate1
3283 || regno == cfun->machine->frame.wb_candidate2))
3284 continue;
3286 if (cfun->machine->reg_is_wrapped_separately[regno])
3287 continue;
3289 reg = gen_rtx_REG (mode, regno);
3290 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3291 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3292 offset));
3294 regno2 = aarch64_next_callee_save (regno + 1, limit);
3296 if (regno2 <= limit
3297 && !cfun->machine->reg_is_wrapped_separately[regno2]
3298 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3299 == cfun->machine->frame.reg_offset[regno2]))
3302 rtx reg2 = gen_rtx_REG (mode, regno2);
3303 rtx mem2;
3305 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3306 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3307 offset));
3308 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3309 reg2));
3311 /* The first part of a frame-related parallel insn is
3312 always assumed to be relevant to the frame
3313 calculations; subsequent parts are only
3314 frame-related if explicitly marked. */
3315 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3316 regno = regno2;
3318 else
3319 insn = emit_move_insn (mem, reg);
3321 RTX_FRAME_RELATED_P (insn) = 1;
3325 /* Emit code to restore the callee registers of mode MODE from register
3326 number START up to and including LIMIT. Restore from the stack offset
3327 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3328 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3330 static void
3331 aarch64_restore_callee_saves (machine_mode mode,
3332 HOST_WIDE_INT start_offset, unsigned start,
3333 unsigned limit, bool skip_wb, rtx *cfi_ops)
3335 rtx base_rtx = stack_pointer_rtx;
3336 unsigned regno;
3337 unsigned regno2;
3338 HOST_WIDE_INT offset;
3340 for (regno = aarch64_next_callee_save (start, limit);
3341 regno <= limit;
3342 regno = aarch64_next_callee_save (regno + 1, limit))
3344 if (cfun->machine->reg_is_wrapped_separately[regno])
3345 continue;
3347 rtx reg, mem;
3349 if (skip_wb
3350 && (regno == cfun->machine->frame.wb_candidate1
3351 || regno == cfun->machine->frame.wb_candidate2))
3352 continue;
3354 reg = gen_rtx_REG (mode, regno);
3355 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3356 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3358 regno2 = aarch64_next_callee_save (regno + 1, limit);
3360 if (regno2 <= limit
3361 && !cfun->machine->reg_is_wrapped_separately[regno2]
3362 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3363 == cfun->machine->frame.reg_offset[regno2]))
3365 rtx reg2 = gen_rtx_REG (mode, regno2);
3366 rtx mem2;
3368 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3369 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3370 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3372 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3373 regno = regno2;
3375 else
3376 emit_move_insn (reg, mem);
3377 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3381 static inline bool
3382 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3383 HOST_WIDE_INT offset)
3385 return offset >= -256 && offset < 256;
3388 static inline bool
3389 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3391 return (offset >= 0
3392 && offset < 4096 * GET_MODE_SIZE (mode)
3393 && offset % GET_MODE_SIZE (mode) == 0);
3396 bool
3397 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3399 return (offset >= -64 * GET_MODE_SIZE (mode)
3400 && offset < 64 * GET_MODE_SIZE (mode)
3401 && offset % GET_MODE_SIZE (mode) == 0);
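/* Editor's note (not from the original source): for DImode the three
   predicates above accept, respectively, offsets in [-256, 255] (the
   LDUR/STUR range), [0, 32760] in steps of 8 (unsigned scaled LDR/STR)
   and [-512, 504] in steps of 8 (the LDP/STP immediate range).  */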
3404 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3406 static sbitmap
3407 aarch64_get_separate_components (void)
3409 aarch64_layout_frame ();
3411 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3412 bitmap_clear (components);
3414 /* The registers we need saved to the frame. */
3415 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3416 if (aarch64_register_saved_on_entry (regno))
3418 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3419 if (!frame_pointer_needed)
3420 offset += cfun->machine->frame.frame_size
3421 - cfun->machine->frame.hard_fp_offset;
3422 /* Check that we can access the stack slot of the register with one
3423 direct load with no adjustments needed. */
3424 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3425 bitmap_set_bit (components, regno);
3428 /* Don't mess with the hard frame pointer. */
3429 if (frame_pointer_needed)
3430 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3432 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3433 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3434 /* If aarch64_layout_frame has chosen registers to store/restore with
3435 writeback don't interfere with them to avoid having to output explicit
3436 stack adjustment instructions. */
3437 if (reg2 != INVALID_REGNUM)
3438 bitmap_clear_bit (components, reg2);
3439 if (reg1 != INVALID_REGNUM)
3440 bitmap_clear_bit (components, reg1);
3442 bitmap_clear_bit (components, LR_REGNUM);
3443 bitmap_clear_bit (components, SP_REGNUM);
3445 return components;
3448 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3450 static sbitmap
3451 aarch64_components_for_bb (basic_block bb)
3453 bitmap in = DF_LIVE_IN (bb);
3454 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3455 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3457 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3458 bitmap_clear (components);
3460 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3461 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3462 if ((!call_used_regs[regno])
3463 && (bitmap_bit_p (in, regno)
3464 || bitmap_bit_p (gen, regno)
3465 || bitmap_bit_p (kill, regno)))
3466 bitmap_set_bit (components, regno);
3468 return components;
3471 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3472 Nothing to do for aarch64. */
3474 static void
3475 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3479 /* Return the next set bit in BMP from START onwards. Return the total number
3480 of bits in BMP if no set bit is found at or after START. */
3482 static unsigned int
3483 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3485 unsigned int nbits = SBITMAP_SIZE (bmp);
3486 if (start == nbits)
3487 return start;
3489 gcc_assert (start < nbits);
3490 for (unsigned int i = start; i < nbits; i++)
3491 if (bitmap_bit_p (bmp, i))
3492 return i;
3494 return nbits;
3497 /* Do the work for aarch64_emit_prologue_components and
3498 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3499 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3500 for these components or the epilogue sequence. That is, it determines
3501 whether we should emit stores or loads and what kind of CFA notes to attach
3502 to the insns. Otherwise the logic for the two sequences is very
3503 similar. */
3505 static void
3506 aarch64_process_components (sbitmap components, bool prologue_p)
3508 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3509 ? HARD_FRAME_POINTER_REGNUM
3510 : STACK_POINTER_REGNUM);
3512 unsigned last_regno = SBITMAP_SIZE (components);
3513 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3514 rtx_insn *insn = NULL;
3516 while (regno != last_regno)
3518 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3519 so DFmode for the vector registers is enough. */
3520 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3521 rtx reg = gen_rtx_REG (mode, regno);
3522 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3523 if (!frame_pointer_needed)
3524 offset += cfun->machine->frame.frame_size
3525 - cfun->machine->frame.hard_fp_offset;
3526 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3527 rtx mem = gen_frame_mem (mode, addr);
3529 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3530 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3531 /* No more registers to handle after REGNO.
3532 Emit a single save/restore and exit. */
3533 if (regno2 == last_regno)
3535 insn = emit_insn (set);
3536 RTX_FRAME_RELATED_P (insn) = 1;
3537 if (prologue_p)
3538 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3539 else
3540 add_reg_note (insn, REG_CFA_RESTORE, reg);
3541 break;
3544 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3545 /* The next register is not of the same class or its offset is not
3546 mergeable with the current one into a pair. */
3547 if (!satisfies_constraint_Ump (mem)
3548 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3549 || (offset2 - cfun->machine->frame.reg_offset[regno])
3550 != GET_MODE_SIZE (mode))
3552 insn = emit_insn (set);
3553 RTX_FRAME_RELATED_P (insn) = 1;
3554 if (prologue_p)
3555 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3556 else
3557 add_reg_note (insn, REG_CFA_RESTORE, reg);
3559 regno = regno2;
3560 continue;
3563 /* REGNO2 can be saved/restored in a pair with REGNO. */
3564 rtx reg2 = gen_rtx_REG (mode, regno2);
3565 if (!frame_pointer_needed)
3566 offset2 += cfun->machine->frame.frame_size
3567 - cfun->machine->frame.hard_fp_offset;
3568 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3569 rtx mem2 = gen_frame_mem (mode, addr2);
3570 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3571 : gen_rtx_SET (reg2, mem2);
3573 if (prologue_p)
3574 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3575 else
3576 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3578 RTX_FRAME_RELATED_P (insn) = 1;
3579 if (prologue_p)
3581 add_reg_note (insn, REG_CFA_OFFSET, set);
3582 add_reg_note (insn, REG_CFA_OFFSET, set2);
3584 else
3586 add_reg_note (insn, REG_CFA_RESTORE, reg);
3587 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3590 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3594 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3596 static void
3597 aarch64_emit_prologue_components (sbitmap components)
3599 aarch64_process_components (components, true);
3602 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3604 static void
3605 aarch64_emit_epilogue_components (sbitmap components)
3607 aarch64_process_components (components, false);
3610 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3612 static void
3613 aarch64_set_handled_components (sbitmap components)
3615 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3616 if (bitmap_bit_p (components, regno))
3617 cfun->machine->reg_is_wrapped_separately[regno] = true;
3620 /* AArch64 stack frames generated by this compiler look like:
3622 +-------------------------------+
3624 | incoming stack arguments |
3626 +-------------------------------+
3627 | | <-- incoming stack pointer (aligned)
3628 | callee-allocated save area |
3629 | for register varargs |
3631 +-------------------------------+
3632 | local variables | <-- frame_pointer_rtx
3634 +-------------------------------+
3635 | padding0 | \
3636 +-------------------------------+ |
3637 | callee-saved registers | | frame.saved_regs_size
3638 +-------------------------------+ |
3639 | LR' | |
3640 +-------------------------------+ |
3641 | FP' | / <- hard_frame_pointer_rtx (aligned)
3642 +-------------------------------+
3643 | dynamic allocation |
3644 +-------------------------------+
3645 | padding |
3646 +-------------------------------+
3647 | outgoing stack arguments | <-- arg_pointer
3649 +-------------------------------+
3650 | | <-- stack_pointer_rtx (aligned)
3652 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3653 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3654 unchanged. */
3656 /* Generate the prologue instructions for entry into a function.
3657 Establish the stack frame by decreasing the stack pointer with a
3658 properly calculated size and, if necessary, create a frame record
3659 filled with the values of LR and previous frame pointer. The
3660 current FP is also set up if it is in use. */
3662 void
3663 aarch64_expand_prologue (void)
3665 aarch64_layout_frame ();
3667 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3668 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3669 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3670 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3671 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3672 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3673 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3674 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
3675 rtx_insn *insn;
3677 /* Sign return address for functions. */
3678 if (aarch64_return_address_signing_enabled ())
3680 insn = emit_insn (gen_pacisp ());
3681 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3682 RTX_FRAME_RELATED_P (insn) = 1;
3685 if (flag_stack_usage_info)
3686 current_function_static_stack_size = frame_size;
3688 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3690 if (crtl->is_leaf && !cfun->calls_alloca)
3692 if (frame_size > PROBE_INTERVAL
3693 && frame_size > get_stack_check_protect ())
3694 aarch64_emit_probe_stack_range (get_stack_check_protect (),
3695 (frame_size
3696 - get_stack_check_protect ()));
3698 else if (frame_size > 0)
3699 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
3702 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3704 if (callee_adjust != 0)
3705 aarch64_push_regs (reg1, reg2, callee_adjust);
3707 if (emit_frame_chain)
3709 if (callee_adjust == 0)
3710 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3711 R30_REGNUM, false);
3712 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3713 stack_pointer_rtx,
3714 GEN_INT (callee_offset)));
3715 RTX_FRAME_RELATED_P (insn) = frame_pointer_needed;
3716 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3719 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3720 callee_adjust != 0 || emit_frame_chain);
3721 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3722 callee_adjust != 0 || emit_frame_chain);
3723 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
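/* Editor's note: an illustrative prologue for the small-frame example given
   after aarch64_layout_frame above (a sketch, not verbatim compiler output):

       stp  x29, x30, [sp, -48]!     // callee_adjust push of the wb pair
       add  x29, sp, 0               // establish the frame chain
       str  x19, [sp, 16]            // remaining callee save

   A pacisp would precede this when return-address signing is enabled, and
   a final "sub sp, sp, N" would follow for outgoing arguments.  */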
3726 /* Return TRUE if we can use a simple_return insn.
3728 This function checks whether the callee-saved stack area is empty, which
3729 means no restore actions are needed. The pro_and_epilogue pass uses
3730 this to check whether the shrink-wrapping optimization is feasible. */
3732 bool
3733 aarch64_use_return_insn_p (void)
3735 if (!reload_completed)
3736 return false;
3738 if (crtl->profile)
3739 return false;
3741 aarch64_layout_frame ();
3743 return cfun->machine->frame.frame_size == 0;
3746 /* Generate the epilogue instructions for returning from a function.
3747 This is almost exactly the reverse of the prolog sequence, except
3748 that we need to insert barriers to avoid scheduling loads that read
3749 from a deallocated stack, and we optimize the unwind records by
3750 emitting them all together if possible. */
3751 void
3752 aarch64_expand_epilogue (bool for_sibcall)
3754 aarch64_layout_frame ();
3756 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3757 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3758 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3759 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3760 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3761 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3762 rtx cfi_ops = NULL;
3763 rtx_insn *insn;
3765 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3766 bool need_barrier_p = (get_frame_size ()
3767 + cfun->machine->frame.saved_varargs_size) != 0;
3769 /* Emit a barrier to prevent loads from a deallocated stack. */
3770 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3771 || crtl->calls_eh_return)
3773 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3774 need_barrier_p = false;
3777 /* Restore the stack pointer from the frame pointer if it may not
3778 be the same as the stack pointer. */
3779 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3781 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3782 hard_frame_pointer_rtx,
3783 GEN_INT (-callee_offset)));
3784 /* If writeback is used when restoring callee-saves, the CFA
3785 is restored on the instruction doing the writeback. */
3786 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3788 else
3789 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3791 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3792 callee_adjust != 0, &cfi_ops);
3793 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3794 callee_adjust != 0, &cfi_ops);
3796 if (need_barrier_p)
3797 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3799 if (callee_adjust != 0)
3800 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3802 if (callee_adjust != 0 || initial_adjust > 65536)
3804 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3805 insn = get_last_insn ();
3806 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3807 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3808 RTX_FRAME_RELATED_P (insn) = 1;
3809 cfi_ops = NULL;
3812 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3814 if (cfi_ops)
3816 /* Emit delayed restores and reset the CFA to be SP. */
3817 insn = get_last_insn ();
3818 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3819 REG_NOTES (insn) = cfi_ops;
3820 RTX_FRAME_RELATED_P (insn) = 1;
3823 /* We prefer to emit the combined return/authenticate instruction RETAA;
3824 however, there are three cases in which we must instead emit an explicit
3825 authentication instruction.
3827 1) Sibcalls don't return in a normal way, so if we're about to call one
3828 we must authenticate.
3830 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3831 generating code for !TARGET_ARMV8_3 we can't use it and must
3832 explicitly authenticate.
3834 3) On an eh_return path we make extra stack adjustments to update the
3835 canonical frame address to be the exception handler's CFA. We want
3836 to authenticate using the CFA of the function which calls eh_return.
3838 if (aarch64_return_address_signing_enabled ()
3839 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3841 insn = emit_insn (gen_autisp ());
3842 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3843 RTX_FRAME_RELATED_P (insn) = 1;
3846 /* Stack adjustment for exception handler. */
3847 if (crtl->calls_eh_return)
3849 /* We need to unwind the stack by the offset computed by
3850 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3851 to be SP; letting the CFA move during this adjustment
3852 is just as correct as retaining the CFA from the body
3853 of the function. Therefore, do nothing special. */
3854 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3857 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3858 if (!for_sibcall)
3859 emit_jump_insn (ret_rtx);
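/* Illustrative sketch (not taken from compiler output): for a small frame
   whose prologue was "stp x29, x30, [sp, -32]!; mov x29, sp", the code
   above typically emits the mirror sequence

       ldp x29, x30, [sp], 32
       ret

   with the delayed CFA note re-anchoring the CFA on the stack pointer.
   The exact registers, offsets and unwind notes depend on the layout
   computed by aarch64_layout_frame.  */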
3862 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3863 normally or return to a previous frame after unwinding.
3865 An EH return uses a single shared return sequence. The epilogue is
3866 exactly like a normal epilogue except that it has an extra input
3867 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3868 that must be applied after the frame has been destroyed. An extra label
3869 is inserted before the epilogue which initializes this register to zero,
3870 and this is the entry point for a normal return.
3872 An actual EH return updates the return address, initializes the stack
3873 adjustment and jumps directly into the epilogue (bypassing the zeroing
3874 of the adjustment). Since the return address is typically saved on the
3875 stack when a function makes a call, the saved LR must be updated outside
3876 the epilogue.
3878 This poses problems as the store is generated well before the epilogue,
3879 so the offset of LR is not known yet. Also optimizations will remove the
3880 store as it appears dead, even after the epilogue is generated (as the
3881 base or offset for loading LR is different in many cases).
3883 To avoid these problems this implementation forces the frame pointer
3884 in eh_return functions so that the location of LR is fixed and known early.
3885 It also marks the store volatile, so no optimization is permitted to
3886 remove the store. */
3888 aarch64_eh_return_handler_rtx (void)
3890 rtx tmp = gen_frame_mem (Pmode,
3891 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3893 /* Mark the store volatile, so no optimization is permitted to remove it. */
3894 MEM_VOLATILE_P (tmp) = true;
3895 return tmp;
3898 /* Output code to add DELTA to the first argument, and then jump
3899 to FUNCTION. Used for C++ multiple inheritance. */
3900 static void
3901 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3902 HOST_WIDE_INT delta,
3903 HOST_WIDE_INT vcall_offset,
3904 tree function)
3906 /* The this pointer is always in x0. Note that this differs from
3907 Arm where the this pointer may be bumped to r1 if r0 is required
3908 to return a pointer to an aggregate. On AArch64 a result value
3909 pointer will be in x8. */
3910 int this_regno = R0_REGNUM;
3911 rtx this_rtx, temp0, temp1, addr, funexp;
3912 rtx_insn *insn;
3914 reload_completed = 1;
3915 emit_note (NOTE_INSN_PROLOGUE_END);
3917 if (vcall_offset == 0)
3918 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3919 else
3921 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3923 this_rtx = gen_rtx_REG (Pmode, this_regno);
3924 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3925 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3927 addr = this_rtx;
3928 if (delta != 0)
3930 if (delta >= -256 && delta < 256)
3931 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3932 plus_constant (Pmode, this_rtx, delta));
3933 else
3934 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3937 if (Pmode == ptr_mode)
3938 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3939 else
3940 aarch64_emit_move (temp0,
3941 gen_rtx_ZERO_EXTEND (Pmode,
3942 gen_rtx_MEM (ptr_mode, addr)));
3944 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3945 addr = plus_constant (Pmode, temp0, vcall_offset);
3946 else
3948 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3949 Pmode);
3950 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3953 if (Pmode == ptr_mode)
3954 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3955 else
3956 aarch64_emit_move (temp1,
3957 gen_rtx_SIGN_EXTEND (Pmode,
3958 gen_rtx_MEM (ptr_mode, addr)));
3960 emit_insn (gen_add2_insn (this_rtx, temp1));
3963 /* Generate a tail call to the target function. */
3964 if (!TREE_USED (function))
3966 assemble_external (function);
3967 TREE_USED (function) = 1;
3969 funexp = XEXP (DECL_RTL (function), 0);
3970 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3971 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3972 SIBLING_CALL_P (insn) = 1;
3974 insn = get_insns ();
3975 shorten_branches (insn);
3976 final_start_function (insn, file, 1);
3977 final (insn, file, 1);
3978 final_end_function ();
3980 /* Stop pretending to be a post-reload pass. */
3981 reload_completed = 0;
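/* Illustrative sketch (hypothetical output): a thunk with DELTA == 16 and
   VCALL_OFFSET == 0 only has to bump the this pointer in x0 and tail-call
   FUNCTION, so the code above typically assembles to

       add x0, x0, #16
       b   <function>

   Out-of-range deltas and non-zero vcall offsets additionally use the
   IP0/IP1 scratch registers (x16/x17), as in the code above.  */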
3984 static bool
3985 aarch64_tls_referenced_p (rtx x)
3987 if (!TARGET_HAVE_TLS)
3988 return false;
3989 subrtx_iterator::array_type array;
3990 FOR_EACH_SUBRTX (iter, array, x, ALL)
3992 const_rtx x = *iter;
3993 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3994 return true;
3995 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3996 TLS offsets, not real symbol references. */
3997 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3998 iter.skip_subrtxes ();
4000 return false;
4004 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4005 a left shift of 0 or 12 bits. */
4006 bool
4007 aarch64_uimm12_shift (HOST_WIDE_INT val)
4009 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
4010 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
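/* Illustrative sketch, not part of the build: a self-contained equivalent
   of the check above, usable for experimentation outside GCC.  The helper
   name is made up.  */
#if 0
static bool
uimm12_shift_example (unsigned long long val)
{
  /* All significant bits must sit entirely in bits [0,11] or [12,23],
     matching the ADD/SUB immediate field with LSL #0 or LSL #12.  */
  return (val & 0xfffULL) == val || (val & (0xfffULL << 12)) == val;
}

/* uimm12_shift_example (0xabc)    -> true  (add x0, x1, #0xabc)
   uimm12_shift_example (0xabc000) -> true  (add x0, x1, #0xabc, lsl #12)
   uimm12_shift_example (0xabc001) -> false (not a single ADD/SUB immediate).  */
#endif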
4015 /* Return true if val is an immediate that can be loaded into a
4016 register by a MOVZ instruction. */
4017 static bool
4018 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
4020 if (GET_MODE_SIZE (mode) > 4)
4022 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4023 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4024 return 1;
4026 else
4028 /* Ignore sign extension. */
4029 val &= (HOST_WIDE_INT) 0xffffffff;
4031 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4032 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
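/* Illustrative examples for the check above (64-bit case): a value is a
   MOVZ immediate when its non-zero bits all fall inside one 16-bit chunk,
   i.e. what "movz xN, #imm16, lsl #0/16/32/48" can materialize.
     0x0000000000001234  -> movz x0, #0x1234
     0x0000123400000000  -> movz x0, #0x1234, lsl #32
     0x0000000012340001  -> rejected here (two chunks set); the caller
                            aarch64_move_imm then tries MOVN and the
                            bitmask-immediate encoding instead.  */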
4035 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4037 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4039 0x0000000100000001ull,
4040 0x0001000100010001ull,
4041 0x0101010101010101ull,
4042 0x1111111111111111ull,
4043 0x5555555555555555ull,
4047 /* Return true if val is a valid bitmask immediate. */
4049 bool
4050 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4052 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4053 int bits;
4055 /* Check for a single sequence of one bits and return quickly if so.
4056 The special cases of all ones and all zeroes return false. */
4057 val = (unsigned HOST_WIDE_INT) val_in;
4058 tmp = val + (val & -val);
4060 if (tmp == (tmp & -tmp))
4061 return (val + 1) > 1;
4063 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4064 if (mode == SImode)
4065 val = (val << 32) | (val & 0xffffffff);
4067 /* Invert if the immediate doesn't start with a zero bit - this means we
4068 only need to search for sequences of one bits. */
4069 if (val & 1)
4070 val = ~val;
4072 /* Find the first set bit and set tmp to val with the first sequence of one
4073 bits removed. Return success if there is a single sequence of ones. */
4074 first_one = val & -val;
4075 tmp = val & (val + first_one);
4077 if (tmp == 0)
4078 return true;
4080 /* Find the next set bit and compute the difference in bit position. */
4081 next_one = tmp & -tmp;
4082 bits = clz_hwi (first_one) - clz_hwi (next_one);
4083 mask = val ^ tmp;
4085 /* Check the bit position difference is a power of 2, and that the first
4086 sequence of one bits fits within 'bits' bits. */
4087 if ((mask >> bits) != 0 || bits != (bits & -bits))
4088 return false;
4090 /* Check the sequence of one bits is repeated 64/bits times. */
4091 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
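/* Illustrative reference implementation, not part of the build: an AArch64
   logical (bitmask) immediate is a power-of-two sized element of 2..64 bits
   containing a single contiguous run of ones (never all ones), rotated by
   any amount and replicated to fill 64 bits.  The brute-force check below
   enumerates that definition directly and should agree with
   aarch64_bitmask_imm for DImode values; the helper name is made up.  */
#if 0
static bool
bitmask_imm_reference (unsigned long long val)
{
  for (int size = 2; size <= 64; size *= 2)
    for (int len = 1; len < size; len++)
      for (int rot = 0; rot < size; rot++)
	{
	  unsigned long long size_mask
	    = size == 64 ? ~0ULL : (1ULL << size) - 1;
	  /* A run of LEN ones, rotated left by ROT within the element.  */
	  unsigned long long elt = (1ULL << len) - 1;
	  if (rot)
	    elt = ((elt << rot) | (elt >> (size - rot))) & size_mask;
	  /* Replicate the element across all 64 bits.  */
	  unsigned long long rep = 0;
	  for (int i = 0; i < 64; i += size)
	    rep |= elt << i;
	  if (rep == val)
	    return true;
	}
  return false;
}

/* Examples: 0x5555555555555555 and 0x00ff00ff00ff00ff are valid (2-bit and
   16-bit elements); 0 and ~0ULL are not encodable.  */
#endif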
4094 /* Create a mask of ones covering the range from the lowest to the highest bit set in VAL_IN.
4095 Assumed precondition: VAL_IN is not zero. */
4097 unsigned HOST_WIDE_INT
4098 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4100 int lowest_bit_set = ctz_hwi (val_in);
4101 int highest_bit_set = floor_log2 (val_in);
4102 gcc_assert (val_in != 0);
4104 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4105 (HOST_WIDE_INT_1U << lowest_bit_set));
4108 /* Create a constant where the bits outside the range from the lowest to the
4109 highest bit set in VAL_IN are set to 1. */
4111 unsigned HOST_WIDE_INT
4112 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4114 return val_in | ~aarch64_and_split_imm1 (val_in);
4117 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4119 bool
4120 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4122 scalar_int_mode int_mode;
4123 if (!is_a <scalar_int_mode> (mode, &int_mode))
4124 return false;
4126 if (aarch64_bitmask_imm (val_in, int_mode))
4127 return false;
4129 if (aarch64_move_imm (val_in, int_mode))
4130 return false;
4132 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4134 return aarch64_bitmask_imm (imm2, int_mode);
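/* Worked example (illustrative): val_in = 0x00ffff00ffff0000 is neither a
   bitmask immediate nor a MOV immediate, but it can be split as
     imm1 = aarch64_and_split_imm1 (val_in) = 0x00ffffffffff0000
     imm2 = aarch64_and_split_imm2 (val_in) = 0xffffff00ffffffff
   where imm1 & imm2 == val_in and both are valid bitmask immediates, so
     and x0, x1, #0x00ffff00ffff0000
   can be emitted as
     and x0, x1, #0x00ffffffffff0000
     and x0, x0, #0xffffff00ffffffff  */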
4137 /* Return true if val is an immediate that can be loaded into a
4138 register in a single instruction. */
4139 bool
4140 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4142 scalar_int_mode int_mode;
4143 if (!is_a <scalar_int_mode> (mode, &int_mode))
4144 return false;
4146 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4147 return 1;
4148 return aarch64_bitmask_imm (val, int_mode);
4151 static bool
4152 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4154 rtx base, offset;
4156 if (GET_CODE (x) == HIGH)
4157 return true;
4159 split_const (x, &base, &offset);
4160 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4162 if (aarch64_classify_symbol (base, offset)
4163 != SYMBOL_FORCE_TO_MEM)
4164 return true;
4165 else
4166 /* Avoid generating a 64-bit relocation in ILP32; leave
4167 to aarch64_expand_mov_immediate to handle it properly. */
4168 return mode != ptr_mode;
4171 return aarch64_tls_referenced_p (x);
4174 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4175 The expansion of a table switch is quite expensive due to the number
4176 of instructions, the table lookup and the hard-to-predict indirect jump.
4177 When optimizing for speed with -O3 enabled, use the per-core tuning if
4178 it is set, otherwise use tables for more than 16 cases as a tradeoff
4179 between size and performance. When optimizing for size, use the default setting. */
4181 static unsigned int
4182 aarch64_case_values_threshold (void)
4184 /* Use the specified limit for the number of cases before using jump
4185 tables at higher optimization levels. */
4186 if (optimize > 2
4187 && selected_cpu->tune->max_case_values != 0)
4188 return selected_cpu->tune->max_case_values;
4189 else
4190 return optimize_size ? default_case_values_threshold () : 17;
4193 /* Return true if register REGNO is a valid index register.
4194 STRICT_P is true if REG_OK_STRICT is in effect. */
4196 bool
4197 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4199 if (!HARD_REGISTER_NUM_P (regno))
4201 if (!strict_p)
4202 return true;
4204 if (!reg_renumber)
4205 return false;
4207 regno = reg_renumber[regno];
4209 return GP_REGNUM_P (regno);
4212 /* Return true if register REGNO is a valid base register.
4213 STRICT_P is true if REG_OK_STRICT is in effect. */
4215 bool
4216 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4218 if (!HARD_REGISTER_NUM_P (regno))
4220 if (!strict_p)
4221 return true;
4223 if (!reg_renumber)
4224 return false;
4226 regno = reg_renumber[regno];
4229 /* The fake registers will be eliminated to either the stack or
4230 hard frame pointer, both of which are usually valid base registers.
4231 Reload deals with the cases where the eliminated form isn't valid. */
4232 return (GP_REGNUM_P (regno)
4233 || regno == SP_REGNUM
4234 || regno == FRAME_POINTER_REGNUM
4235 || regno == ARG_POINTER_REGNUM);
4238 /* Return true if X is a valid base register.
4239 STRICT_P is true if REG_OK_STRICT is in effect. */
4241 static bool
4242 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4244 if (!strict_p
4245 && GET_CODE (x) == SUBREG
4246 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4247 x = SUBREG_REG (x);
4249 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4252 /* Return true if the address offset X is a valid index. If it is, fill in INFO
4253 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4255 static bool
4256 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4257 machine_mode mode, bool strict_p)
4259 enum aarch64_address_type type;
4260 rtx index;
4261 int shift;
4263 /* (reg:P) */
4264 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4265 && GET_MODE (x) == Pmode)
4267 type = ADDRESS_REG_REG;
4268 index = x;
4269 shift = 0;
4271 /* (sign_extend:DI (reg:SI)) */
4272 else if ((GET_CODE (x) == SIGN_EXTEND
4273 || GET_CODE (x) == ZERO_EXTEND)
4274 && GET_MODE (x) == DImode
4275 && GET_MODE (XEXP (x, 0)) == SImode)
4277 type = (GET_CODE (x) == SIGN_EXTEND)
4278 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4279 index = XEXP (x, 0);
4280 shift = 0;
4282 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4283 else if (GET_CODE (x) == MULT
4284 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4285 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4286 && GET_MODE (XEXP (x, 0)) == DImode
4287 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4288 && CONST_INT_P (XEXP (x, 1)))
4290 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4291 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4292 index = XEXP (XEXP (x, 0), 0);
4293 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4295 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4296 else if (GET_CODE (x) == ASHIFT
4297 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4298 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4299 && GET_MODE (XEXP (x, 0)) == DImode
4300 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4301 && CONST_INT_P (XEXP (x, 1)))
4303 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4304 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4305 index = XEXP (XEXP (x, 0), 0);
4306 shift = INTVAL (XEXP (x, 1));
4308 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4309 else if ((GET_CODE (x) == SIGN_EXTRACT
4310 || GET_CODE (x) == ZERO_EXTRACT)
4311 && GET_MODE (x) == DImode
4312 && GET_CODE (XEXP (x, 0)) == MULT
4313 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4314 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4316 type = (GET_CODE (x) == SIGN_EXTRACT)
4317 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4318 index = XEXP (XEXP (x, 0), 0);
4319 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4320 if (INTVAL (XEXP (x, 1)) != 32 + shift
4321 || INTVAL (XEXP (x, 2)) != 0)
4322 shift = -1;
4324 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4325 (const_int 0xffffffff<<shift)) */
4326 else if (GET_CODE (x) == AND
4327 && GET_MODE (x) == DImode
4328 && GET_CODE (XEXP (x, 0)) == MULT
4329 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4330 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4331 && CONST_INT_P (XEXP (x, 1)))
4333 type = ADDRESS_REG_UXTW;
4334 index = XEXP (XEXP (x, 0), 0);
4335 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4336 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4337 shift = -1;
4339 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4340 else if ((GET_CODE (x) == SIGN_EXTRACT
4341 || GET_CODE (x) == ZERO_EXTRACT)
4342 && GET_MODE (x) == DImode
4343 && GET_CODE (XEXP (x, 0)) == ASHIFT
4344 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4345 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4347 type = (GET_CODE (x) == SIGN_EXTRACT)
4348 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4349 index = XEXP (XEXP (x, 0), 0);
4350 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4351 if (INTVAL (XEXP (x, 1)) != 32 + shift
4352 || INTVAL (XEXP (x, 2)) != 0)
4353 shift = -1;
4355 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4356 (const_int 0xffffffff<<shift)) */
4357 else if (GET_CODE (x) == AND
4358 && GET_MODE (x) == DImode
4359 && GET_CODE (XEXP (x, 0)) == ASHIFT
4360 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4361 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4362 && CONST_INT_P (XEXP (x, 1)))
4364 type = ADDRESS_REG_UXTW;
4365 index = XEXP (XEXP (x, 0), 0);
4366 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4367 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4368 shift = -1;
4370 /* (mult:P (reg:P) (const_int scale)) */
4371 else if (GET_CODE (x) == MULT
4372 && GET_MODE (x) == Pmode
4373 && GET_MODE (XEXP (x, 0)) == Pmode
4374 && CONST_INT_P (XEXP (x, 1)))
4376 type = ADDRESS_REG_REG;
4377 index = XEXP (x, 0);
4378 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4380 /* (ashift:P (reg:P) (const_int shift)) */
4381 else if (GET_CODE (x) == ASHIFT
4382 && GET_MODE (x) == Pmode
4383 && GET_MODE (XEXP (x, 0)) == Pmode
4384 && CONST_INT_P (XEXP (x, 1)))
4386 type = ADDRESS_REG_REG;
4387 index = XEXP (x, 0);
4388 shift = INTVAL (XEXP (x, 1));
4390 else
4391 return false;
4393 if (!strict_p
4394 && GET_CODE (index) == SUBREG
4395 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4396 index = SUBREG_REG (index);
4398 if ((shift == 0 ||
4399 (shift > 0 && shift <= 3
4400 && (1 << shift) == GET_MODE_SIZE (mode)))
4401 && REG_P (index)
4402 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4404 info->type = type;
4405 info->offset = index;
4406 info->shift = shift;
4407 return true;
4410 return false;
4413 /* Return true if MODE is one of the modes for which we
4414 support LDP/STP operations. */
4416 static bool
4417 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4419 return mode == SImode || mode == DImode
4420 || mode == SFmode || mode == DFmode
4421 || (aarch64_vector_mode_supported_p (mode)
4422 && GET_MODE_SIZE (mode) == 8);
4425 /* Return true if REGNO is a virtual pointer register, or an eliminable
4426 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4427 include stack_pointer or hard_frame_pointer. */
4428 static bool
4429 virt_or_elim_regno_p (unsigned regno)
4431 return ((regno >= FIRST_VIRTUAL_REGISTER
4432 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4433 || regno == FRAME_POINTER_REGNUM
4434 || regno == ARG_POINTER_REGNUM);
4437 /* Return true if X is a valid address for machine mode MODE. If it is,
4438 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4439 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4441 static bool
4442 aarch64_classify_address (struct aarch64_address_info *info,
4443 rtx x, machine_mode mode,
4444 RTX_CODE outer_code, bool strict_p)
4446 enum rtx_code code = GET_CODE (x);
4447 rtx op0, op1;
4449 /* On BE, we use load/store pair for all large int mode load/stores.
4450 TI/TFmode may also use a load/store pair. */
4451 bool load_store_pair_p = (outer_code == PARALLEL
4452 || mode == TImode
4453 || mode == TFmode
4454 || (BYTES_BIG_ENDIAN
4455 && aarch64_vect_struct_mode_p (mode)));
4457 bool allow_reg_index_p =
4458 !load_store_pair_p
4459 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4460 && !aarch64_vect_struct_mode_p (mode);
4462 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4463 REG addressing. */
4464 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4465 && (code != POST_INC && code != REG))
4466 return false;
4468 switch (code)
4470 case REG:
4471 case SUBREG:
4472 info->type = ADDRESS_REG_IMM;
4473 info->base = x;
4474 info->offset = const0_rtx;
4475 return aarch64_base_register_rtx_p (x, strict_p);
4477 case PLUS:
4478 op0 = XEXP (x, 0);
4479 op1 = XEXP (x, 1);
4481 if (! strict_p
4482 && REG_P (op0)
4483 && virt_or_elim_regno_p (REGNO (op0))
4484 && CONST_INT_P (op1))
4486 info->type = ADDRESS_REG_IMM;
4487 info->base = op0;
4488 info->offset = op1;
4490 return true;
4493 if (GET_MODE_SIZE (mode) != 0
4494 && CONST_INT_P (op1)
4495 && aarch64_base_register_rtx_p (op0, strict_p))
4497 HOST_WIDE_INT offset = INTVAL (op1);
4499 info->type = ADDRESS_REG_IMM;
4500 info->base = op0;
4501 info->offset = op1;
4503 /* TImode and TFmode values are allowed in both pairs of X
4504 registers and individual Q registers. The available
4505 address modes are:
4506 X,X: 7-bit signed scaled offset
4507 Q: 9-bit signed offset
4508 We conservatively require an offset representable in either mode.
4509 When performing the check for pairs of X registers i.e. LDP/STP
4510 pass down DImode since that is the natural size of the LDP/STP
4511 instruction memory accesses. */
4512 if (mode == TImode || mode == TFmode)
4513 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4514 && (offset_9bit_signed_unscaled_p (mode, offset)
4515 || offset_12bit_unsigned_scaled_p (mode, offset)));
4517 /* A 7-bit offset check because OImode will emit an ldp/stp
4518 instruction (only big endian will get here).
4519 For ldp/stp instructions, the offset is scaled for the size of a
4520 single element of the pair. */
4521 if (mode == OImode)
4522 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4524 /* Three 9/12-bit offset checks because CImode will emit three
4525 ldr/str instructions (only big endian will get here). */
4526 if (mode == CImode)
4527 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4528 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4529 || offset_12bit_unsigned_scaled_p (V16QImode,
4530 offset + 32)));
4532 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4533 instructions (only big endian will get here). */
4534 if (mode == XImode)
4535 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4536 && aarch64_offset_7bit_signed_scaled_p (TImode,
4537 offset + 32));
4539 if (load_store_pair_p)
4540 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4541 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4542 else
4543 return (offset_9bit_signed_unscaled_p (mode, offset)
4544 || offset_12bit_unsigned_scaled_p (mode, offset));
4547 if (allow_reg_index_p)
4549 /* Look for base + (scaled/extended) index register. */
4550 if (aarch64_base_register_rtx_p (op0, strict_p)
4551 && aarch64_classify_index (info, op1, mode, strict_p))
4553 info->base = op0;
4554 return true;
4556 if (aarch64_base_register_rtx_p (op1, strict_p)
4557 && aarch64_classify_index (info, op0, mode, strict_p))
4559 info->base = op1;
4560 return true;
4564 return false;
4566 case POST_INC:
4567 case POST_DEC:
4568 case PRE_INC:
4569 case PRE_DEC:
4570 info->type = ADDRESS_REG_WB;
4571 info->base = XEXP (x, 0);
4572 info->offset = NULL_RTX;
4573 return aarch64_base_register_rtx_p (info->base, strict_p);
4575 case POST_MODIFY:
4576 case PRE_MODIFY:
4577 info->type = ADDRESS_REG_WB;
4578 info->base = XEXP (x, 0);
4579 if (GET_CODE (XEXP (x, 1)) == PLUS
4580 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4581 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4582 && aarch64_base_register_rtx_p (info->base, strict_p))
4584 HOST_WIDE_INT offset;
4585 info->offset = XEXP (XEXP (x, 1), 1);
4586 offset = INTVAL (info->offset);
4588 /* TImode and TFmode values are allowed in both pairs of X
4589 registers and individual Q registers. The available
4590 address modes are:
4591 X,X: 7-bit signed scaled offset
4592 Q: 9-bit signed offset
4593 We conservatively require an offset representable in either mode.
4595 if (mode == TImode || mode == TFmode)
4596 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4597 && offset_9bit_signed_unscaled_p (mode, offset));
4599 if (load_store_pair_p)
4600 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4601 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4602 else
4603 return offset_9bit_signed_unscaled_p (mode, offset);
4605 return false;
4607 case CONST:
4608 case SYMBOL_REF:
4609 case LABEL_REF:
4610 /* load literal: pc-relative constant pool entry. Only supported
4611 for SI mode or larger. */
4612 info->type = ADDRESS_SYMBOLIC;
4614 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4616 rtx sym, addend;
4618 split_const (x, &sym, &addend);
4619 return ((GET_CODE (sym) == LABEL_REF
4620 || (GET_CODE (sym) == SYMBOL_REF
4621 && CONSTANT_POOL_ADDRESS_P (sym)
4622 && aarch64_pcrelative_literal_loads)));
4624 return false;
4626 case LO_SUM:
4627 info->type = ADDRESS_LO_SUM;
4628 info->base = XEXP (x, 0);
4629 info->offset = XEXP (x, 1);
4630 if (allow_reg_index_p
4631 && aarch64_base_register_rtx_p (info->base, strict_p))
4633 rtx sym, offs;
4634 split_const (info->offset, &sym, &offs);
4635 if (GET_CODE (sym) == SYMBOL_REF
4636 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4638 /* The symbol and offset must be aligned to the access size. */
4639 unsigned int align;
4640 unsigned int ref_size;
4642 if (CONSTANT_POOL_ADDRESS_P (sym))
4643 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4644 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4646 tree exp = SYMBOL_REF_DECL (sym);
4647 align = TYPE_ALIGN (TREE_TYPE (exp));
4648 align = aarch64_constant_alignment (exp, align);
4650 else if (SYMBOL_REF_DECL (sym))
4651 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4652 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4653 && SYMBOL_REF_BLOCK (sym) != NULL)
4654 align = SYMBOL_REF_BLOCK (sym)->alignment;
4655 else
4656 align = BITS_PER_UNIT;
4658 ref_size = GET_MODE_SIZE (mode);
4659 if (ref_size == 0)
4660 ref_size = GET_MODE_SIZE (DImode);
4662 return ((INTVAL (offs) & (ref_size - 1)) == 0
4663 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4666 return false;
4668 default:
4669 return false;
4673 /* Return true if the address X is valid for a PRFM instruction.
4674 STRICT_P is true if we should do strict checking with
4675 aarch64_classify_address. */
4677 bool
4678 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4680 struct aarch64_address_info addr;
4682 /* PRFM accepts the same addresses as DImode... */
4683 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4684 if (!res)
4685 return false;
4687 /* ... except writeback forms. */
4688 return addr.type != ADDRESS_REG_WB;
4691 bool
4692 aarch64_symbolic_address_p (rtx x)
4694 rtx offset;
4696 split_const (x, &x, &offset);
4697 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4700 /* Classify the base of symbolic expression X. */
4702 enum aarch64_symbol_type
4703 aarch64_classify_symbolic_expression (rtx x)
4705 rtx offset;
4707 split_const (x, &x, &offset);
4708 return aarch64_classify_symbol (x, offset);
4712 /* Return TRUE if X is a legitimate address for accessing memory in
4713 mode MODE. */
4714 static bool
4715 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4717 struct aarch64_address_info addr;
4719 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4722 /* Return TRUE if X is a legitimate address for accessing memory in
4723 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4724 pair operation. */
4725 bool
4726 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4727 RTX_CODE outer_code, bool strict_p)
4729 struct aarch64_address_info addr;
4731 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4734 /* Split an out-of-range address displacement into a base and offset.
4735 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4736 to increase opportunities for sharing the base address between accesses of different sizes.
4737 Unaligned accesses use the signed 9-bit range; TImode/TFmode use
4738 the intersection of the signed scaled 7-bit and signed 9-bit offset ranges. */
4739 static bool
4740 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4742 HOST_WIDE_INT offset = INTVAL (*disp);
4743 HOST_WIDE_INT base;
4745 if (mode == TImode || mode == TFmode)
4746 base = (offset + 0x100) & ~0x1f8;
4747 else if ((offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4748 base = (offset + 0x100) & ~0x1ff;
4749 else
4750 base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4752 *off = GEN_INT (base);
4753 *disp = GEN_INT (offset - base);
4754 return true;
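/* Worked example (illustrative): a DImode access at displacement 0x9008 is
   out of range for a single LDR/STR (the scaled 12-bit limit is 0x7ff8),
   so the code above splits it as
     base     = 0x9008 & ~0x3ffc = 0x8000
     residual = 0x1008
   and the residual fits the scaled unsigned 12-bit offset range.  Nearby
   accesses of other sizes can share the same 0x8000-anchored base.  */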
4757 /* Return the binary representation of floating point constant VALUE in INTVAL.
4758 If the value cannot be converted, return false without setting INTVAL.
4759 The conversion is done in the mode of VALUE. */
4760 bool
4761 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4764 /* We make a general exception for 0. */
4765 if (aarch64_float_const_zero_rtx_p (value))
4767 *intval = 0;
4768 return true;
4771 machine_mode mode = GET_MODE (value);
4772 if (GET_CODE (value) != CONST_DOUBLE
4773 || !SCALAR_FLOAT_MODE_P (mode)
4774 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4775 /* Only support up to DF mode. */
4776 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4777 return false;
4779 unsigned HOST_WIDE_INT ival = 0;
4781 long res[2];
4782 real_to_target (res,
4783 CONST_DOUBLE_REAL_VALUE (value),
4784 REAL_MODE_FORMAT (mode));
4786 if (mode == DFmode)
4788 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4789 ival = zext_hwi (res[order], 32);
4790 ival |= (zext_hwi (res[1 - order], 32) << 32);
4792 else
4793 ival = zext_hwi (res[0], 32);
4795 *intval = ival;
4796 return true;
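/* Worked example (illustrative): the IEEE double 1.0 has the bit pattern
   0x3ff0000000000000, so for a DFmode CONST_DOUBLE of 1.0 the routine
   above sets *INTVAL to that value; callers such as
   aarch64_float_const_rtx_p then ask how many MOV/MOVK instructions are
   needed to build it as an integer.  */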
4799 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4800 single MOV(+MOVK) followed by an FMOV. */
4801 bool
4802 aarch64_float_const_rtx_p (rtx x)
4804 machine_mode mode = GET_MODE (x);
4805 if (mode == VOIDmode)
4806 return false;
4808 /* Determine whether it's cheaper to write float constants as
4809 mov/movk pairs than as ldr/adrp pairs. */
4810 unsigned HOST_WIDE_INT ival;
4812 if (GET_CODE (x) == CONST_DOUBLE
4813 && SCALAR_FLOAT_MODE_P (mode)
4814 && aarch64_reinterpret_float_as_int (x, &ival))
4816 scalar_int_mode imode = (mode == HFmode
4817 ? SImode
4818 : int_mode_for_mode (mode).require ());
4819 int num_instr = aarch64_internal_mov_immediate
4820 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4821 return num_instr < 3;
4824 return false;
4827 /* Return TRUE if rtx X is the immediate constant 0.0. */
4828 bool
4829 aarch64_float_const_zero_rtx_p (rtx x)
4831 if (GET_MODE (x) == VOIDmode)
4832 return false;
4834 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4835 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4836 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4839 /* Return TRUE if rtx X is an immediate constant that fits in a single
4840 MOVI immediate operation. */
4841 bool
4842 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4844 if (!TARGET_SIMD)
4845 return false;
4847 machine_mode vmode;
4848 scalar_int_mode imode;
4849 unsigned HOST_WIDE_INT ival;
4851 if (GET_CODE (x) == CONST_DOUBLE
4852 && SCALAR_FLOAT_MODE_P (mode))
4854 if (!aarch64_reinterpret_float_as_int (x, &ival))
4855 return false;
4857 /* We make a general exception for 0. */
4858 if (aarch64_float_const_zero_rtx_p (x))
4859 return true;
4861 imode = int_mode_for_mode (mode).require ();
4863 else if (GET_CODE (x) == CONST_INT
4864 && is_a <scalar_int_mode> (mode, &imode))
4865 ival = INTVAL (x);
4866 else
4867 return false;
4869 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
4870 a 128-bit vector mode. */
4871 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4873 vmode = aarch64_simd_container_mode (imode, width);
4874 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4876 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4880 /* Return the fixed registers used for condition codes. */
4882 static bool
4883 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4885 *p1 = CC_REGNUM;
4886 *p2 = INVALID_REGNUM;
4887 return true;
4890 /* This function is used by the call expanders of the machine description.
4891 RESULT is the register in which the result is returned. It's NULL for
4892 "call" and "sibcall".
4893 MEM is the location of the function call.
4894 SIBCALL indicates whether this function call is a normal call or a sibling call.
4895 A different pattern will be generated accordingly. */
4897 void
4898 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4900 rtx call, callee, tmp;
4901 rtvec vec;
4902 machine_mode mode;
4904 gcc_assert (MEM_P (mem));
4905 callee = XEXP (mem, 0);
4906 mode = GET_MODE (callee);
4907 gcc_assert (mode == Pmode);
4909 /* Decide if we should generate indirect calls by loading the
4910 address of the callee into a register before performing
4911 the branch-and-link. */
4912 if (SYMBOL_REF_P (callee)
4913 ? (aarch64_is_long_call_p (callee)
4914 || aarch64_is_noplt_call_p (callee))
4915 : !REG_P (callee))
4916 XEXP (mem, 0) = force_reg (mode, callee);
4918 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4920 if (result != NULL_RTX)
4921 call = gen_rtx_SET (result, call);
4923 if (sibcall)
4924 tmp = ret_rtx;
4925 else
4926 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4928 vec = gen_rtvec (2, call, tmp);
4929 call = gen_rtx_PARALLEL (VOIDmode, vec);
4931 aarch64_emit_call_insn (call);
4934 /* Emit call insn with PAT and do aarch64-specific handling. */
4936 void
4937 aarch64_emit_call_insn (rtx pat)
4939 rtx insn = emit_call_insn (pat);
4941 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4942 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4943 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4946 machine_mode
4947 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4949 /* All floating point compares return CCFP if it is an equality
4950 comparison, and CCFPE otherwise. */
4951 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4953 switch (code)
4955 case EQ:
4956 case NE:
4957 case UNORDERED:
4958 case ORDERED:
4959 case UNLT:
4960 case UNLE:
4961 case UNGT:
4962 case UNGE:
4963 case UNEQ:
4964 case LTGT:
4965 return CCFPmode;
4967 case LT:
4968 case LE:
4969 case GT:
4970 case GE:
4971 return CCFPEmode;
4973 default:
4974 gcc_unreachable ();
4978 /* Equality comparisons of short modes against zero can be performed
4979 using the TST instruction with the appropriate bitmask. */
4980 if (y == const0_rtx && REG_P (x)
4981 && (code == EQ || code == NE)
4982 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4983 return CC_NZmode;
4985 /* Similarly, comparisons of zero_extends from shorter modes can
4986 be performed using an ANDS with an immediate mask. */
4987 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4988 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4989 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4990 && (code == EQ || code == NE))
4991 return CC_NZmode;
4993 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4994 && y == const0_rtx
4995 && (code == EQ || code == NE || code == LT || code == GE)
4996 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4997 || GET_CODE (x) == NEG
4998 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4999 && CONST_INT_P (XEXP (x, 2)))))
5000 return CC_NZmode;
5002 /* A compare with a shifted operand. Because of canonicalization,
5003 the comparison will have to be swapped when we emit the assembly
5004 code. */
5005 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5006 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
5007 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
5008 || GET_CODE (x) == LSHIFTRT
5009 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
5010 return CC_SWPmode;
5012 /* Similarly for a negated operand, but we can only do this for
5013 equalities. */
5014 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5015 && (REG_P (y) || GET_CODE (y) == SUBREG)
5016 && (code == EQ || code == NE)
5017 && GET_CODE (x) == NEG)
5018 return CC_Zmode;
5020 /* A test for unsigned overflow. */
5021 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
5022 && code == NE
5023 && GET_CODE (x) == PLUS
5024 && GET_CODE (y) == ZERO_EXTEND)
5025 return CC_Cmode;
5027 /* For everything else, return CCmode. */
5028 return CCmode;
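/* Illustrative examples of the classification above (not exhaustive):
     (compare (reg:DI a) (reg:DI b)), GTU           -> CCmode
     (compare (plus:DI a b) (const_int 0)), EQ/LT   -> CC_NZmode
     (compare (ashift:SI a n) (reg:SI b))           -> CC_SWPmode, operands
                                                       swapped when printed
     (compare (neg:DI a) (reg:DI b)), EQ            -> CC_Zmode
     FP equality/unordered comparisons              -> CCFPmode
     FP <, <=, >, >=                                -> CCFPEmode  */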
5031 static int
5032 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5035 aarch64_get_condition_code (rtx x)
5037 machine_mode mode = GET_MODE (XEXP (x, 0));
5038 enum rtx_code comp_code = GET_CODE (x);
5040 if (GET_MODE_CLASS (mode) != MODE_CC)
5041 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5042 return aarch64_get_condition_code_1 (mode, comp_code);
5045 static int
5046 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5048 switch (mode)
5050 case E_CCFPmode:
5051 case E_CCFPEmode:
5052 switch (comp_code)
5054 case GE: return AARCH64_GE;
5055 case GT: return AARCH64_GT;
5056 case LE: return AARCH64_LS;
5057 case LT: return AARCH64_MI;
5058 case NE: return AARCH64_NE;
5059 case EQ: return AARCH64_EQ;
5060 case ORDERED: return AARCH64_VC;
5061 case UNORDERED: return AARCH64_VS;
5062 case UNLT: return AARCH64_LT;
5063 case UNLE: return AARCH64_LE;
5064 case UNGT: return AARCH64_HI;
5065 case UNGE: return AARCH64_PL;
5066 default: return -1;
5068 break;
5070 case E_CCmode:
5071 switch (comp_code)
5073 case NE: return AARCH64_NE;
5074 case EQ: return AARCH64_EQ;
5075 case GE: return AARCH64_GE;
5076 case GT: return AARCH64_GT;
5077 case LE: return AARCH64_LE;
5078 case LT: return AARCH64_LT;
5079 case GEU: return AARCH64_CS;
5080 case GTU: return AARCH64_HI;
5081 case LEU: return AARCH64_LS;
5082 case LTU: return AARCH64_CC;
5083 default: return -1;
5085 break;
5087 case E_CC_SWPmode:
5088 switch (comp_code)
5090 case NE: return AARCH64_NE;
5091 case EQ: return AARCH64_EQ;
5092 case GE: return AARCH64_LE;
5093 case GT: return AARCH64_LT;
5094 case LE: return AARCH64_GE;
5095 case LT: return AARCH64_GT;
5096 case GEU: return AARCH64_LS;
5097 case GTU: return AARCH64_CC;
5098 case LEU: return AARCH64_CS;
5099 case LTU: return AARCH64_HI;
5100 default: return -1;
5102 break;
5104 case E_CC_NZmode:
5105 switch (comp_code)
5107 case NE: return AARCH64_NE;
5108 case EQ: return AARCH64_EQ;
5109 case GE: return AARCH64_PL;
5110 case LT: return AARCH64_MI;
5111 default: return -1;
5113 break;
5115 case E_CC_Zmode:
5116 switch (comp_code)
5118 case NE: return AARCH64_NE;
5119 case EQ: return AARCH64_EQ;
5120 default: return -1;
5122 break;
5124 case E_CC_Cmode:
5125 switch (comp_code)
5127 case NE: return AARCH64_CS;
5128 case EQ: return AARCH64_CC;
5129 default: return -1;
5131 break;
5133 default:
5134 return -1;
5137 return -1;
5140 bool
5141 aarch64_const_vec_all_same_in_range_p (rtx x,
5142 HOST_WIDE_INT minval,
5143 HOST_WIDE_INT maxval)
5145 HOST_WIDE_INT firstval;
5146 int count, i;
5148 if (GET_CODE (x) != CONST_VECTOR
5149 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5150 return false;
5152 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5153 if (firstval < minval || firstval > maxval)
5154 return false;
5156 count = CONST_VECTOR_NUNITS (x);
5157 for (i = 1; i < count; i++)
5158 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5159 return false;
5161 return true;
5164 bool
5165 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5167 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5171 /* N Z C V. */
5172 #define AARCH64_CC_V 1
5173 #define AARCH64_CC_C (1 << 1)
5174 #define AARCH64_CC_Z (1 << 2)
5175 #define AARCH64_CC_N (1 << 3)
5177 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5178 static const int aarch64_nzcv_codes[] =
5180 0, /* EQ, Z == 1. */
5181 AARCH64_CC_Z, /* NE, Z == 0. */
5182 0, /* CS, C == 1. */
5183 AARCH64_CC_C, /* CC, C == 0. */
5184 0, /* MI, N == 1. */
5185 AARCH64_CC_N, /* PL, N == 0. */
5186 0, /* VS, V == 1. */
5187 AARCH64_CC_V, /* VC, V == 0. */
5188 0, /* HI, C == 1 && Z == 0. */
5189 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5190 AARCH64_CC_V, /* GE, N == V. */
5191 0, /* LT, N != V. */
5192 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5193 0, /* LE, !(Z == 0 && N == V). */
5194 0, /* AL, Any. */
5195 0 /* NV, Any. */
5198 /* Print operand X to file F in a target specific manner according to CODE.
5199 The acceptable formatting commands given by CODE are:
5200 'c': An integer or symbol address without a preceding #
5201 sign.
5202 'e': Print the sign/zero-extend size as a character 8->b,
5203 16->h, 32->w.
5204 'p': Prints N such that 2^N == X (X must be power of 2 and
5205 const int).
5206 'P': Print the number of non-zero bits in X (a const_int).
5207 'H': Print the higher numbered register of a pair (TImode)
5208 of regs.
5209 'm': Print a condition (eq, ne, etc).
5210 'M': Same as 'm', but invert condition.
5211 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5212 'S/T/U/V': Print a FP/SIMD register name for a register list.
5213 The register printed is the FP/SIMD register name
5214 of X + 0/1/2/3 for S/T/U/V.
5215 'R': Print a scalar FP/SIMD register name + 1.
5216 'X': Print bottom 16 bits of integer constant in hex.
5217 'w/x': Print a general register name or the zero register
5218 (32-bit or 64-bit).
5219 '0': Print a normal operand, if it's a general register,
5220 then we assume DImode.
5221 'k': Print NZCV for conditional compare instructions.
5222 'A': Output address constant representing the first
5223 argument of X, specifying a relocation offset
5224 if appropriate.
5225 'L': Output constant address specified by X
5226 with a relocation offset if appropriate.
5227 'G': Prints address of X, specifying a PC relative
5228 relocation mode if appropriate. */
5230 static void
5231 aarch64_print_operand (FILE *f, rtx x, int code)
5233 switch (code)
5235 case 'c':
5236 switch (GET_CODE (x))
5238 case CONST_INT:
5239 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5240 break;
5242 case SYMBOL_REF:
5243 output_addr_const (f, x);
5244 break;
5246 case CONST:
5247 if (GET_CODE (XEXP (x, 0)) == PLUS
5248 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5250 output_addr_const (f, x);
5251 break;
5253 /* Fall through. */
5255 default:
5256 output_operand_lossage ("Unsupported operand for code '%c'", code);
5258 break;
5260 case 'e':
5262 int n;
5264 if (!CONST_INT_P (x)
5265 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5267 output_operand_lossage ("invalid operand for '%%%c'", code);
5268 return;
5271 switch (n)
5273 case 3:
5274 fputc ('b', f);
5275 break;
5276 case 4:
5277 fputc ('h', f);
5278 break;
5279 case 5:
5280 fputc ('w', f);
5281 break;
5282 default:
5283 output_operand_lossage ("invalid operand for '%%%c'", code);
5284 return;
5287 break;
5289 case 'p':
5291 int n;
5293 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5295 output_operand_lossage ("invalid operand for '%%%c'", code);
5296 return;
5299 asm_fprintf (f, "%d", n);
5301 break;
5303 case 'P':
5304 if (!CONST_INT_P (x))
5306 output_operand_lossage ("invalid operand for '%%%c'", code);
5307 return;
5310 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5311 break;
5313 case 'H':
5314 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5316 output_operand_lossage ("invalid operand for '%%%c'", code);
5317 return;
5320 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5321 break;
5323 case 'M':
5324 case 'm':
5326 int cond_code;
5327 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5328 if (x == const_true_rtx)
5330 if (code == 'M')
5331 fputs ("nv", f);
5332 return;
5335 if (!COMPARISON_P (x))
5337 output_operand_lossage ("invalid operand for '%%%c'", code);
5338 return;
5341 cond_code = aarch64_get_condition_code (x);
5342 gcc_assert (cond_code >= 0);
5343 if (code == 'M')
5344 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5345 fputs (aarch64_condition_codes[cond_code], f);
5347 break;
5349 case 'b':
5350 case 'h':
5351 case 's':
5352 case 'd':
5353 case 'q':
5354 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5356 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5357 return;
5359 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5360 break;
5362 case 'S':
5363 case 'T':
5364 case 'U':
5365 case 'V':
5366 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5368 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5369 return;
5371 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5372 break;
5374 case 'R':
5375 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5377 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5378 return;
5380 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5381 break;
5383 case 'X':
5384 if (!CONST_INT_P (x))
5386 output_operand_lossage ("invalid operand for '%%%c'", code);
5387 return;
5389 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5390 break;
5392 case 'w':
5393 case 'x':
5394 if (x == const0_rtx
5395 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5397 asm_fprintf (f, "%czr", code);
5398 break;
5401 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5403 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5404 break;
5407 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5409 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5410 break;
5413 /* Fall through */
5415 case 0:
5416 if (x == NULL)
5418 output_operand_lossage ("missing operand");
5419 return;
5422 switch (GET_CODE (x))
5424 case REG:
5425 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5426 break;
5428 case MEM:
5429 output_address (GET_MODE (x), XEXP (x, 0));
5430 /* Check all memory references are Pmode - even with ILP32. */
5431 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5432 break;
5434 case CONST:
5435 case LABEL_REF:
5436 case SYMBOL_REF:
5437 output_addr_const (asm_out_file, x);
5438 break;
5440 case CONST_INT:
5441 asm_fprintf (f, "%wd", INTVAL (x));
5442 break;
5444 case CONST_VECTOR:
5445 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5447 gcc_assert (
5448 aarch64_const_vec_all_same_in_range_p (x,
5449 HOST_WIDE_INT_MIN,
5450 HOST_WIDE_INT_MAX));
5451 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5453 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5455 fputc ('0', f);
5457 else
5458 gcc_unreachable ();
5459 break;
5461 case CONST_DOUBLE:
5462 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5463 be getting CONST_DOUBLEs holding integers. */
5464 gcc_assert (GET_MODE (x) != VOIDmode);
5465 if (aarch64_float_const_zero_rtx_p (x))
5467 fputc ('0', f);
5468 break;
5470 else if (aarch64_float_const_representable_p (x))
5472 #define buf_size 20
5473 char float_buf[buf_size] = {'\0'};
5474 real_to_decimal_for_mode (float_buf,
5475 CONST_DOUBLE_REAL_VALUE (x),
5476 buf_size, buf_size,
5477 1, GET_MODE (x));
5478 asm_fprintf (asm_out_file, "%s", float_buf);
5479 break;
5480 #undef buf_size
5482 output_operand_lossage ("invalid constant");
5483 return;
5484 default:
5485 output_operand_lossage ("invalid operand");
5486 return;
5488 break;
5490 case 'A':
5491 if (GET_CODE (x) == HIGH)
5492 x = XEXP (x, 0);
5494 switch (aarch64_classify_symbolic_expression (x))
5496 case SYMBOL_SMALL_GOT_4G:
5497 asm_fprintf (asm_out_file, ":got:");
5498 break;
5500 case SYMBOL_SMALL_TLSGD:
5501 asm_fprintf (asm_out_file, ":tlsgd:");
5502 break;
5504 case SYMBOL_SMALL_TLSDESC:
5505 asm_fprintf (asm_out_file, ":tlsdesc:");
5506 break;
5508 case SYMBOL_SMALL_TLSIE:
5509 asm_fprintf (asm_out_file, ":gottprel:");
5510 break;
5512 case SYMBOL_TLSLE24:
5513 asm_fprintf (asm_out_file, ":tprel:");
5514 break;
5516 case SYMBOL_TINY_GOT:
5517 gcc_unreachable ();
5518 break;
5520 default:
5521 break;
5523 output_addr_const (asm_out_file, x);
5524 break;
5526 case 'L':
5527 switch (aarch64_classify_symbolic_expression (x))
5529 case SYMBOL_SMALL_GOT_4G:
5530 asm_fprintf (asm_out_file, ":lo12:");
5531 break;
5533 case SYMBOL_SMALL_TLSGD:
5534 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5535 break;
5537 case SYMBOL_SMALL_TLSDESC:
5538 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5539 break;
5541 case SYMBOL_SMALL_TLSIE:
5542 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5543 break;
5545 case SYMBOL_TLSLE12:
5546 asm_fprintf (asm_out_file, ":tprel_lo12:");
5547 break;
5549 case SYMBOL_TLSLE24:
5550 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5551 break;
5553 case SYMBOL_TINY_GOT:
5554 asm_fprintf (asm_out_file, ":got:");
5555 break;
5557 case SYMBOL_TINY_TLSIE:
5558 asm_fprintf (asm_out_file, ":gottprel:");
5559 break;
5561 default:
5562 break;
5564 output_addr_const (asm_out_file, x);
5565 break;
5567 case 'G':
5568 switch (aarch64_classify_symbolic_expression (x))
5570 case SYMBOL_TLSLE24:
5571 asm_fprintf (asm_out_file, ":tprel_hi12:");
5572 break;
5573 default:
5574 break;
5576 output_addr_const (asm_out_file, x);
5577 break;
5579 case 'k':
5581 HOST_WIDE_INT cond_code;
5583 if (!CONST_INT_P (x))
5585 output_operand_lossage ("invalid operand for '%%%c'", code);
5586 return;
5589 cond_code = INTVAL (x);
5590 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5591 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5593 break;
5595 default:
5596 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5597 return;
5601 static void
5602 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5604 struct aarch64_address_info addr;
5606 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5607 switch (addr.type)
5609 case ADDRESS_REG_IMM:
5610 if (addr.offset == const0_rtx)
5611 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5612 else
5613 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5614 INTVAL (addr.offset));
5615 return;
5617 case ADDRESS_REG_REG:
5618 if (addr.shift == 0)
5619 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5620 reg_names [REGNO (addr.offset)]);
5621 else
5622 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5623 reg_names [REGNO (addr.offset)], addr.shift);
5624 return;
5626 case ADDRESS_REG_UXTW:
5627 if (addr.shift == 0)
5628 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5629 REGNO (addr.offset) - R0_REGNUM);
5630 else
5631 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5632 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5633 return;
5635 case ADDRESS_REG_SXTW:
5636 if (addr.shift == 0)
5637 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5638 REGNO (addr.offset) - R0_REGNUM);
5639 else
5640 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5641 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5642 return;
5644 case ADDRESS_REG_WB:
5645 switch (GET_CODE (x))
5647 case PRE_INC:
5648 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5649 GET_MODE_SIZE (mode));
5650 return;
5651 case POST_INC:
5652 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5653 GET_MODE_SIZE (mode));
5654 return;
5655 case PRE_DEC:
5656 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5657 GET_MODE_SIZE (mode));
5658 return;
5659 case POST_DEC:
5660 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5661 GET_MODE_SIZE (mode));
5662 return;
5663 case PRE_MODIFY:
5664 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5665 INTVAL (addr.offset));
5666 return;
5667 case POST_MODIFY:
5668 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5669 INTVAL (addr.offset));
5670 return;
5671 default:
5672 break;
5674 break;
5676 case ADDRESS_LO_SUM:
5677 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5678 output_addr_const (f, addr.offset);
5679 asm_fprintf (f, "]");
5680 return;
5682 case ADDRESS_SYMBOLIC:
5683 break;
5686 output_addr_const (f, x);
5689 bool
5690 aarch64_label_mentioned_p (rtx x)
5692 const char *fmt;
5693 int i;
5695 if (GET_CODE (x) == LABEL_REF)
5696 return true;
5698 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5699 referencing instruction, but they are constant offsets, not
5700 symbols. */
5701 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5702 return false;
5704 fmt = GET_RTX_FORMAT (GET_CODE (x));
5705 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5707 if (fmt[i] == 'E')
5709 int j;
5711 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5712 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5713 return 1;
5715 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5716 return 1;
5719 return 0;
5722 /* Implement REGNO_REG_CLASS. */
5724 enum reg_class
5725 aarch64_regno_regclass (unsigned regno)
5727 if (GP_REGNUM_P (regno))
5728 return GENERAL_REGS;
5730 if (regno == SP_REGNUM)
5731 return STACK_REG;
5733 if (regno == FRAME_POINTER_REGNUM
5734 || regno == ARG_POINTER_REGNUM)
5735 return POINTER_REGS;
5737 if (FP_REGNUM_P (regno))
5738 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5740 return NO_REGS;
5743 static rtx
5744 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5746 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5747 where mask is selected by alignment and size of the offset.
5748 We try to pick as large a range for the offset as possible to
5749 maximize the chance of a CSE. However, for aligned addresses
5750 we limit the range to 4k so that structures with different-sized
5751 elements are likely to use the same base. We need to be careful
5752 not to split a CONST for some forms of address expression, otherwise
5753 it will generate sub-optimal code. */
5755 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5757 rtx base = XEXP (x, 0);
5758 rtx offset_rtx = XEXP (x, 1);
5759 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5761 if (GET_CODE (base) == PLUS)
5763 rtx op0 = XEXP (base, 0);
5764 rtx op1 = XEXP (base, 1);
5766 /* Force any scaling into a temp for CSE. */
5767 op0 = force_reg (Pmode, op0);
5768 op1 = force_reg (Pmode, op1);
5770 /* Let the pointer register be in op0. */
5771 if (REG_POINTER (op1))
5772 std::swap (op0, op1);
5774 /* If the pointer is virtual or frame related, then we know that
5775 virtual register instantiation or register elimination is going
5776 to apply a second constant. We want the two constants folded
5777 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5778 if (virt_or_elim_regno_p (REGNO (op0)))
5780 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5781 NULL_RTX, true, OPTAB_DIRECT);
5782 return gen_rtx_PLUS (Pmode, base, op1);
5785 /* Otherwise, in order to encourage CSE (and thence loop strength
5786 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5787 base = expand_binop (Pmode, add_optab, op0, op1,
5788 NULL_RTX, true, OPTAB_DIRECT);
5789 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5792 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5793 HOST_WIDE_INT base_offset;
5794 if (GET_MODE_SIZE (mode) > 16)
5795 base_offset = (offset + 0x400) & ~0x7f0;
5796 /* For offsets that aren't a multiple of the access size, the limit is
5797 -256...255. */
5798 else if (offset & (GET_MODE_SIZE (mode) - 1))
5800 base_offset = (offset + 0x100) & ~0x1ff;
5802 /* BLKmode typically uses LDP of X-registers. */
5803 if (mode == BLKmode)
5804 base_offset = (offset + 512) & ~0x3ff;
5806 /* Small negative offsets are supported. */
5807 else if (IN_RANGE (offset, -256, 0))
5808 base_offset = 0;
5809 else if (mode == TImode || mode == TFmode)
5810 base_offset = (offset + 0x100) & ~0x1ff;
5811 /* Use a 12-bit offset scaled by the access size. */
5812 else
5813 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5815 if (base_offset != 0)
5817 base = plus_constant (Pmode, base, base_offset);
5818 base = force_operand (base, NULL_RTX);
5819 return plus_constant (Pmode, base, offset - base_offset);
5823 return x;
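/* Worked example (illustrative): legitimizing (plus (reg X) (const_int
   0x10008)) for a DFmode access takes the final branch above, giving
     base_offset = 0x10008 & (~0xfff * 8) = 0x10000
   so the address becomes
     tmp = X + 0x10000
     [tmp, 8]
   and any double within X + 0x10000 .. X + 0x17ff8 can reuse TMP as its
   base, improving CSE of the anchor.  */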
5826 /* Return the reload icode required for a constant pool in mode. */
5827 static enum insn_code
5828 aarch64_constant_pool_reload_icode (machine_mode mode)
5830 switch (mode)
5832 case E_SFmode:
5833 return CODE_FOR_aarch64_reload_movcpsfdi;
5835 case E_DFmode:
5836 return CODE_FOR_aarch64_reload_movcpdfdi;
5838 case E_TFmode:
5839 return CODE_FOR_aarch64_reload_movcptfdi;
5841 case E_V8QImode:
5842 return CODE_FOR_aarch64_reload_movcpv8qidi;
5844 case E_V16QImode:
5845 return CODE_FOR_aarch64_reload_movcpv16qidi;
5847 case E_V4HImode:
5848 return CODE_FOR_aarch64_reload_movcpv4hidi;
5850 case E_V8HImode:
5851 return CODE_FOR_aarch64_reload_movcpv8hidi;
5853 case E_V2SImode:
5854 return CODE_FOR_aarch64_reload_movcpv2sidi;
5856 case E_V4SImode:
5857 return CODE_FOR_aarch64_reload_movcpv4sidi;
5859 case E_V2DImode:
5860 return CODE_FOR_aarch64_reload_movcpv2didi;
5862 case E_V2DFmode:
5863 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5865 default:
5866 gcc_unreachable ();
5869 gcc_unreachable ();
5871 static reg_class_t
5872 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5873 reg_class_t rclass,
5874 machine_mode mode,
5875 secondary_reload_info *sri)
5878 /* If we have to disable direct literal pool loads and stores because the
5879 function is too big, then we need a scratch register. */
5880 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5881 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5882 || targetm.vector_mode_supported_p (GET_MODE (x)))
5883 && !aarch64_pcrelative_literal_loads)
5885 sri->icode = aarch64_constant_pool_reload_icode (mode);
5886 return NO_REGS;
5889 /* Without the TARGET_SIMD instructions we cannot move a Q register
5890 to a Q register directly. We need a scratch. */
5891 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5892 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5893 && reg_class_subset_p (rclass, FP_REGS))
5895 if (mode == TFmode)
5896 sri->icode = CODE_FOR_aarch64_reload_movtf;
5897 else if (mode == TImode)
5898 sri->icode = CODE_FOR_aarch64_reload_movti;
5899 return NO_REGS;
5902 /* A TFmode or TImode memory access should be handled via an FP_REG
5903 because AArch64 has richer addressing modes for LDR/STR instructions
5904 than LDP/STP instructions. */
5905 if (TARGET_FLOAT && rclass == GENERAL_REGS
5906 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5907 return FP_REGS;
5909 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5910 return GENERAL_REGS;
5912 return NO_REGS;
5915 static bool
5916 aarch64_can_eliminate (const int from, const int to)
5918 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5919 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5921 if (frame_pointer_needed)
5923 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5924 return true;
5925 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5926 return false;
5927 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5928 && !cfun->calls_alloca)
5929 return true;
5930 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5931 return true;
5933 return false;
5936 return true;
5939 HOST_WIDE_INT
5940 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5942 aarch64_layout_frame ();
5944 if (to == HARD_FRAME_POINTER_REGNUM)
5946 if (from == ARG_POINTER_REGNUM)
5947 return cfun->machine->frame.hard_fp_offset;
5949 if (from == FRAME_POINTER_REGNUM)
5950 return cfun->machine->frame.hard_fp_offset
5951 - cfun->machine->frame.locals_offset;
5954 if (to == STACK_POINTER_REGNUM)
5956 if (from == FRAME_POINTER_REGNUM)
5957 return cfun->machine->frame.frame_size
5958 - cfun->machine->frame.locals_offset;
5961 return cfun->machine->frame.frame_size;
5964 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5965 previous frame. */
5968 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5970 if (count != 0)
5971 return const0_rtx;
5972 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5976 static void
5977 aarch64_asm_trampoline_template (FILE *f)
5979 if (TARGET_ILP32)
5981 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5982 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5984 else
5986 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5987 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5989 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5990 assemble_aligned_integer (4, const0_rtx);
5991 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5992 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
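/* A rough sketch of the resulting trampoline layout in the LP64 case,
   assuming IP1 is x17 and the static chain register is x18 (the exact
   register numbers depend on the target headers):

     0:  ldr   x17, .+16      ; load the target function address
     4:  ldr   x18, .+20      ; load the static chain value
     8:  br    x17
     12: .word  0             ; padding up to 16 bytes of code
     16: .dword <fnaddr>      ; filled in by aarch64_trampoline_init
     24: .dword <chain>       ; likewise

   The two literal slots are the ones written below at offsets
   tramp_code_sz and tramp_code_sz + POINTER_BYTES.  */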
5995 static void
5996 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5998 rtx fnaddr, mem, a_tramp;
5999 const int tramp_code_sz = 16;
6001 /* We don't need to copy the trailing D-words; we fill those in below. */
6002 emit_block_move (m_tramp, assemble_trampoline_template (),
6003 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
6004 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
6005 fnaddr = XEXP (DECL_RTL (fndecl), 0);
6006 if (GET_MODE (fnaddr) != ptr_mode)
6007 fnaddr = convert_memory_address (ptr_mode, fnaddr);
6008 emit_move_insn (mem, fnaddr);
6010 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
6011 emit_move_insn (mem, chain_value);
6013 /* XXX We should really define a "clear_cache" pattern and use
6014 gen_clear_cache(). */
6015 a_tramp = XEXP (m_tramp, 0);
6016 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
6017 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6018 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6019 ptr_mode);
6022 static unsigned char
6023 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6025 switch (regclass)
6027 case CALLER_SAVE_REGS:
6028 case POINTER_REGS:
6029 case GENERAL_REGS:
6030 case ALL_REGS:
6031 case POINTER_AND_FP_REGS:
6032 case FP_REGS:
6033 case FP_LO_REGS:
6034 return
6035 aarch64_vector_mode_p (mode)
6036 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6037 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6038 case STACK_REG:
6039 return 1;
6041 case NO_REGS:
6042 return 0;
6044 default:
6045 break;
6047 gcc_unreachable ();
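/* Worked examples for aarch64_class_max_nregs above (illustrative only,
   assuming UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8 as on AArch64):
   V4SImode in FP_REGS is a 16-byte vector mode, so it needs one
   V-register; TImode in GENERAL_REGS is not a vector mode, so its 16
   bytes need two X-registers; any mode in STACK_REG needs exactly one
   register (SP).  */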
6050 static reg_class_t
6051 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6053 if (regclass == POINTER_REGS)
6054 return GENERAL_REGS;
6056 if (regclass == STACK_REG)
6058 if (REG_P(x)
6059 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6060 return regclass;
6062 return NO_REGS;
6065 /* Register elimination can result in a request for
6066 SP+constant->FP_REGS. We cannot support such operations, which
6067 use SP as the source and an FP_REG as the destination, so reject
6068 them outright. */
6069 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6071 rtx lhs = XEXP (x, 0);
6073 /* Look through a possible SUBREG introduced by ILP32. */
6074 if (GET_CODE (lhs) == SUBREG)
6075 lhs = SUBREG_REG (lhs);
6077 gcc_assert (REG_P (lhs));
6078 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6079 POINTER_REGS));
6080 return NO_REGS;
6083 return regclass;
6086 void
6087 aarch64_asm_output_labelref (FILE* f, const char *name)
6089 asm_fprintf (f, "%U%s", name);
6092 static void
6093 aarch64_elf_asm_constructor (rtx symbol, int priority)
6095 if (priority == DEFAULT_INIT_PRIORITY)
6096 default_ctor_section_asm_out_constructor (symbol, priority);
6097 else
6099 section *s;
6100 /* The priority is known to be in the range [0, 65535], so 18 bytes
6101 would be enough, but the compiler might not know that. To avoid
6102 a -Wformat-truncation false positive, use a larger size. */
6103 char buf[23];
6104 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6105 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6106 switch_to_section (s);
6107 assemble_align (POINTER_SIZE);
6108 assemble_aligned_integer (POINTER_BYTES, symbol);
6112 static void
6113 aarch64_elf_asm_destructor (rtx symbol, int priority)
6115 if (priority == DEFAULT_INIT_PRIORITY)
6116 default_dtor_section_asm_out_destructor (symbol, priority);
6117 else
6119 section *s;
6120 /* The priority is known to be in the range [0, 65535], so 18 bytes
6121 would be enough, but the compiler might not know that. To avoid
6122 a -Wformat-truncation false positive, use a larger size. */
6123 char buf[23];
6124 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6125 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6126 switch_to_section (s);
6127 assemble_align (POINTER_SIZE);
6128 assemble_aligned_integer (POINTER_BYTES, symbol);
6132 const char*
6133 aarch64_output_casesi (rtx *operands)
6135 char buf[100];
6136 char label[100];
6137 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6138 int index;
6139 static const char *const patterns[4][2] =
6142 "ldrb\t%w3, [%0,%w1,uxtw]",
6143 "add\t%3, %4, %w3, sxtb #2"
6146 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6147 "add\t%3, %4, %w3, sxth #2"
6150 "ldr\t%w3, [%0,%w1,uxtw #2]",
6151 "add\t%3, %4, %w3, sxtw #2"
6153 /* We assume that DImode is only generated when not optimizing and
6154 that we don't really need 64-bit address offsets. That would
6155 imply an object file with 8GB of code in a single function! */
6157 "ldr\t%w3, [%0,%w1,uxtw #2]",
6158 "add\t%3, %4, %w3, sxtw #2"
6162 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6164 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6165 index = exact_log2 (GET_MODE_SIZE (mode));
6167 gcc_assert (index >= 0 && index <= 3);
6169 /* Need to implement table size reduction, by changing the code below. */
6170 output_asm_insn (patterns[index][0], operands);
6171 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6172 snprintf (buf, sizeof (buf),
6173 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6174 output_asm_insn (buf, operands);
6175 output_asm_insn (patterns[index][1], operands);
6176 output_asm_insn ("br\t%3", operands);
6177 assemble_label (asm_out_file, label);
6178 return "";
6182 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6183 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6184 operator. */
6187 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6189 if (shift >= 0 && shift <= 3)
6191 int size;
6192 for (size = 8; size <= 32; size *= 2)
6194 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6195 if (mask == bits << shift)
6196 return size;
6199 return 0;
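/* Illustrative uses of aarch64_uxt_size (not from the original source):
   aarch64_uxt_size (2, 0x3fc) returns 8, since 0x3fc == 0xff << 2, i.e.
   a byte zero-extended and shifted left by 2 (UXTB with LSL #2);
   aarch64_uxt_size (1, 0x1fffe) returns 16 (UXTH, LSL #1); a mask such
   as 0xff0 with shift 0 matches no extend width and yields 0.  */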
6202 /* Constant pools are per function only when PC relative
6203 literal loads are true or we are in the large memory
6204 model. */
6206 static inline bool
6207 aarch64_can_use_per_function_literal_pools_p (void)
6209 return (aarch64_pcrelative_literal_loads
6210 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6213 static bool
6214 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6216 /* FIXME: In an ideal world this would work similarly
6217 to the logic in aarch64_select_rtx_section, but this
6218 breaks bootstrap in gccgo. For now we work around
6219 this by returning false here. */
6220 return false;
6223 /* Select appropriate section for constants depending
6224 on where we place literal pools. */
6226 static section *
6227 aarch64_select_rtx_section (machine_mode mode,
6228 rtx x,
6229 unsigned HOST_WIDE_INT align)
6231 if (aarch64_can_use_per_function_literal_pools_p ())
6232 return function_section (current_function_decl);
6234 return default_elf_select_rtx_section (mode, x, align);
6237 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6238 void
6239 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6240 HOST_WIDE_INT offset)
6242 /* When using per-function literal pools, we must ensure that any code
6243 section is aligned to the minimal instruction length, lest we get
6244 errors from the assembler re "unaligned instructions". */
6245 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6246 ASM_OUTPUT_ALIGN (f, 2);
6249 /* Costs. */
6251 /* Helper function for rtx cost calculation. Strip a shift expression
6252 from X. Returns the inner operand if successful, or the original
6253 expression on failure. */
6254 static rtx
6255 aarch64_strip_shift (rtx x)
6257 rtx op = x;
6259 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6260 we can convert both to ROR during final output. */
6261 if ((GET_CODE (op) == ASHIFT
6262 || GET_CODE (op) == ASHIFTRT
6263 || GET_CODE (op) == LSHIFTRT
6264 || GET_CODE (op) == ROTATERT
6265 || GET_CODE (op) == ROTATE)
6266 && CONST_INT_P (XEXP (op, 1)))
6267 return XEXP (op, 0);
6269 if (GET_CODE (op) == MULT
6270 && CONST_INT_P (XEXP (op, 1))
6271 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6272 return XEXP (op, 0);
6274 return x;
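/* Examples (illustrative): aarch64_strip_shift on
   (ashift (reg x1) (const_int 3)) or on (mult (reg x1) (const_int 8))
   returns (reg x1), since a multiply by a power of two is costed as a
   shift; (mult (reg x1) (reg x2)) is returned unchanged.  */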
6277 /* Helper function for rtx cost calculation. Strip an extend
6278 expression from X. Returns the inner operand if successful, or the
6279 original expression on failure. We deal with a number of possible
6280 canonicalization variations here. If STRIP_SHIFT is true, then
6281 we can strip off a shift also. */
6282 static rtx
6283 aarch64_strip_extend (rtx x, bool strip_shift)
6285 scalar_int_mode mode;
6286 rtx op = x;
6288 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6289 return op;
6291 /* Zero and sign extraction of a widened value. */
6292 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6293 && XEXP (op, 2) == const0_rtx
6294 && GET_CODE (XEXP (op, 0)) == MULT
6295 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6296 XEXP (op, 1)))
6297 return XEXP (XEXP (op, 0), 0);
6299 /* It can also be represented (for zero-extend) as an AND with an
6300 immediate. */
6301 if (GET_CODE (op) == AND
6302 && GET_CODE (XEXP (op, 0)) == MULT
6303 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6304 && CONST_INT_P (XEXP (op, 1))
6305 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6306 INTVAL (XEXP (op, 1))) != 0)
6307 return XEXP (XEXP (op, 0), 0);
6309 /* Now handle extended register, as this may also have an optional
6310 left shift by 1..4. */
6311 if (strip_shift
6312 && GET_CODE (op) == ASHIFT
6313 && CONST_INT_P (XEXP (op, 1))
6314 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6315 op = XEXP (op, 0);
6317 if (GET_CODE (op) == ZERO_EXTEND
6318 || GET_CODE (op) == SIGN_EXTEND)
6319 op = XEXP (op, 0);
6321 if (op != x)
6322 return op;
6324 return x;
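/* Examples (illustrative): with STRIP_SHIFT true,
   (ashift (zero_extend (reg w1)) (const_int 2)) strips down to the
   inner register, matching an extended-register operand such as
   "uxtw #2"; the AND form
   (and (mult (reg x1) (const_int 4)) (const_int 0x3fc)) also strips to
   (reg x1), because aarch64_uxt_size recognises the mask as a byte
   zero-extend scaled by 4.  */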
6327 /* Return true iff CODE is a shift supported in combination
6328 with arithmetic instructions. */
6330 static bool
6331 aarch64_shift_p (enum rtx_code code)
6333 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6337 /* Return true iff X is a cheap shift without a sign extend. */
6339 static bool
6340 aarch64_cheap_mult_shift_p (rtx x)
6342 rtx op0, op1;
6344 op0 = XEXP (x, 0);
6345 op1 = XEXP (x, 1);
6347 if (!(aarch64_tune_params.extra_tuning_flags
6348 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6349 return false;
6351 if (GET_CODE (op0) == SIGN_EXTEND)
6352 return false;
6354 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6355 && UINTVAL (op1) <= 4)
6356 return true;
6358 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6359 return false;
6361 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6363 if (l2 > 0 && l2 <= 4)
6364 return true;
6366 return false;
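/* Examples (illustrative): when the tuning flags include
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, (ashift (reg x1) (const_int 3))
   and (mult (reg x1) (const_int 16)) are considered cheap (shift amounts
   3 and log2(16) == 4 lie within 1..4), whereas
   (mult (reg x1) (const_int 32)) is not, and neither is any form whose
   first operand is a SIGN_EXTEND.  */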
6369 /* Helper function for rtx cost calculation. Calculate the cost of
6370 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6371 Return the calculated cost of the expression, recursing manually in to
6372 operands where needed. */
6374 static int
6375 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6377 rtx op0, op1;
6378 const struct cpu_cost_table *extra_cost
6379 = aarch64_tune_params.insn_extra_cost;
6380 int cost = 0;
6381 bool compound_p = (outer == PLUS || outer == MINUS);
6382 machine_mode mode = GET_MODE (x);
6384 gcc_checking_assert (code == MULT);
6386 op0 = XEXP (x, 0);
6387 op1 = XEXP (x, 1);
6389 if (VECTOR_MODE_P (mode))
6390 mode = GET_MODE_INNER (mode);
6392 /* Integer multiply/fma. */
6393 if (GET_MODE_CLASS (mode) == MODE_INT)
6395 /* The multiply will be canonicalized as a shift, cost it as such. */
6396 if (aarch64_shift_p (GET_CODE (x))
6397 || (CONST_INT_P (op1)
6398 && exact_log2 (INTVAL (op1)) > 0))
6400 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6401 || GET_CODE (op0) == SIGN_EXTEND;
6402 if (speed)
6404 if (compound_p)
6406 /* If the shift is considered cheap,
6407 then don't add any cost. */
6408 if (aarch64_cheap_mult_shift_p (x))
6410 else if (REG_P (op1))
6411 /* ARITH + shift-by-register. */
6412 cost += extra_cost->alu.arith_shift_reg;
6413 else if (is_extend)
6414 /* ARITH + extended register. We don't have a cost field
6415 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6416 cost += extra_cost->alu.extend_arith;
6417 else
6418 /* ARITH + shift-by-immediate. */
6419 cost += extra_cost->alu.arith_shift;
6421 else
6422 /* LSL (immediate). */
6423 cost += extra_cost->alu.shift;
6426 /* Strip extends as we will have costed them in the case above. */
6427 if (is_extend)
6428 op0 = aarch64_strip_extend (op0, true);
6430 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6432 return cost;
6435 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6436 compound, and let the cases below handle it. After all, MNEG is a
6437 special-case alias of MSUB. */
6438 if (GET_CODE (op0) == NEG)
6440 op0 = XEXP (op0, 0);
6441 compound_p = true;
6444 /* Integer multiplies or FMAs have zero/sign extending variants. */
6445 if ((GET_CODE (op0) == ZERO_EXTEND
6446 && GET_CODE (op1) == ZERO_EXTEND)
6447 || (GET_CODE (op0) == SIGN_EXTEND
6448 && GET_CODE (op1) == SIGN_EXTEND))
6450 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6451 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6453 if (speed)
6455 if (compound_p)
6456 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6457 cost += extra_cost->mult[0].extend_add;
6458 else
6459 /* MUL/SMULL/UMULL. */
6460 cost += extra_cost->mult[0].extend;
6463 return cost;
6466 /* This is either an integer multiply or a MADD. In both cases
6467 we want to recurse and cost the operands. */
6468 cost += rtx_cost (op0, mode, MULT, 0, speed);
6469 cost += rtx_cost (op1, mode, MULT, 1, speed);
6471 if (speed)
6473 if (compound_p)
6474 /* MADD/MSUB. */
6475 cost += extra_cost->mult[mode == DImode].add;
6476 else
6477 /* MUL. */
6478 cost += extra_cost->mult[mode == DImode].simple;
6481 return cost;
6483 else
6485 if (speed)
6487 /* Floating-point FMA/FMUL can also support negations of the
6488 operands, unless the rounding mode is upward or downward, in
6489 which case FNMUL is different from FMUL with operand negation. */
6490 bool neg0 = GET_CODE (op0) == NEG;
6491 bool neg1 = GET_CODE (op1) == NEG;
6492 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6494 if (neg0)
6495 op0 = XEXP (op0, 0);
6496 if (neg1)
6497 op1 = XEXP (op1, 0);
6500 if (compound_p)
6501 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6502 cost += extra_cost->fp[mode == DFmode].fma;
6503 else
6504 /* FMUL/FNMUL. */
6505 cost += extra_cost->fp[mode == DFmode].mult;
6508 cost += rtx_cost (op0, mode, MULT, 0, speed);
6509 cost += rtx_cost (op1, mode, MULT, 1, speed);
6510 return cost;
6514 static int
6515 aarch64_address_cost (rtx x,
6516 machine_mode mode,
6517 addr_space_t as ATTRIBUTE_UNUSED,
6518 bool speed)
6520 enum rtx_code c = GET_CODE (x);
6521 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6522 struct aarch64_address_info info;
6523 int cost = 0;
6524 info.shift = 0;
6526 if (!aarch64_classify_address (&info, x, mode, c, false))
6528 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6530 /* This is a CONST or SYMBOL ref which will be split
6531 in a different way depending on the code model in use.
6532 Cost it through the generic infrastructure. */
6533 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6534 /* Divide through by the cost of one instruction to
6535 bring it to the same units as the address costs. */
6536 cost_symbol_ref /= COSTS_N_INSNS (1);
6537 /* The cost is then the cost of preparing the address,
6538 followed by an immediate (possibly 0) offset. */
6539 return cost_symbol_ref + addr_cost->imm_offset;
6541 else
6543 /* This is most likely a jump table from a case
6544 statement. */
6545 return addr_cost->register_offset;
6549 switch (info.type)
6551 case ADDRESS_LO_SUM:
6552 case ADDRESS_SYMBOLIC:
6553 case ADDRESS_REG_IMM:
6554 cost += addr_cost->imm_offset;
6555 break;
6557 case ADDRESS_REG_WB:
6558 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6559 cost += addr_cost->pre_modify;
6560 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6561 cost += addr_cost->post_modify;
6562 else
6563 gcc_unreachable ();
6565 break;
6567 case ADDRESS_REG_REG:
6568 cost += addr_cost->register_offset;
6569 break;
6571 case ADDRESS_REG_SXTW:
6572 cost += addr_cost->register_sextend;
6573 break;
6575 case ADDRESS_REG_UXTW:
6576 cost += addr_cost->register_zextend;
6577 break;
6579 default:
6580 gcc_unreachable ();
6584 if (info.shift > 0)
6586 /* For the sake of calculating the cost of the shifted register
6587 component, we can treat same sized modes in the same way. */
6588 switch (GET_MODE_BITSIZE (mode))
6590 case 16:
6591 cost += addr_cost->addr_scale_costs.hi;
6592 break;
6594 case 32:
6595 cost += addr_cost->addr_scale_costs.si;
6596 break;
6598 case 64:
6599 cost += addr_cost->addr_scale_costs.di;
6600 break;
6602 /* We can't tell, or this is a 128-bit vector. */
6603 default:
6604 cost += addr_cost->addr_scale_costs.ti;
6605 break;
6609 return cost;
6612 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6613 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6614 to be taken. */
6617 aarch64_branch_cost (bool speed_p, bool predictable_p)
6619 /* When optimizing for speed, use the cost of unpredictable branches. */
6620 const struct cpu_branch_cost *branch_costs =
6621 aarch64_tune_params.branch_costs;
6623 if (!speed_p || predictable_p)
6624 return branch_costs->predictable;
6625 else
6626 return branch_costs->unpredictable;
6629 /* Return true if the RTX X in mode MODE is a zero or sign extract
6630 usable in an ADD or SUB (extended register) instruction. */
6631 static bool
6632 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6634 /* Catch add with a sign extract.
6635 This is add_<optab><mode>_multp2. */
6636 if (GET_CODE (x) == SIGN_EXTRACT
6637 || GET_CODE (x) == ZERO_EXTRACT)
6639 rtx op0 = XEXP (x, 0);
6640 rtx op1 = XEXP (x, 1);
6641 rtx op2 = XEXP (x, 2);
6643 if (GET_CODE (op0) == MULT
6644 && CONST_INT_P (op1)
6645 && op2 == const0_rtx
6646 && CONST_INT_P (XEXP (op0, 1))
6647 && aarch64_is_extend_from_extract (mode,
6648 XEXP (op0, 1),
6649 op1))
6651 return true;
6654 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6655 No shift. */
6656 else if (GET_CODE (x) == SIGN_EXTEND
6657 || GET_CODE (x) == ZERO_EXTEND)
6658 return REG_P (XEXP (x, 0));
6660 return false;
6663 static bool
6664 aarch64_frint_unspec_p (unsigned int u)
6666 switch (u)
6668 case UNSPEC_FRINTZ:
6669 case UNSPEC_FRINTP:
6670 case UNSPEC_FRINTM:
6671 case UNSPEC_FRINTA:
6672 case UNSPEC_FRINTN:
6673 case UNSPEC_FRINTX:
6674 case UNSPEC_FRINTI:
6675 return true;
6677 default:
6678 return false;
6682 /* Return true iff X is an rtx that will match an extr instruction
6683 i.e. as described in the *extr<mode>5_insn family of patterns.
6684 OP0 and OP1 will be set to the operands of the shifts involved
6685 on success and will be NULL_RTX otherwise. */
6687 static bool
6688 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6690 rtx op0, op1;
6691 scalar_int_mode mode;
6692 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6693 return false;
6695 *res_op0 = NULL_RTX;
6696 *res_op1 = NULL_RTX;
6698 if (GET_CODE (x) != IOR)
6699 return false;
6701 op0 = XEXP (x, 0);
6702 op1 = XEXP (x, 1);
6704 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6705 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6707 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6708 if (GET_CODE (op1) == ASHIFT)
6709 std::swap (op0, op1);
6711 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6712 return false;
6714 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6715 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6717 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6718 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6720 *res_op0 = XEXP (op0, 0);
6721 *res_op1 = XEXP (op1, 0);
6722 return true;
6726 return false;
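/* Example (illustrative): in DImode,
   (ior (ashift (reg x1) (const_int 10)) (lshiftrt (reg x2) (const_int 54)))
   matches because 10 + 54 == 64; *RES_OP0 becomes x1 and *RES_OP1
   becomes x2, corresponding roughly to "extr x0, x1, x2, #54", where
   the immediate is the right-shift amount.  */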
6729 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6730 storing it in *COST. Result is true if the total cost of the operation
6731 has now been calculated. */
6732 static bool
6733 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6735 rtx inner;
6736 rtx comparator;
6737 enum rtx_code cmpcode;
6739 if (COMPARISON_P (op0))
6741 inner = XEXP (op0, 0);
6742 comparator = XEXP (op0, 1);
6743 cmpcode = GET_CODE (op0);
6745 else
6747 inner = op0;
6748 comparator = const0_rtx;
6749 cmpcode = NE;
6752 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6754 /* Conditional branch. */
6755 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6756 return true;
6757 else
6759 if (cmpcode == NE || cmpcode == EQ)
6761 if (comparator == const0_rtx)
6763 /* TBZ/TBNZ/CBZ/CBNZ. */
6764 if (GET_CODE (inner) == ZERO_EXTRACT)
6765 /* TBZ/TBNZ. */
6766 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6767 ZERO_EXTRACT, 0, speed);
6768 else
6769 /* CBZ/CBNZ. */
6770 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6772 return true;
6775 else if (cmpcode == LT || cmpcode == GE)
6777 /* TBZ/TBNZ. */
6778 if (comparator == const0_rtx)
6779 return true;
6783 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6785 /* CCMP. */
6786 if (GET_CODE (op1) == COMPARE)
6788 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6789 if (XEXP (op1, 1) == const0_rtx)
6790 *cost += 1;
6791 if (speed)
6793 machine_mode mode = GET_MODE (XEXP (op1, 0));
6794 const struct cpu_cost_table *extra_cost
6795 = aarch64_tune_params.insn_extra_cost;
6797 if (GET_MODE_CLASS (mode) == MODE_INT)
6798 *cost += extra_cost->alu.arith;
6799 else
6800 *cost += extra_cost->fp[mode == DFmode].compare;
6802 return true;
6805 /* It's a conditional operation based on the status flags,
6806 so it must be some flavor of CSEL. */
6808 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6809 if (GET_CODE (op1) == NEG
6810 || GET_CODE (op1) == NOT
6811 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6812 op1 = XEXP (op1, 0);
6813 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6815 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6816 op1 = XEXP (op1, 0);
6817 op2 = XEXP (op2, 0);
6820 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6821 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6822 return true;
6825 /* We don't know what this is, cost all operands. */
6826 return false;
6829 /* Check whether X is a bitfield operation of the form shift + extend that
6830 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6831 operand to which the bitfield operation is applied. Otherwise return
6832 NULL_RTX. */
6834 static rtx
6835 aarch64_extend_bitfield_pattern_p (rtx x)
6837 rtx_code outer_code = GET_CODE (x);
6838 machine_mode outer_mode = GET_MODE (x);
6840 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6841 && outer_mode != SImode && outer_mode != DImode)
6842 return NULL_RTX;
6844 rtx inner = XEXP (x, 0);
6845 rtx_code inner_code = GET_CODE (inner);
6846 machine_mode inner_mode = GET_MODE (inner);
6847 rtx op = NULL_RTX;
6849 switch (inner_code)
6851 case ASHIFT:
6852 if (CONST_INT_P (XEXP (inner, 1))
6853 && (inner_mode == QImode || inner_mode == HImode))
6854 op = XEXP (inner, 0);
6855 break;
6856 case LSHIFTRT:
6857 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6858 && (inner_mode == QImode || inner_mode == HImode))
6859 op = XEXP (inner, 0);
6860 break;
6861 case ASHIFTRT:
6862 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6863 && (inner_mode == QImode || inner_mode == HImode))
6864 op = XEXP (inner, 0);
6865 break;
6866 default:
6867 break;
6870 return op;
6873 /* Return true if the mask and a shift amount from an RTX of the form
6874 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6875 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6877 bool
6878 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6879 rtx shft_amnt)
6881 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6882 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6883 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6884 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
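/* Example (illustrative): for SImode, MASK == 0xff00 and SHFT_AMNT == 8
   are accepted, since (0xff00 >> 8) + 1 is a power of two and the low 8
   bits of the mask are clear; (x << 8) & 0xff00 then maps to
   "ubfiz w0, w0, #8, #8".  A mask such as 0xff01 is rejected because it
   has bits below the shift amount set.  */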
6887 /* Calculate the cost of calculating X, storing it in *COST. Result
6888 is true if the total cost of the operation has now been calculated. */
6889 static bool
6890 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6891 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6893 rtx op0, op1, op2;
6894 const struct cpu_cost_table *extra_cost
6895 = aarch64_tune_params.insn_extra_cost;
6896 int code = GET_CODE (x);
6897 scalar_int_mode int_mode;
6899 /* By default, assume that everything has equivalent cost to the
6900 cheapest instruction. Any additional costs are applied as a delta
6901 above this default. */
6902 *cost = COSTS_N_INSNS (1);
6904 switch (code)
6906 case SET:
6907 /* The cost depends entirely on the operands to SET. */
6908 *cost = 0;
6909 op0 = SET_DEST (x);
6910 op1 = SET_SRC (x);
6912 switch (GET_CODE (op0))
6914 case MEM:
6915 if (speed)
6917 rtx address = XEXP (op0, 0);
6918 if (VECTOR_MODE_P (mode))
6919 *cost += extra_cost->ldst.storev;
6920 else if (GET_MODE_CLASS (mode) == MODE_INT)
6921 *cost += extra_cost->ldst.store;
6922 else if (mode == SFmode)
6923 *cost += extra_cost->ldst.storef;
6924 else if (mode == DFmode)
6925 *cost += extra_cost->ldst.stored;
6927 *cost +=
6928 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6929 0, speed));
6932 *cost += rtx_cost (op1, mode, SET, 1, speed);
6933 return true;
6935 case SUBREG:
6936 if (! REG_P (SUBREG_REG (op0)))
6937 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6939 /* Fall through. */
6940 case REG:
6941 /* The cost is one per vector-register copied. */
6942 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6944 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
6945 *cost = COSTS_N_INSNS (nregs);
6947 /* const0_rtx is in general free, but we will use an
6948 instruction to set a register to 0. */
6949 else if (REG_P (op1) || op1 == const0_rtx)
6951 /* The cost is 1 per register copied. */
6952 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
6953 *cost = COSTS_N_INSNS (nregs);
6955 else
6956 /* Cost is just the cost of the RHS of the set. */
6957 *cost += rtx_cost (op1, mode, SET, 1, speed);
6958 return true;
6960 case ZERO_EXTRACT:
6961 case SIGN_EXTRACT:
6962 /* Bit-field insertion. Strip any redundant widening of
6963 the RHS to meet the width of the target. */
6964 if (GET_CODE (op1) == SUBREG)
6965 op1 = SUBREG_REG (op1);
6966 if ((GET_CODE (op1) == ZERO_EXTEND
6967 || GET_CODE (op1) == SIGN_EXTEND)
6968 && CONST_INT_P (XEXP (op0, 1))
6969 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6970 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6971 op1 = XEXP (op1, 0);
6973 if (CONST_INT_P (op1))
6975 /* MOV immediate is assumed to always be cheap. */
6976 *cost = COSTS_N_INSNS (1);
6978 else
6980 /* BFM. */
6981 if (speed)
6982 *cost += extra_cost->alu.bfi;
6983 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6986 return true;
6988 default:
6989 /* We can't make sense of this, assume default cost. */
6990 *cost = COSTS_N_INSNS (1);
6991 return false;
6993 return false;
6995 case CONST_INT:
6996 /* If an instruction can incorporate a constant within the
6997 instruction, the instruction's expression avoids calling
6998 rtx_cost() on the constant. If rtx_cost() is called on a
6999 constant, then it is usually because the constant must be
7000 moved into a register by one or more instructions.
7002 The exception is constant 0, which can be expressed
7003 as XZR/WZR and is therefore free. The exception to this is
7004 if we have (set (reg) (const0_rtx)) in which case we must cost
7005 the move. However, we can catch that when we cost the SET, so
7006 we don't need to consider that here. */
7007 if (x == const0_rtx)
7008 *cost = 0;
7009 else
7011 /* To an approximation, the cost of building any other constant is
7012 proportional to the number of instructions
7013 required to build that constant. This is true whether we
7014 are compiling for SPEED or otherwise. */
7015 if (!is_a <scalar_int_mode> (mode, &int_mode))
7016 int_mode = word_mode;
7017 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7018 (NULL_RTX, x, false, int_mode));
7020 return true;
7022 case CONST_DOUBLE:
7024 /* First determine number of instructions to do the move
7025 as an integer constant. */
7026 if (!aarch64_float_const_representable_p (x)
7027 && !aarch64_can_const_movi_rtx_p (x, mode)
7028 && aarch64_float_const_rtx_p (x))
7030 unsigned HOST_WIDE_INT ival;
7031 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7032 gcc_assert (succeed);
7034 scalar_int_mode imode = (mode == HFmode
7035 ? SImode
7036 : int_mode_for_mode (mode).require ());
7037 int ncost = aarch64_internal_mov_immediate
7038 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7039 *cost += COSTS_N_INSNS (ncost);
7040 return true;
7043 if (speed)
7045 /* mov[df,sf]_aarch64. */
7046 if (aarch64_float_const_representable_p (x))
7047 /* FMOV (scalar immediate). */
7048 *cost += extra_cost->fp[mode == DFmode].fpconst;
7049 else if (!aarch64_float_const_zero_rtx_p (x))
7051 /* This will be a load from memory. */
7052 if (mode == DFmode)
7053 *cost += extra_cost->ldst.loadd;
7054 else
7055 *cost += extra_cost->ldst.loadf;
7057 else
7058 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7059 or MOV v0.s[0], wzr - neither of which is modeled by the
7060 cost tables. Just use the default cost. */
7065 return true;
7067 case MEM:
7068 if (speed)
7070 /* For loads we want the base cost of a load, plus an
7071 approximation for the additional cost of the addressing
7072 mode. */
7073 rtx address = XEXP (x, 0);
7074 if (VECTOR_MODE_P (mode))
7075 *cost += extra_cost->ldst.loadv;
7076 else if (GET_MODE_CLASS (mode) == MODE_INT)
7077 *cost += extra_cost->ldst.load;
7078 else if (mode == SFmode)
7079 *cost += extra_cost->ldst.loadf;
7080 else if (mode == DFmode)
7081 *cost += extra_cost->ldst.loadd;
7083 *cost +=
7084 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7085 0, speed));
7088 return true;
7090 case NEG:
7091 op0 = XEXP (x, 0);
7093 if (VECTOR_MODE_P (mode))
7095 if (speed)
7097 /* FNEG. */
7098 *cost += extra_cost->vect.alu;
7100 return false;
7103 if (GET_MODE_CLASS (mode) == MODE_INT)
7105 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7106 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7108 /* CSETM. */
7109 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7110 return true;
7113 /* Cost this as SUB wzr, X. */
7114 op0 = CONST0_RTX (mode);
7115 op1 = XEXP (x, 0);
7116 goto cost_minus;
7119 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7121 /* Support (neg(fma...)) as a single instruction only if
7122 sign of zeros is unimportant. This matches the decision
7123 making in aarch64.md. */
7124 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7126 /* FNMADD. */
7127 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7128 return true;
7130 if (GET_CODE (op0) == MULT)
7132 /* FNMUL. */
7133 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7134 return true;
7136 if (speed)
7137 /* FNEG. */
7138 *cost += extra_cost->fp[mode == DFmode].neg;
7139 return false;
7142 return false;
7144 case CLRSB:
7145 case CLZ:
7146 if (speed)
7148 if (VECTOR_MODE_P (mode))
7149 *cost += extra_cost->vect.alu;
7150 else
7151 *cost += extra_cost->alu.clz;
7154 return false;
7156 case COMPARE:
7157 op0 = XEXP (x, 0);
7158 op1 = XEXP (x, 1);
7160 if (op1 == const0_rtx
7161 && GET_CODE (op0) == AND)
7163 x = op0;
7164 mode = GET_MODE (op0);
7165 goto cost_logic;
7168 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7170 /* TODO: A write to the CC flags possibly costs extra; this
7171 needs encoding in the cost tables. */
7173 mode = GET_MODE (op0);
7174 /* ANDS. */
7175 if (GET_CODE (op0) == AND)
7177 x = op0;
7178 goto cost_logic;
7181 if (GET_CODE (op0) == PLUS)
7183 /* ADDS (and CMN alias). */
7184 x = op0;
7185 goto cost_plus;
7188 if (GET_CODE (op0) == MINUS)
7190 /* SUBS. */
7191 x = op0;
7192 goto cost_minus;
7195 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7196 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7197 && CONST_INT_P (XEXP (op0, 2)))
7199 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7200 Handle it here directly rather than going to cost_logic
7201 since we know the immediate generated for the TST is valid
7202 so we can avoid creating an intermediate rtx for it only
7203 for costing purposes. */
7204 if (speed)
7205 *cost += extra_cost->alu.logical;
7207 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7208 ZERO_EXTRACT, 0, speed);
7209 return true;
7212 if (GET_CODE (op1) == NEG)
7214 /* CMN. */
7215 if (speed)
7216 *cost += extra_cost->alu.arith;
7218 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7219 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7220 return true;
7223 /* CMP.
7225 Compare can freely swap the order of operands, and
7226 canonicalization puts the more complex operation first.
7227 But the integer MINUS logic expects the shift/extend
7228 operation in op1. */
7229 if (! (REG_P (op0)
7230 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7232 op0 = XEXP (x, 1);
7233 op1 = XEXP (x, 0);
7235 goto cost_minus;
7238 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7240 /* FCMP. */
7241 if (speed)
7242 *cost += extra_cost->fp[mode == DFmode].compare;
7244 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7246 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7247 /* FCMP supports constant 0.0 for no extra cost. */
7248 return true;
7250 return false;
7253 if (VECTOR_MODE_P (mode))
7255 /* Vector compare. */
7256 if (speed)
7257 *cost += extra_cost->vect.alu;
7259 if (aarch64_float_const_zero_rtx_p (op1))
7261 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7262 cost. */
7263 return true;
7265 return false;
7267 return false;
7269 case MINUS:
7271 op0 = XEXP (x, 0);
7272 op1 = XEXP (x, 1);
7274 cost_minus:
7275 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7277 /* Detect valid immediates. */
7278 if ((GET_MODE_CLASS (mode) == MODE_INT
7279 || (GET_MODE_CLASS (mode) == MODE_CC
7280 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7281 && CONST_INT_P (op1)
7282 && aarch64_uimm12_shift (INTVAL (op1)))
7284 if (speed)
7285 /* SUB(S) (immediate). */
7286 *cost += extra_cost->alu.arith;
7287 return true;
7290 /* Look for SUB (extended register). */
7291 if (is_a <scalar_int_mode> (mode, &int_mode)
7292 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7294 if (speed)
7295 *cost += extra_cost->alu.extend_arith;
7297 op1 = aarch64_strip_extend (op1, true);
7298 *cost += rtx_cost (op1, VOIDmode,
7299 (enum rtx_code) GET_CODE (op1), 0, speed);
7300 return true;
7303 rtx new_op1 = aarch64_strip_extend (op1, false);
7305 /* Cost this as an FMA-alike operation. */
7306 if ((GET_CODE (new_op1) == MULT
7307 || aarch64_shift_p (GET_CODE (new_op1)))
7308 && code != COMPARE)
7310 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7311 (enum rtx_code) code,
7312 speed);
7313 return true;
7316 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7318 if (speed)
7320 if (VECTOR_MODE_P (mode))
7322 /* Vector SUB. */
7323 *cost += extra_cost->vect.alu;
7325 else if (GET_MODE_CLASS (mode) == MODE_INT)
7327 /* SUB(S). */
7328 *cost += extra_cost->alu.arith;
7330 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7332 /* FSUB. */
7333 *cost += extra_cost->fp[mode == DFmode].addsub;
7336 return true;
7339 case PLUS:
7341 rtx new_op0;
7343 op0 = XEXP (x, 0);
7344 op1 = XEXP (x, 1);
7346 cost_plus:
7347 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7348 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7350 /* CSINC. */
7351 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7352 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7353 return true;
7356 if (GET_MODE_CLASS (mode) == MODE_INT
7357 && CONST_INT_P (op1)
7358 && aarch64_uimm12_shift (INTVAL (op1)))
7360 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7362 if (speed)
7363 /* ADD (immediate). */
7364 *cost += extra_cost->alu.arith;
7365 return true;
7368 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7370 /* Look for ADD (extended register). */
7371 if (is_a <scalar_int_mode> (mode, &int_mode)
7372 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7374 if (speed)
7375 *cost += extra_cost->alu.extend_arith;
7377 op0 = aarch64_strip_extend (op0, true);
7378 *cost += rtx_cost (op0, VOIDmode,
7379 (enum rtx_code) GET_CODE (op0), 0, speed);
7380 return true;
7383 /* Strip any extend, leave shifts behind as we will
7384 cost them through mult_cost. */
7385 new_op0 = aarch64_strip_extend (op0, false);
7387 if (GET_CODE (new_op0) == MULT
7388 || aarch64_shift_p (GET_CODE (new_op0)))
7390 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7391 speed);
7392 return true;
7395 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7397 if (speed)
7399 if (VECTOR_MODE_P (mode))
7401 /* Vector ADD. */
7402 *cost += extra_cost->vect.alu;
7404 else if (GET_MODE_CLASS (mode) == MODE_INT)
7406 /* ADD. */
7407 *cost += extra_cost->alu.arith;
7409 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7411 /* FADD. */
7412 *cost += extra_cost->fp[mode == DFmode].addsub;
7415 return true;
7418 case BSWAP:
7419 *cost = COSTS_N_INSNS (1);
7421 if (speed)
7423 if (VECTOR_MODE_P (mode))
7424 *cost += extra_cost->vect.alu;
7425 else
7426 *cost += extra_cost->alu.rev;
7428 return false;
7430 case IOR:
7431 if (aarch_rev16_p (x))
7433 *cost = COSTS_N_INSNS (1);
7435 if (speed)
7437 if (VECTOR_MODE_P (mode))
7438 *cost += extra_cost->vect.alu;
7439 else
7440 *cost += extra_cost->alu.rev;
7442 return true;
7445 if (aarch64_extr_rtx_p (x, &op0, &op1))
7447 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7448 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7449 if (speed)
7450 *cost += extra_cost->alu.shift;
7452 return true;
7454 /* Fall through. */
7455 case XOR:
7456 case AND:
7457 cost_logic:
7458 op0 = XEXP (x, 0);
7459 op1 = XEXP (x, 1);
7461 if (VECTOR_MODE_P (mode))
7463 if (speed)
7464 *cost += extra_cost->vect.alu;
7465 return true;
7468 if (code == AND
7469 && GET_CODE (op0) == MULT
7470 && CONST_INT_P (XEXP (op0, 1))
7471 && CONST_INT_P (op1)
7472 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7473 INTVAL (op1)) != 0)
7475 /* This is a UBFM/SBFM. */
7476 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7477 if (speed)
7478 *cost += extra_cost->alu.bfx;
7479 return true;
7482 if (is_int_mode (mode, &int_mode))
7484 if (CONST_INT_P (op1))
7486 /* We have a mask + shift version of a UBFIZ
7487 i.e. the *andim_ashift<mode>_bfiz pattern. */
7488 if (GET_CODE (op0) == ASHIFT
7489 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7490 XEXP (op0, 1)))
7492 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7493 (enum rtx_code) code, 0, speed);
7494 if (speed)
7495 *cost += extra_cost->alu.bfx;
7497 return true;
7499 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7501 /* We possibly get the immediate for free, this is not
7502 modelled. */
7503 *cost += rtx_cost (op0, int_mode,
7504 (enum rtx_code) code, 0, speed);
7505 if (speed)
7506 *cost += extra_cost->alu.logical;
7508 return true;
7511 else
7513 rtx new_op0 = op0;
7515 /* Handle ORN, EON, or BIC. */
7516 if (GET_CODE (op0) == NOT)
7517 op0 = XEXP (op0, 0);
7519 new_op0 = aarch64_strip_shift (op0);
7521 /* If we had a shift on op0 then this is a logical-shift-
7522 by-register/immediate operation. Otherwise, this is just
7523 a logical operation. */
7524 if (speed)
7526 if (new_op0 != op0)
7528 /* Shift by immediate. */
7529 if (CONST_INT_P (XEXP (op0, 1)))
7530 *cost += extra_cost->alu.log_shift;
7531 else
7532 *cost += extra_cost->alu.log_shift_reg;
7534 else
7535 *cost += extra_cost->alu.logical;
7538 /* In both cases we want to cost both operands. */
7539 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7540 0, speed);
7541 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7542 1, speed);
7544 return true;
7547 return false;
7549 case NOT:
7550 x = XEXP (x, 0);
7551 op0 = aarch64_strip_shift (x);
7553 if (VECTOR_MODE_P (mode))
7555 /* Vector NOT. */
7556 *cost += extra_cost->vect.alu;
7557 return false;
7560 /* MVN-shifted-reg. */
7561 if (op0 != x)
7563 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7565 if (speed)
7566 *cost += extra_cost->alu.log_shift;
7568 return true;
7570 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7571 Handle the second form here taking care that 'a' in the above can
7572 be a shift. */
7573 else if (GET_CODE (op0) == XOR)
7575 rtx newop0 = XEXP (op0, 0);
7576 rtx newop1 = XEXP (op0, 1);
7577 rtx op0_stripped = aarch64_strip_shift (newop0);
7579 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7580 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7582 if (speed)
7584 if (op0_stripped != newop0)
7585 *cost += extra_cost->alu.log_shift;
7586 else
7587 *cost += extra_cost->alu.logical;
7590 return true;
7592 /* MVN. */
7593 if (speed)
7594 *cost += extra_cost->alu.logical;
7596 return false;
7598 case ZERO_EXTEND:
7600 op0 = XEXP (x, 0);
7601 /* If a value is written in SI mode, then zero extended to DI
7602 mode, the operation will in general be free as a write to
7603 a 'w' register implicitly zeroes the upper bits of an 'x'
7604 register. However, if this is
7606 (set (reg) (zero_extend (reg)))
7608 we must cost the explicit register move. */
7609 if (mode == DImode
7610 && GET_MODE (op0) == SImode
7611 && outer == SET)
7613 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7615 /* If OP_COST is non-zero, then the cost of the zero extend
7616 is effectively the cost of the inner operation. Otherwise
7617 we have a MOV instruction and we take the cost from the MOV
7618 itself. This is true independently of whether we are
7619 optimizing for space or time. */
7620 if (op_cost)
7621 *cost = op_cost;
7623 return true;
7625 else if (MEM_P (op0))
7627 /* All loads can zero extend to any size for free. */
7628 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7629 return true;
7632 op0 = aarch64_extend_bitfield_pattern_p (x);
7633 if (op0)
7635 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7636 if (speed)
7637 *cost += extra_cost->alu.bfx;
7638 return true;
7641 if (speed)
7643 if (VECTOR_MODE_P (mode))
7645 /* UMOV. */
7646 *cost += extra_cost->vect.alu;
7648 else
7650 /* We generate an AND instead of UXTB/UXTH. */
7651 *cost += extra_cost->alu.logical;
7654 return false;
7656 case SIGN_EXTEND:
7657 if (MEM_P (XEXP (x, 0)))
7659 /* LDRSH. */
7660 if (speed)
7662 rtx address = XEXP (XEXP (x, 0), 0);
7663 *cost += extra_cost->ldst.load_sign_extend;
7665 *cost +=
7666 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7667 0, speed));
7669 return true;
7672 op0 = aarch64_extend_bitfield_pattern_p (x);
7673 if (op0)
7675 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7676 if (speed)
7677 *cost += extra_cost->alu.bfx;
7678 return true;
7681 if (speed)
7683 if (VECTOR_MODE_P (mode))
7684 *cost += extra_cost->vect.alu;
7685 else
7686 *cost += extra_cost->alu.extend;
7688 return false;
7690 case ASHIFT:
7691 op0 = XEXP (x, 0);
7692 op1 = XEXP (x, 1);
7694 if (CONST_INT_P (op1))
7696 if (speed)
7698 if (VECTOR_MODE_P (mode))
7700 /* Vector shift (immediate). */
7701 *cost += extra_cost->vect.alu;
7703 else
7705 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7706 aliases. */
7707 *cost += extra_cost->alu.shift;
7711 /* We can incorporate zero/sign extend for free. */
7712 if (GET_CODE (op0) == ZERO_EXTEND
7713 || GET_CODE (op0) == SIGN_EXTEND)
7714 op0 = XEXP (op0, 0);
7716 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7717 return true;
7719 else
7721 if (VECTOR_MODE_P (mode))
7723 if (speed)
7724 /* Vector shift (register). */
7725 *cost += extra_cost->vect.alu;
7727 else
7729 if (speed)
7730 /* LSLV. */
7731 *cost += extra_cost->alu.shift_reg;
7733 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7734 && CONST_INT_P (XEXP (op1, 1))
7735 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7737 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7738 /* We already demanded XEXP (op1, 0) to be REG_P, so
7739 don't recurse into it. */
7740 return true;
7743 return false; /* All arguments need to be in registers. */
7746 case ROTATE:
7747 case ROTATERT:
7748 case LSHIFTRT:
7749 case ASHIFTRT:
7750 op0 = XEXP (x, 0);
7751 op1 = XEXP (x, 1);
7753 if (CONST_INT_P (op1))
7755 /* ASR (immediate) and friends. */
7756 if (speed)
7758 if (VECTOR_MODE_P (mode))
7759 *cost += extra_cost->vect.alu;
7760 else
7761 *cost += extra_cost->alu.shift;
7764 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7765 return true;
7767 else
7769 if (VECTOR_MODE_P (mode))
7771 if (speed)
7772 /* Vector shift (register). */
7773 *cost += extra_cost->vect.alu;
7775 else
7777 if (speed)
7778 /* ASR (register) and friends. */
7779 *cost += extra_cost->alu.shift_reg;
7781 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7782 && CONST_INT_P (XEXP (op1, 1))
7783 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7785 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7786 /* We already demanded XEXP (op1, 0) to be REG_P, so
7787 don't recurse into it. */
7788 return true;
7791 return false; /* All arguments need to be in registers. */
7794 case SYMBOL_REF:
7796 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7797 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7799 /* LDR. */
7800 if (speed)
7801 *cost += extra_cost->ldst.load;
7803 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7804 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7806 /* ADRP, followed by ADD. */
7807 *cost += COSTS_N_INSNS (1);
7808 if (speed)
7809 *cost += 2 * extra_cost->alu.arith;
7811 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7812 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7814 /* ADR. */
7815 if (speed)
7816 *cost += extra_cost->alu.arith;
7819 if (flag_pic)
7821 /* One extra load instruction, after accessing the GOT. */
7822 *cost += COSTS_N_INSNS (1);
7823 if (speed)
7824 *cost += extra_cost->ldst.load;
7826 return true;
7828 case HIGH:
7829 case LO_SUM:
7830 /* ADRP/ADD (immediate). */
7831 if (speed)
7832 *cost += extra_cost->alu.arith;
7833 return true;
7835 case ZERO_EXTRACT:
7836 case SIGN_EXTRACT:
7837 /* UBFX/SBFX. */
7838 if (speed)
7840 if (VECTOR_MODE_P (mode))
7841 *cost += extra_cost->vect.alu;
7842 else
7843 *cost += extra_cost->alu.bfx;
7846 /* We can trust that the immediates used will be correct (there
7847 are no by-register forms), so we need only cost op0. */
7848 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7849 return true;
7851 case MULT:
7852 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7853 /* aarch64_rtx_mult_cost always handles recursion to its
7854 operands. */
7855 return true;
7857 case MOD:
7858 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7859 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7860 an unconditional negate. This case should only ever be reached through
7861 the set_smod_pow2_cheap check in expmed.c. */
7862 if (CONST_INT_P (XEXP (x, 1))
7863 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7864 && (mode == SImode || mode == DImode))
7866 /* We expand to 4 instructions. Reset the baseline. */
7867 *cost = COSTS_N_INSNS (4);
7869 if (speed)
7870 *cost += 2 * extra_cost->alu.logical
7871 + 2 * extra_cost->alu.arith;
7873 return true;
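/* A rough sketch of the 4-instruction expansion costed above, for a
   signed SImode "x % 4" (the exact sequence is determined by the
   expander in aarch64.md, so treat this as illustrative only):

     negs  w1, w0            ; w1 = -x, flags set from the negation
     and   w0, w0, #3        ; remainder for x >= 0
     and   w1, w1, #3        ; remainder magnitude for x < 0
     csneg w0, w0, w1, mi    ; select, negating the second operand  */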
7876 /* Fall-through. */
7877 case UMOD:
7878 if (speed)
7880 /* Slightly prefer UMOD over SMOD. */
7881 if (VECTOR_MODE_P (mode))
7882 *cost += extra_cost->vect.alu;
7883 else if (GET_MODE_CLASS (mode) == MODE_INT)
7884 *cost += (extra_cost->mult[mode == DImode].add
7885 + extra_cost->mult[mode == DImode].idiv
7886 + (code == MOD ? 1 : 0));
7888 return false; /* All arguments need to be in registers. */
7890 case DIV:
7891 case UDIV:
7892 case SQRT:
7893 if (speed)
7895 if (VECTOR_MODE_P (mode))
7896 *cost += extra_cost->vect.alu;
7897 else if (GET_MODE_CLASS (mode) == MODE_INT)
7898 /* There is no integer SQRT, so only DIV and UDIV can get
7899 here. */
7900 *cost += (extra_cost->mult[mode == DImode].idiv
7901 /* Slightly prefer UDIV over SDIV. */
7902 + (code == DIV ? 1 : 0));
7903 else
7904 *cost += extra_cost->fp[mode == DFmode].div;
7906 return false; /* All arguments need to be in registers. */
7908 case IF_THEN_ELSE:
7909 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7910 XEXP (x, 2), cost, speed);
7912 case EQ:
7913 case NE:
7914 case GT:
7915 case GTU:
7916 case LT:
7917 case LTU:
7918 case GE:
7919 case GEU:
7920 case LE:
7921 case LEU:
7923 return false; /* All arguments must be in registers. */
7925 case FMA:
7926 op0 = XEXP (x, 0);
7927 op1 = XEXP (x, 1);
7928 op2 = XEXP (x, 2);
7930 if (speed)
7932 if (VECTOR_MODE_P (mode))
7933 *cost += extra_cost->vect.alu;
7934 else
7935 *cost += extra_cost->fp[mode == DFmode].fma;
7938 /* FMSUB, FNMADD, and FNMSUB are free. */
7939 if (GET_CODE (op0) == NEG)
7940 op0 = XEXP (op0, 0);
7942 if (GET_CODE (op2) == NEG)
7943 op2 = XEXP (op2, 0);
7945 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7946 and the by-element operand as operand 0. */
7947 if (GET_CODE (op1) == NEG)
7948 op1 = XEXP (op1, 0);
7950 /* Catch vector-by-element operations. The by-element operand can
7951 either be (vec_duplicate (vec_select (x))) or just
7952 (vec_select (x)), depending on whether we are multiplying by
7953 a vector or a scalar.
7955 Canonicalization is not very good in these cases: FMA4 will put the
7956 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7957 if (GET_CODE (op0) == VEC_DUPLICATE)
7958 op0 = XEXP (op0, 0);
7959 else if (GET_CODE (op1) == VEC_DUPLICATE)
7960 op1 = XEXP (op1, 0);
7962 if (GET_CODE (op0) == VEC_SELECT)
7963 op0 = XEXP (op0, 0);
7964 else if (GET_CODE (op1) == VEC_SELECT)
7965 op1 = XEXP (op1, 0);
7967 /* If the remaining parameters are not registers,
7968 get the cost to put them into registers. */
7969 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7970 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7971 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7972 return true;
7974 case FLOAT:
7975 case UNSIGNED_FLOAT:
7976 if (speed)
7977 *cost += extra_cost->fp[mode == DFmode].fromint;
7978 return false;
7980 case FLOAT_EXTEND:
7981 if (speed)
7983 if (VECTOR_MODE_P (mode))
7985 /* Vector widening conversion. */
7986 *cost += extra_cost->vect.alu;
7988 else
7989 *cost += extra_cost->fp[mode == DFmode].widen;
7991 return false;
7993 case FLOAT_TRUNCATE:
7994 if (speed)
7996 if (VECTOR_MODE_P (mode))
7998 /* Vector narrowing conversion. */
7999 *cost += extra_cost->vect.alu;
8001 else
8002 *cost += extra_cost->fp[mode == DFmode].narrow;
8004 return false;
8006 case FIX:
8007 case UNSIGNED_FIX:
8008 x = XEXP (x, 0);
8009 /* Strip the rounding part. They will all be implemented
8010 by the fcvt* family of instructions anyway. */
8011 if (GET_CODE (x) == UNSPEC)
8013 unsigned int uns_code = XINT (x, 1);
8015 if (uns_code == UNSPEC_FRINTA
8016 || uns_code == UNSPEC_FRINTM
8017 || uns_code == UNSPEC_FRINTN
8018 || uns_code == UNSPEC_FRINTP
8019 || uns_code == UNSPEC_FRINTZ)
8020 x = XVECEXP (x, 0, 0);
8023 if (speed)
8025 if (VECTOR_MODE_P (mode))
8026 *cost += extra_cost->vect.alu;
8027 else
8028 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8031 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8032 fixed-point fcvt. */
8033 if (GET_CODE (x) == MULT
8034 && ((VECTOR_MODE_P (mode)
8035 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8036 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8038 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8039 0, speed);
8040 return true;
8043 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8044 return true;
8046 case ABS:
8047 if (VECTOR_MODE_P (mode))
8049 /* ABS (vector). */
8050 if (speed)
8051 *cost += extra_cost->vect.alu;
8053 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8055 op0 = XEXP (x, 0);
8057 /* FABD, which is analogous to FADD. */
8058 if (GET_CODE (op0) == MINUS)
8060 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8061 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8062 if (speed)
8063 *cost += extra_cost->fp[mode == DFmode].addsub;
8065 return true;
8067 /* Simple FABS is analogous to FNEG. */
8068 if (speed)
8069 *cost += extra_cost->fp[mode == DFmode].neg;
8071 else
8073 /* Integer ABS will either be split into
8074 two arithmetic instructions, or will be an ABS
8075 (scalar), which we don't model. */
8076 *cost = COSTS_N_INSNS (2);
8077 if (speed)
8078 *cost += 2 * extra_cost->alu.arith;
8080 return false;
8082 case SMAX:
8083 case SMIN:
8084 if (speed)
8086 if (VECTOR_MODE_P (mode))
8087 *cost += extra_cost->vect.alu;
8088 else
8090 /* FMAXNM/FMINNM/FMAX/FMIN.
8091 TODO: This may not be accurate for all implementations, but
8092 we do not model this in the cost tables. */
8093 *cost += extra_cost->fp[mode == DFmode].addsub;
8096 return false;
8098 case UNSPEC:
8099 /* The floating point round to integer frint* instructions. */
8100 if (aarch64_frint_unspec_p (XINT (x, 1)))
8102 if (speed)
8103 *cost += extra_cost->fp[mode == DFmode].roundint;
8105 return false;
8108 if (XINT (x, 1) == UNSPEC_RBIT)
8110 if (speed)
8111 *cost += extra_cost->alu.rev;
8113 return false;
8115 break;
8117 case TRUNCATE:
8119 /* Decompose <su>muldi3_highpart. */
8120 if (/* (truncate:DI */
8121 mode == DImode
8122 /* (lshiftrt:TI */
8123 && GET_MODE (XEXP (x, 0)) == TImode
8124 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8125 /* (mult:TI */
8126 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8127 /* (ANY_EXTEND:TI (reg:DI))
8128 (ANY_EXTEND:TI (reg:DI))) */
8129 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8130 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8131 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8132 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8133 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8134 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8135 /* (const_int 64) */
8136 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8137 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8139 /* UMULH/SMULH. */
8140 if (speed)
8141 *cost += extra_cost->mult[mode == DImode].extend;
8142 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8143 mode, MULT, 0, speed);
8144 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8145 mode, MULT, 1, speed);
8146 return true;
8149 /* Fall through. */
8150 default:
8151 break;
8154 if (dump_file
8155 && flag_aarch64_verbose_cost)
8156 fprintf (dump_file,
8157 "\nFailed to cost RTX. Assuming default cost.\n");
8159 return true;
8162 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
8163 calculated for X. This cost is stored in *COST. Returns true
8164 if the total cost of X was calculated. */
8165 static bool
8166 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8167 int param, int *cost, bool speed)
8169 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8171 if (dump_file
8172 && flag_aarch64_verbose_cost)
8174 print_rtl_single (dump_file, x);
8175 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8176 speed ? "Hot" : "Cold",
8177 *cost, result ? "final" : "partial");
8180 return result;
8183 static int
8184 aarch64_register_move_cost (machine_mode mode,
8185 reg_class_t from_i, reg_class_t to_i)
8187 enum reg_class from = (enum reg_class) from_i;
8188 enum reg_class to = (enum reg_class) to_i;
8189 const struct cpu_regmove_cost *regmove_cost
8190 = aarch64_tune_params.regmove_cost;
8192 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8193 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8194 to = GENERAL_REGS;
8196 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8197 from = GENERAL_REGS;
8199 /* The cost of moving between a GPR and the stack register is the same as GP2GP. */
8200 if ((from == GENERAL_REGS && to == STACK_REG)
8201 || (to == GENERAL_REGS && from == STACK_REG))
8202 return regmove_cost->GP2GP;
8204 /* To/from the stack register, we move via the GPRs. */
8205 if (to == STACK_REG || from == STACK_REG)
8206 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8207 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8209 if (GET_MODE_SIZE (mode) == 16)
8211 /* 128-bit operations on general registers require 2 instructions. */
8212 if (from == GENERAL_REGS && to == GENERAL_REGS)
8213 return regmove_cost->GP2GP * 2;
8214 else if (from == GENERAL_REGS)
8215 return regmove_cost->GP2FP * 2;
8216 else if (to == GENERAL_REGS)
8217 return regmove_cost->FP2GP * 2;
8219 /* When AdvSIMD instructions are disabled it is not possible to move
8220 a 128-bit value directly between Q registers. This is handled in
8221 secondary reload. A general register is used as a scratch to move
8222 the upper DI value and the lower DI value is moved directly,
8223 hence the cost is the sum of three moves. */
8224 if (! TARGET_SIMD)
8225 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8227 return regmove_cost->FP2FP;
8230 if (from == GENERAL_REGS && to == GENERAL_REGS)
8231 return regmove_cost->GP2GP;
8232 else if (from == GENERAL_REGS)
8233 return regmove_cost->GP2FP;
8234 else if (to == GENERAL_REGS)
8235 return regmove_cost->FP2GP;
8237 return regmove_cost->FP2FP;
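/* For exposition, a sketch of how the rules above compose for two
   representative queries (assuming the generic cost tables): a 128-bit move
   between a general register and an FP register needs two 64-bit transfers,
   and a move involving the stack register is routed through the GPRs:

     aarch64_register_move_cost (TImode, GENERAL_REGS, FP_REGS)
       == regmove_cost->GP2FP * 2;

     aarch64_register_move_cost (DImode, STACK_REG, FP_REGS)
       == regmove_cost->GP2GP        (STACK_REG -> GENERAL_REGS)
          + regmove_cost->GP2FP;     (GENERAL_REGS -> FP_REGS)  */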
8240 static int
8241 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8242 reg_class_t rclass ATTRIBUTE_UNUSED,
8243 bool in ATTRIBUTE_UNUSED)
8245 return aarch64_tune_params.memmov_cost;
8248 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8249 to optimize 1.0/sqrt. */
8251 static bool
8252 use_rsqrt_p (machine_mode mode)
8254 return (!flag_trapping_math
8255 && flag_unsafe_math_optimizations
8256 && ((aarch64_tune_params.approx_modes->recip_sqrt
8257 & AARCH64_APPROX_MODE (mode))
8258 || flag_mrecip_low_precision_sqrt));
8261 /* Function to decide when to use the approximate reciprocal square root
8262 builtin. */
8264 static tree
8265 aarch64_builtin_reciprocal (tree fndecl)
8267 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8269 if (!use_rsqrt_p (mode))
8270 return NULL_TREE;
8271 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8274 typedef rtx (*rsqrte_type) (rtx, rtx);
8276 /* Select reciprocal square root initial estimate insn depending on machine
8277 mode. */
8279 static rsqrte_type
8280 get_rsqrte_type (machine_mode mode)
8282 switch (mode)
8284 case E_DFmode: return gen_aarch64_rsqrtedf;
8285 case E_SFmode: return gen_aarch64_rsqrtesf;
8286 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8287 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8288 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8289 default: gcc_unreachable ();
8293 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8295 /* Select reciprocal square root series step insn depending on machine mode. */
8297 static rsqrts_type
8298 get_rsqrts_type (machine_mode mode)
8300 switch (mode)
8302 case E_DFmode: return gen_aarch64_rsqrtsdf;
8303 case E_SFmode: return gen_aarch64_rsqrtssf;
8304 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8305 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8306 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8307 default: gcc_unreachable ();
8311 /* Emit instruction sequence to compute either the approximate square root
8312 or its approximate reciprocal, depending on the flag RECP, and return
8313 whether the sequence was emitted or not. */
8315 bool
8316 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8318 machine_mode mode = GET_MODE (dst);
8320 if (GET_MODE_INNER (mode) == HFmode)
8322 gcc_assert (!recp);
8323 return false;
8326 if (!recp)
8328 if (!(flag_mlow_precision_sqrt
8329 || (aarch64_tune_params.approx_modes->sqrt
8330 & AARCH64_APPROX_MODE (mode))))
8331 return false;
8333 if (flag_finite_math_only
8334 || flag_trapping_math
8335 || !flag_unsafe_math_optimizations
8336 || optimize_function_for_size_p (cfun))
8337 return false;
8339 else
8340 /* Caller assumes we cannot fail. */
8341 gcc_assert (use_rsqrt_p (mode));
8343 machine_mode mmsk = mode_for_int_vector (mode).require ();
8344 rtx xmsk = gen_reg_rtx (mmsk);
8345 if (!recp)
8346 /* When calculating the approximate square root, compare the
8347 argument with 0.0 and create a mask. */
8348 emit_insn (gen_rtx_SET (xmsk,
8349 gen_rtx_NEG (mmsk,
8350 gen_rtx_EQ (mmsk, src,
8351 CONST0_RTX (mode)))));
8353 /* Estimate the approximate reciprocal square root. */
8354 rtx xdst = gen_reg_rtx (mode);
8355 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8357 /* Iterate over the series twice for SF and thrice for DF. */
8358 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8360 /* Optionally iterate over the series one time fewer for faster performance,
8361 at the cost of some accuracy. */
8362 if ((recp && flag_mrecip_low_precision_sqrt)
8363 || (!recp && flag_mlow_precision_sqrt))
8364 iterations--;
8366 /* Iterate over the series to calculate the approximate reciprocal square
8367 root. */
8368 rtx x1 = gen_reg_rtx (mode);
8369 while (iterations--)
8371 rtx x2 = gen_reg_rtx (mode);
8372 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8374 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8376 if (iterations > 0)
8377 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8380 if (!recp)
8382 /* Qualify the approximate reciprocal square root when the argument is
8383 0.0 by squashing the intermediate result to 0.0. */
8384 rtx xtmp = gen_reg_rtx (mmsk);
8385 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8386 gen_rtx_SUBREG (mmsk, xdst, 0)));
8387 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8389 /* Calculate the approximate square root. */
8390 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8393 /* Finalize the approximation. */
8394 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8396 return true;
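/* For reference, an illustrative note rather than part of the original
   sequence description: the loop above is the standard Newton-Raphson
   refinement of an initial 1/sqrt(d) estimate.  FRSQRTS computes
   (3 - a * b) / 2, so with x2 = x_n * x_n and x1 = FRSQRTS (d, x2) each pass
   performs

     x_(n+1) = x_n * (3 - d * x_n * x_n) / 2

   which roughly doubles the number of correct bits, hence two iterations for
   SF and three for DF.  When RECP is false the result is multiplied by d at
   the end (sqrt(d) == d * 1/sqrt(d)) and the mask computed earlier forces the
   result for a 0.0 input to 0.0.  */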
8399 typedef rtx (*recpe_type) (rtx, rtx);
8401 /* Select reciprocal initial estimate insn depending on machine mode. */
8403 static recpe_type
8404 get_recpe_type (machine_mode mode)
8406 switch (mode)
8408 case E_SFmode: return (gen_aarch64_frecpesf);
8409 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8410 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8411 case E_DFmode: return (gen_aarch64_frecpedf);
8412 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8413 default: gcc_unreachable ();
8417 typedef rtx (*recps_type) (rtx, rtx, rtx);
8419 /* Select reciprocal series step insn depending on machine mode. */
8421 static recps_type
8422 get_recps_type (machine_mode mode)
8424 switch (mode)
8426 case E_SFmode: return (gen_aarch64_frecpssf);
8427 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8428 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8429 case E_DFmode: return (gen_aarch64_frecpsdf);
8430 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8431 default: gcc_unreachable ();
8435 /* Emit the instruction sequence to compute the approximation for the division
8436 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8438 bool
8439 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8441 machine_mode mode = GET_MODE (quo);
8443 if (GET_MODE_INNER (mode) == HFmode)
8444 return false;
8446 bool use_approx_division_p = (flag_mlow_precision_div
8447 || (aarch64_tune_params.approx_modes->division
8448 & AARCH64_APPROX_MODE (mode)));
8450 if (!flag_finite_math_only
8451 || flag_trapping_math
8452 || !flag_unsafe_math_optimizations
8453 || optimize_function_for_size_p (cfun)
8454 || !use_approx_division_p)
8455 return false;
8457 /* Estimate the approximate reciprocal. */
8458 rtx xrcp = gen_reg_rtx (mode);
8459 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8461 /* Iterate over the series twice for SF and thrice for DF. */
8462 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8464 /* Optionally iterate over the series one time fewer for faster performance,
8465 at the cost of some accuracy. */
8466 if (flag_mlow_precision_div)
8467 iterations--;
8469 /* Iterate over the series to calculate the approximate reciprocal. */
8470 rtx xtmp = gen_reg_rtx (mode);
8471 while (iterations--)
8473 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8475 if (iterations > 0)
8476 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8479 if (num != CONST1_RTX (mode))
8481 /* As the approximate reciprocal of DEN is already calculated, only
8482 calculate the approximate division when NUM is not 1.0. */
8483 rtx xnum = force_reg (mode, num);
8484 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8487 /* Finalize the approximation. */
8488 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8489 return true;
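/* For reference, an illustrative note rather than part of the original
   sequence description: the loop above is the Newton-Raphson refinement of an
   initial 1/den estimate.  FRECPS computes 2 - a * b, so with
   xtmp = FRECPS (x_n, den) each pass performs

     x_(n+1) = x_n * (2 - den * x_n)

   with the final multiply by xtmp folded into the last emit_set_insn.  The
   quotient is then formed as num * (1/den), skipping the extra multiply when
   NUM is 1.0.  */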
8492 /* Return the number of instructions that can be issued per cycle. */
8493 static int
8494 aarch64_sched_issue_rate (void)
8496 return aarch64_tune_params.issue_rate;
8499 static int
8500 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8502 int issue_rate = aarch64_sched_issue_rate ();
8504 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8508 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8509 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8510 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8512 static int
8513 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8514 int ready_index)
8516 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8520 /* Vectorizer cost model target hooks. */
8522 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8523 static int
8524 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8525 tree vectype,
8526 int misalign ATTRIBUTE_UNUSED)
8528 unsigned elements;
8529 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8530 bool fp = false;
8532 if (vectype != NULL)
8533 fp = FLOAT_TYPE_P (vectype);
8535 switch (type_of_cost)
8537 case scalar_stmt:
8538 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8540 case scalar_load:
8541 return costs->scalar_load_cost;
8543 case scalar_store:
8544 return costs->scalar_store_cost;
8546 case vector_stmt:
8547 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8549 case vector_load:
8550 return costs->vec_align_load_cost;
8552 case vector_store:
8553 return costs->vec_store_cost;
8555 case vec_to_scalar:
8556 return costs->vec_to_scalar_cost;
8558 case scalar_to_vec:
8559 return costs->scalar_to_vec_cost;
8561 case unaligned_load:
8562 case vector_gather_load:
8563 return costs->vec_unalign_load_cost;
8565 case unaligned_store:
8566 case vector_scatter_store:
8567 return costs->vec_unalign_store_cost;
8569 case cond_branch_taken:
8570 return costs->cond_taken_branch_cost;
8572 case cond_branch_not_taken:
8573 return costs->cond_not_taken_branch_cost;
8575 case vec_perm:
8576 return costs->vec_permute_cost;
8578 case vec_promote_demote:
8579 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8581 case vec_construct:
8582 elements = TYPE_VECTOR_SUBPARTS (vectype);
8583 return elements / 2 + 1;
8585 default:
8586 gcc_unreachable ();
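/* Illustrative example of the vec_construct rule above (for exposition only):
   the cost scales with the number of lanes, e.g. building a V4SF from scalars
   is costed as 4 / 2 + 1 == 3, and a V16QI as 16 / 2 + 1 == 9.  */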
8590 /* Implement targetm.vectorize.add_stmt_cost. */
8591 static unsigned
8592 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8593 struct _stmt_vec_info *stmt_info, int misalign,
8594 enum vect_cost_model_location where)
8596 unsigned *cost = (unsigned *) data;
8597 unsigned retval = 0;
8599 if (flag_vect_cost_model)
8601 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8602 int stmt_cost =
8603 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8605 /* Statements in an inner loop relative to the loop being
8606 vectorized are weighted more heavily. The value here is
8607 arbitrary and could potentially be improved with analysis. */
8608 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8609 count *= 50; /* FIXME */
8611 retval = (unsigned) (count * stmt_cost);
8612 cost[where] += retval;
8615 return retval;
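/* Illustrative example of the weighting above (for exposition only): with the
   cost model enabled, a single vector_load statement that sits in an inner
   loop relative to the loop being vectorized contributes
   50 * vec_align_load_cost to cost[vect_body], whereas the same statement
   outside the inner loop contributes just vec_align_load_cost.  */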
8618 static void initialize_aarch64_code_model (struct gcc_options *);
8620 /* Parse the TO_PARSE string and put the architecture struct that it
8621 selects into RES and the architectural features into ISA_FLAGS.
8622 Return an aarch64_parse_opt_result describing the parse result.
8623 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8625 static enum aarch64_parse_opt_result
8626 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8627 unsigned long *isa_flags)
8629 char *ext;
8630 const struct processor *arch;
8631 char *str = (char *) alloca (strlen (to_parse) + 1);
8632 size_t len;
8634 strcpy (str, to_parse);
8636 ext = strchr (str, '+');
8638 if (ext != NULL)
8639 len = ext - str;
8640 else
8641 len = strlen (str);
8643 if (len == 0)
8644 return AARCH64_PARSE_MISSING_ARG;
8647 /* Loop through the list of supported ARCHes to find a match. */
8648 for (arch = all_architectures; arch->name != NULL; arch++)
8650 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8652 unsigned long isa_temp = arch->flags;
8654 if (ext != NULL)
8656 /* TO_PARSE string contains at least one extension. */
8657 enum aarch64_parse_opt_result ext_res
8658 = aarch64_parse_extension (ext, &isa_temp);
8660 if (ext_res != AARCH64_PARSE_OK)
8661 return ext_res;
8663 /* Extension parsing was successful. Confirm the result
8664 arch and ISA flags. */
8665 *res = arch;
8666 *isa_flags = isa_temp;
8667 return AARCH64_PARSE_OK;
8671 /* ARCH name not found in list. */
8672 return AARCH64_PARSE_INVALID_ARG;
8675 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
8676 architectural flags into ISA_FLAGS. Return an aarch64_parse_opt_result
8677 describing the parse result. If there is an error parsing, RES and
8678 ISA_FLAGS are left unchanged. */
8680 static enum aarch64_parse_opt_result
8681 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8682 unsigned long *isa_flags)
8684 char *ext;
8685 const struct processor *cpu;
8686 char *str = (char *) alloca (strlen (to_parse) + 1);
8687 size_t len;
8689 strcpy (str, to_parse);
8691 ext = strchr (str, '+');
8693 if (ext != NULL)
8694 len = ext - str;
8695 else
8696 len = strlen (str);
8698 if (len == 0)
8699 return AARCH64_PARSE_MISSING_ARG;
8702 /* Loop through the list of supported CPUs to find a match. */
8703 for (cpu = all_cores; cpu->name != NULL; cpu++)
8705 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8707 unsigned long isa_temp = cpu->flags;
8710 if (ext != NULL)
8712 /* TO_PARSE string contains at least one extension. */
8713 enum aarch64_parse_opt_result ext_res
8714 = aarch64_parse_extension (ext, &isa_temp);
8716 if (ext_res != AARCH64_PARSE_OK)
8717 return ext_res;
8719 /* Extension parsing was successful. Confirm the result
8720 cpu and ISA flags. */
8721 *res = cpu;
8722 *isa_flags = isa_temp;
8723 return AARCH64_PARSE_OK;
8727 /* CPU name not found in list. */
8728 return AARCH64_PARSE_INVALID_ARG;
8731 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8732 Return an aarch64_parse_opt_result describing the parse result.
8733 If the parsing fails, RES does not change. */
8735 static enum aarch64_parse_opt_result
8736 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8738 const struct processor *cpu;
8739 char *str = (char *) alloca (strlen (to_parse) + 1);
8741 strcpy (str, to_parse);
8743 /* Loop through the list of supported CPUs to find a match. */
8744 for (cpu = all_cores; cpu->name != NULL; cpu++)
8746 if (strcmp (cpu->name, str) == 0)
8748 *res = cpu;
8749 return AARCH64_PARSE_OK;
8753 /* CPU name not found in list. */
8754 return AARCH64_PARSE_INVALID_ARG;
8757 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8758 described in FLAG. If it is, return the index bit for that fusion type.
8759 If not, error (printing OPTION_NAME) and return zero. */
8761 static unsigned int
8762 aarch64_parse_one_option_token (const char *token,
8763 size_t length,
8764 const struct aarch64_flag_desc *flag,
8765 const char *option_name)
8767 for (; flag->name != NULL; flag++)
8769 if (length == strlen (flag->name)
8770 && !strncmp (flag->name, token, length))
8771 return flag->flag;
8774 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8775 return 0;
8778 /* Parse OPTION which is a comma-separated list of flags to enable.
8779 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8780 default state we inherit from the CPU tuning structures. OPTION_NAME
8781 gives the top-level option we are parsing in the -moverride string,
8782 for use in error messages. */
8784 static unsigned int
8785 aarch64_parse_boolean_options (const char *option,
8786 const struct aarch64_flag_desc *flags,
8787 unsigned int initial_state,
8788 const char *option_name)
8790 const char separator = '.';
8791 const char* specs = option;
8792 const char* ntoken = option;
8793 unsigned int found_flags = initial_state;
8795 while ((ntoken = strchr (specs, separator)))
8797 size_t token_length = ntoken - specs;
8798 unsigned token_ops = aarch64_parse_one_option_token (specs,
8799 token_length,
8800 flags,
8801 option_name);
8802 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8803 in the token stream, reset the supported operations. So:
8805 adrp+add.cmp+branch.none.adrp+add
8807 would have the result of turning on only adrp+add fusion. */
8808 if (!token_ops)
8809 found_flags = 0;
8811 found_flags |= token_ops;
8812 specs = ++ntoken;
8815 /* We ended with a trailing separator; the string is ill-formed. */
8816 if (!(*specs))
8818 error ("%s string ill-formed\n", option_name);
8819 return 0;
8822 /* We still have one more token to parse. */
8823 size_t token_length = strlen (specs);
8824 unsigned token_ops = aarch64_parse_one_option_token (specs,
8825 token_length,
8826 flags,
8827 option_name);
8828 if (!token_ops)
8829 found_flags = 0;
8831 found_flags |= token_ops;
8832 return found_flags;
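/* Illustrative walk-through of the function above (for exposition only):
   parsing "adrp+add.cmp+branch.none.adrp+add" against the fusion flags
   proceeds token by token; found_flags accumulates adrp+add and cmp+branch,
   is reset to zero by "none", and finishes with only adrp+add set, matching
   the behaviour described in the comment inside the loop.  */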
8835 /* Support for overriding instruction fusion. */
8837 static void
8838 aarch64_parse_fuse_string (const char *fuse_string,
8839 struct tune_params *tune)
8841 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8842 aarch64_fusible_pairs,
8843 tune->fusible_ops,
8844 "fuse=");
8847 /* Support for overriding other tuning flags. */
8849 static void
8850 aarch64_parse_tune_string (const char *tune_string,
8851 struct tune_params *tune)
8853 tune->extra_tuning_flags
8854 = aarch64_parse_boolean_options (tune_string,
8855 aarch64_tuning_flags,
8856 tune->extra_tuning_flags,
8857 "tune=");
8860 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8861 we understand. If it is, extract the option string and hand off to
8862 the appropriate function. */
8864 void
8865 aarch64_parse_one_override_token (const char* token,
8866 size_t length,
8867 struct tune_params *tune)
8869 const struct aarch64_tuning_override_function *fn
8870 = aarch64_tuning_override_functions;
8872 const char *option_part = strchr (token, '=');
8873 if (!option_part)
8875 error ("tuning string missing in option (%s)", token);
8876 return;
8879 /* Get the length of the option name. */
8880 length = option_part - token;
8881 /* Skip the '=' to get to the option string. */
8882 option_part++;
8884 for (; fn->name != NULL; fn++)
8886 if (!strncmp (fn->name, token, length))
8888 fn->parse_override (option_part, tune);
8889 return;
8893 error ("unknown tuning option (%s)", token);
8894 return;
8897 /* Validate and clamp the requested TLS size against the selected code model. */
8899 static void
8900 initialize_aarch64_tls_size (struct gcc_options *opts)
8902 if (aarch64_tls_size == 0)
8903 aarch64_tls_size = 24;
8905 switch (opts->x_aarch64_cmodel_var)
8907 case AARCH64_CMODEL_TINY:
8908 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8909 needs two instructions to address, so we clamp the size to 24. */
8910 if (aarch64_tls_size > 24)
8911 aarch64_tls_size = 24;
8912 break;
8913 case AARCH64_CMODEL_SMALL:
8914 /* The maximum TLS size allowed under small is 4G. */
8915 if (aarch64_tls_size > 32)
8916 aarch64_tls_size = 32;
8917 break;
8918 case AARCH64_CMODEL_LARGE:
8919 /* The maximum TLS size allowed under large is 16E.
8920 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
8921 if (aarch64_tls_size > 48)
8922 aarch64_tls_size = 48;
8923 break;
8924 default:
8925 gcc_unreachable ();
8928 return;
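/* Illustrative example of the clamping above (for exposition only): with
   -mcmodel=tiny a request of -mtls-size=32 is clamped to 24 here, and with
   -mcmodel=small a request of -mtls-size=48 is clamped to 32.  */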
8931 /* Parse STRING looking for options in the format:
8932 string :: option:string
8933 option :: name=substring
8934 name :: {a-z}
8935 substring :: defined by option. */
8937 static void
8938 aarch64_parse_override_string (const char* input_string,
8939 struct tune_params* tune)
8941 const char separator = ':';
8942 size_t string_length = strlen (input_string) + 1;
8943 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8944 char *string = string_root;
8945 strncpy (string, input_string, string_length);
8946 string[string_length - 1] = '\0';
8948 char* ntoken = string;
8950 while ((ntoken = strchr (string, separator)))
8952 size_t token_length = ntoken - string;
8953 /* Make this substring look like a string. */
8954 *ntoken = '\0';
8955 aarch64_parse_one_override_token (string, token_length, tune);
8956 string = ++ntoken;
8959 /* One last option to parse. */
8960 aarch64_parse_one_override_token (string, strlen (string), tune);
8961 free (string_root);
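/* Illustrative example of the format parsed above (for exposition only): an
   option such as

     -moverride=fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   is split on ':' into the tokens "fuse=adrp+add.cmp+branch" and
   "tune=rename_fma_regs", each handed to aarch64_parse_one_override_token.
   The tuning-flag name here is purely for illustration; the accepted names
   come from aarch64_tuning_flags and aarch64_fusible_pairs.  */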
8965 static void
8966 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8968 /* PR 70044: We have to be careful about being called multiple times for the
8969 same function. This means all changes should be repeatable. */
8971 /* If the frame pointer is enabled, set it to a special value that behaves
8972 similarly to frame pointer omission. If we don't do this, all leaf functions
8973 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
8974 If flag_omit_frame_pointer has this special value, we must force the
8975 frame pointer if not in a leaf function. We also need to force it in a
8976 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
8977 if (opts->x_flag_omit_frame_pointer == 0)
8978 opts->x_flag_omit_frame_pointer = 2;
8980 /* If not optimizing for size, set the default
8981 alignment to what the target wants. */
8982 if (!opts->x_optimize_size)
8984 if (opts->x_align_loops <= 0)
8985 opts->x_align_loops = aarch64_tune_params.loop_align;
8986 if (opts->x_align_jumps <= 0)
8987 opts->x_align_jumps = aarch64_tune_params.jump_align;
8988 if (opts->x_align_functions <= 0)
8989 opts->x_align_functions = aarch64_tune_params.function_align;
8992 /* We default to no pc-relative literal loads. */
8994 aarch64_pcrelative_literal_loads = false;
8996 /* If -mpc-relative-literal-loads is set on the command line, this
8997 implies that the user asked for PC relative literal loads. */
8998 if (opts->x_pcrelative_literal_loads == 1)
8999 aarch64_pcrelative_literal_loads = true;
9001 /* In the tiny memory model it makes no sense to disallow PC relative
9002 literal pool loads. */
9003 if (aarch64_cmodel == AARCH64_CMODEL_TINY
9004 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9005 aarch64_pcrelative_literal_loads = true;
9007 /* When enabling the lower precision Newton series for the square root, also
9008 enable it for the reciprocal square root, since the latter is an
9009 intermediary step for the former. */
9010 if (flag_mlow_precision_sqrt)
9011 flag_mrecip_low_precision_sqrt = true;
9014 /* 'Unpack' the internal tuning structs and update the options
9015 in OPTS. The caller must have set up selected_tune and selected_arch
9016 as all the other target-specific codegen decisions are
9017 derived from them. */
9019 void
9020 aarch64_override_options_internal (struct gcc_options *opts)
9022 aarch64_tune_flags = selected_tune->flags;
9023 aarch64_tune = selected_tune->sched_core;
9024 /* Make a copy of the tuning parameters attached to the core, which
9025 we may later overwrite. */
9026 aarch64_tune_params = *(selected_tune->tune);
9027 aarch64_architecture_version = selected_arch->architecture_version;
9029 if (opts->x_aarch64_override_tune_string)
9030 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9031 &aarch64_tune_params);
9033 /* This target defaults to strict volatile bitfields. */
9034 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9035 opts->x_flag_strict_volatile_bitfields = 1;
9037 initialize_aarch64_code_model (opts);
9038 initialize_aarch64_tls_size (opts);
9040 int queue_depth = 0;
9041 switch (aarch64_tune_params.autoprefetcher_model)
9043 case tune_params::AUTOPREFETCHER_OFF:
9044 queue_depth = -1;
9045 break;
9046 case tune_params::AUTOPREFETCHER_WEAK:
9047 queue_depth = 0;
9048 break;
9049 case tune_params::AUTOPREFETCHER_STRONG:
9050 queue_depth = max_insn_queue_index + 1;
9051 break;
9052 default:
9053 gcc_unreachable ();
9056 /* We don't mind passing in global_options_set here as we don't use
9057 the *options_set structs anyway. */
9058 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9059 queue_depth,
9060 opts->x_param_values,
9061 global_options_set.x_param_values);
9063 /* Set up parameters to be used in prefetching algorithm. Do not
9064 override the defaults unless we are tuning for a core we have
9065 researched values for. */
9066 if (aarch64_tune_params.prefetch->num_slots > 0)
9067 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9068 aarch64_tune_params.prefetch->num_slots,
9069 opts->x_param_values,
9070 global_options_set.x_param_values);
9071 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9072 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9073 aarch64_tune_params.prefetch->l1_cache_size,
9074 opts->x_param_values,
9075 global_options_set.x_param_values);
9076 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9077 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9078 aarch64_tune_params.prefetch->l1_cache_line_size,
9079 opts->x_param_values,
9080 global_options_set.x_param_values);
9081 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9082 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9083 aarch64_tune_params.prefetch->l2_cache_size,
9084 opts->x_param_values,
9085 global_options_set.x_param_values);
9087 /* Use the alternative scheduling-pressure algorithm by default. */
9088 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
9089 opts->x_param_values,
9090 global_options_set.x_param_values);
9092 /* Enable software prefetching at the specified optimization level for
9093 CPUs that have prefetch tuning data. Lower the optimization level threshold by 1
9094 when profiling is enabled. */
9095 if (opts->x_flag_prefetch_loop_arrays < 0
9096 && !opts->x_optimize_size
9097 && aarch64_tune_params.prefetch->default_opt_level >= 0
9098 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9099 opts->x_flag_prefetch_loop_arrays = 1;
9101 aarch64_override_options_after_change_1 (opts);
9104 /* Print a hint with a suggestion for a core or architecture name that
9105 most closely resembles what the user passed in STR. ARCH is true if
9106 the user is asking for an architecture name. ARCH is false if the user
9107 is asking for a core name. */
9109 static void
9110 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9112 auto_vec<const char *> candidates;
9113 const struct processor *entry = arch ? all_architectures : all_cores;
9114 for (; entry->name != NULL; entry++)
9115 candidates.safe_push (entry->name);
9116 char *s;
9117 const char *hint = candidates_list_and_hint (str, s, candidates);
9118 if (hint)
9119 inform (input_location, "valid arguments are: %s;"
9120 " did you mean %qs?", s, hint);
9121 XDELETEVEC (s);
9124 /* Print a hint with a suggestion for a core name that most closely resembles
9125 what the user passed in STR. */
9127 inline static void
9128 aarch64_print_hint_for_core (const char *str)
9130 aarch64_print_hint_for_core_or_arch (str, false);
9133 /* Print a hint with a suggestion for an architecture name that most closely
9134 resembles what the user passed in STR. */
9136 inline static void
9137 aarch64_print_hint_for_arch (const char *str)
9139 aarch64_print_hint_for_core_or_arch (str, true);
9142 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9143 specified in STR and throw errors if appropriate. Put the results if
9144 they are valid in RES and ISA_FLAGS. Return whether the option is
9145 valid. */
9147 static bool
9148 aarch64_validate_mcpu (const char *str, const struct processor **res,
9149 unsigned long *isa_flags)
9151 enum aarch64_parse_opt_result parse_res
9152 = aarch64_parse_cpu (str, res, isa_flags);
9154 if (parse_res == AARCH64_PARSE_OK)
9155 return true;
9157 switch (parse_res)
9159 case AARCH64_PARSE_MISSING_ARG:
9160 error ("missing cpu name in %<-mcpu=%s%>", str);
9161 break;
9162 case AARCH64_PARSE_INVALID_ARG:
9163 error ("unknown value %qs for -mcpu", str);
9164 aarch64_print_hint_for_core (str);
9165 break;
9166 case AARCH64_PARSE_INVALID_FEATURE:
9167 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9168 break;
9169 default:
9170 gcc_unreachable ();
9173 return false;
9176 /* Validate a command-line -march option. Parse the arch and extensions
9177 (if any) specified in STR and throw errors if appropriate. Put the
9178 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9179 option is valid. */
9181 static bool
9182 aarch64_validate_march (const char *str, const struct processor **res,
9183 unsigned long *isa_flags)
9185 enum aarch64_parse_opt_result parse_res
9186 = aarch64_parse_arch (str, res, isa_flags);
9188 if (parse_res == AARCH64_PARSE_OK)
9189 return true;
9191 switch (parse_res)
9193 case AARCH64_PARSE_MISSING_ARG:
9194 error ("missing arch name in %<-march=%s%>", str);
9195 break;
9196 case AARCH64_PARSE_INVALID_ARG:
9197 error ("unknown value %qs for -march", str);
9198 aarch64_print_hint_for_arch (str);
9199 break;
9200 case AARCH64_PARSE_INVALID_FEATURE:
9201 error ("invalid feature modifier in %<-march=%s%>", str);
9202 break;
9203 default:
9204 gcc_unreachable ();
9207 return false;
9210 /* Validate a command-line -mtune option. Parse the cpu
9211 specified in STR and throw errors if appropriate. Put the
9212 result, if it is valid, in RES. Return whether the option is
9213 valid. */
9215 static bool
9216 aarch64_validate_mtune (const char *str, const struct processor **res)
9218 enum aarch64_parse_opt_result parse_res
9219 = aarch64_parse_tune (str, res);
9221 if (parse_res == AARCH64_PARSE_OK)
9222 return true;
9224 switch (parse_res)
9226 case AARCH64_PARSE_MISSING_ARG:
9227 error ("missing cpu name in %<-mtune=%s%>", str);
9228 break;
9229 case AARCH64_PARSE_INVALID_ARG:
9230 error ("unknown value %qs for -mtune", str);
9231 aarch64_print_hint_for_core (str);
9232 break;
9233 default:
9234 gcc_unreachable ();
9236 return false;
9239 /* Return the CPU corresponding to the enum CPU.
9240 If it doesn't specify a cpu, return the default. */
9242 static const struct processor *
9243 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9245 if (cpu != aarch64_none)
9246 return &all_cores[cpu];
9248 /* The & 0x3f is to extract the bottom 6 bits that encode the
9249 default cpu as selected by the --with-cpu GCC configure option
9250 in config.gcc.
9251 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9252 flags mechanism should be reworked to make it more sane. */
9253 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9256 /* Return the architecture corresponding to the enum ARCH.
9257 If it doesn't specify a valid architecture, return the default. */
9259 static const struct processor *
9260 aarch64_get_arch (enum aarch64_arch arch)
9262 if (arch != aarch64_no_arch)
9263 return &all_architectures[arch];
9265 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9267 return &all_architectures[cpu->arch];
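/* A note inferred from the code above and from aarch64_override_options
   below (not an authoritative description): TARGET_CPU_DEFAULT packs the
   configure-time default so that the bottom 6 bits (extracted with & 0x3f)
   give the index of the default CPU in all_cores, while the remaining bits
   (extracted with >> 6) give the default ISA flags.  */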
9270 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9271 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9272 tuning structs. In particular it must set selected_tune and
9273 aarch64_isa_flags that define the available ISA features and tuning
9274 decisions. It must also set selected_arch as this will be used to
9275 output the .arch asm tags for each function. */
9277 static void
9278 aarch64_override_options (void)
9280 unsigned long cpu_isa = 0;
9281 unsigned long arch_isa = 0;
9282 aarch64_isa_flags = 0;
9284 bool valid_cpu = true;
9285 bool valid_tune = true;
9286 bool valid_arch = true;
9288 selected_cpu = NULL;
9289 selected_arch = NULL;
9290 selected_tune = NULL;
9292 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9293 If either of -march or -mtune is given, they override their
9294 respective component of -mcpu. */
9295 if (aarch64_cpu_string)
9296 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9297 &cpu_isa);
9299 if (aarch64_arch_string)
9300 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9301 &arch_isa);
9303 if (aarch64_tune_string)
9304 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9306 /* If the user did not specify a processor, choose the default
9307 one for them. This will be the CPU set during configuration using
9308 --with-cpu, otherwise it is "generic". */
9309 if (!selected_cpu)
9311 if (selected_arch)
9313 selected_cpu = &all_cores[selected_arch->ident];
9314 aarch64_isa_flags = arch_isa;
9315 explicit_arch = selected_arch->arch;
9317 else
9319 /* Get default configure-time CPU. */
9320 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9321 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9324 if (selected_tune)
9325 explicit_tune_core = selected_tune->ident;
9327 /* If both -mcpu and -march are specified check that they are architecturally
9328 compatible, warn if they're not and prefer the -march ISA flags. */
9329 else if (selected_arch)
9331 if (selected_arch->arch != selected_cpu->arch)
9333 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9334 all_architectures[selected_cpu->arch].name,
9335 selected_arch->name);
9337 aarch64_isa_flags = arch_isa;
9338 explicit_arch = selected_arch->arch;
9339 explicit_tune_core = selected_tune ? selected_tune->ident
9340 : selected_cpu->ident;
9342 else
9344 /* -mcpu but no -march. */
9345 aarch64_isa_flags = cpu_isa;
9346 explicit_tune_core = selected_tune ? selected_tune->ident
9347 : selected_cpu->ident;
9348 gcc_assert (selected_cpu);
9349 selected_arch = &all_architectures[selected_cpu->arch];
9350 explicit_arch = selected_arch->arch;
9353 /* Set the arch as well, as we will need it when outputting
9354 the .arch directive in assembly. */
9355 if (!selected_arch)
9357 gcc_assert (selected_cpu);
9358 selected_arch = &all_architectures[selected_cpu->arch];
9361 if (!selected_tune)
9362 selected_tune = selected_cpu;
9364 #ifndef HAVE_AS_MABI_OPTION
9365 /* The compiler may have been configured with 2.23.* binutils, which does
9366 not have support for ILP32. */
9367 if (TARGET_ILP32)
9368 error ("Assembler does not support -mabi=ilp32");
9369 #endif
9371 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9372 sorry ("Return address signing is only supported for -mabi=lp64");
9374 /* Make sure we properly set up the explicit options. */
9375 if ((aarch64_cpu_string && valid_cpu)
9376 || (aarch64_tune_string && valid_tune))
9377 gcc_assert (explicit_tune_core != aarch64_none);
9379 if ((aarch64_cpu_string && valid_cpu)
9380 || (aarch64_arch_string && valid_arch))
9381 gcc_assert (explicit_arch != aarch64_no_arch);
9383 aarch64_override_options_internal (&global_options);
9385 /* Save these options as the default ones in case we push and pop them later
9386 while processing functions with potential target attributes. */
9387 target_option_default_node = target_option_current_node
9388 = build_target_option_node (&global_options);
9391 /* Implement targetm.override_options_after_change. */
9393 static void
9394 aarch64_override_options_after_change (void)
9396 aarch64_override_options_after_change_1 (&global_options);
9399 static struct machine_function *
9400 aarch64_init_machine_status (void)
9402 struct machine_function *machine;
9403 machine = ggc_cleared_alloc<machine_function> ();
9404 return machine;
9407 void
9408 aarch64_init_expanders (void)
9410 init_machine_status = aarch64_init_machine_status;
9413 /* A checking mechanism for the implementation of the various code models. */
9414 static void
9415 initialize_aarch64_code_model (struct gcc_options *opts)
9417 if (opts->x_flag_pic)
9419 switch (opts->x_aarch64_cmodel_var)
9421 case AARCH64_CMODEL_TINY:
9422 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9423 break;
9424 case AARCH64_CMODEL_SMALL:
9425 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9426 aarch64_cmodel = (flag_pic == 2
9427 ? AARCH64_CMODEL_SMALL_PIC
9428 : AARCH64_CMODEL_SMALL_SPIC);
9429 #else
9430 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9431 #endif
9432 break;
9433 case AARCH64_CMODEL_LARGE:
9434 sorry ("code model %qs with -f%s", "large",
9435 opts->x_flag_pic > 1 ? "PIC" : "pic");
9436 break;
9437 default:
9438 gcc_unreachable ();
9441 else
9442 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9445 /* Implement TARGET_OPTION_SAVE. */
9447 static void
9448 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9450 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9453 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9454 using the information saved in PTR. */
9456 static void
9457 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9459 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9460 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9461 opts->x_explicit_arch = ptr->x_explicit_arch;
9462 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9463 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9465 aarch64_override_options_internal (opts);
9468 /* Implement TARGET_OPTION_PRINT. */
9470 static void
9471 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9473 const struct processor *cpu
9474 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9475 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9476 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9477 std::string extension
9478 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9480 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9481 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9482 arch->name, extension.c_str ());
9485 static GTY(()) tree aarch64_previous_fndecl;
9487 void
9488 aarch64_reset_previous_fndecl (void)
9490 aarch64_previous_fndecl = NULL;
9493 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9494 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9495 make sure optab availability predicates are recomputed when necessary. */
9497 void
9498 aarch64_save_restore_target_globals (tree new_tree)
9500 if (TREE_TARGET_GLOBALS (new_tree))
9501 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9502 else if (new_tree == target_option_default_node)
9503 restore_target_globals (&default_target_globals);
9504 else
9505 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9508 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9509 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9510 of the function, if such exists. This function may be called multiple
9511 times on a single function so use aarch64_previous_fndecl to avoid
9512 setting up identical state. */
9514 static void
9515 aarch64_set_current_function (tree fndecl)
9517 if (!fndecl || fndecl == aarch64_previous_fndecl)
9518 return;
9520 tree old_tree = (aarch64_previous_fndecl
9521 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9522 : NULL_TREE);
9524 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9526 /* If current function has no attributes but the previous one did,
9527 use the default node. */
9528 if (!new_tree && old_tree)
9529 new_tree = target_option_default_node;
9531 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9532 the default have been handled by aarch64_save_restore_target_globals from
9533 aarch64_pragma_target_parse. */
9534 if (old_tree == new_tree)
9535 return;
9537 aarch64_previous_fndecl = fndecl;
9539 /* First set the target options. */
9540 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9542 aarch64_save_restore_target_globals (new_tree);
9545 /* Enum describing the various ways we can handle attributes.
9546 In many cases we can reuse the generic option handling machinery. */
9548 enum aarch64_attr_opt_type
9550 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9551 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9552 aarch64_attr_enum, /* Attribute sets an enum variable. */
9553 aarch64_attr_custom /* Attribute requires a custom handling function. */
9556 /* All the information needed to handle a target attribute.
9557 NAME is the name of the attribute.
9558 ATTR_TYPE specifies the type of behavior of the attribute as described
9559 in the definition of enum aarch64_attr_opt_type.
9560 ALLOW_NEG is true if the attribute supports a "no-" form.
9561 HANDLER is the function that takes the attribute string as an argument.
9562 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
9563 OPT_NUM is the enum specifying the option that the attribute modifies.
9564 This is needed for attributes that mirror the behavior of a command-line
9565 option; that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9566 aarch64_attr_enum. */
9568 struct aarch64_attribute_info
9570 const char *name;
9571 enum aarch64_attr_opt_type attr_type;
9572 bool allow_neg;
9573 bool (*handler) (const char *);
9574 enum opt_code opt_num;
9577 /* Handle the ARCH_STR argument to the arch= target attribute. */
9579 static bool
9580 aarch64_handle_attr_arch (const char *str)
9582 const struct processor *tmp_arch = NULL;
9583 enum aarch64_parse_opt_result parse_res
9584 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9586 if (parse_res == AARCH64_PARSE_OK)
9588 gcc_assert (tmp_arch);
9589 selected_arch = tmp_arch;
9590 explicit_arch = selected_arch->arch;
9591 return true;
9594 switch (parse_res)
9596 case AARCH64_PARSE_MISSING_ARG:
9597 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
9598 break;
9599 case AARCH64_PARSE_INVALID_ARG:
9600 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
9601 aarch64_print_hint_for_arch (str);
9602 break;
9603 case AARCH64_PARSE_INVALID_FEATURE:
9604 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9605 break;
9606 default:
9607 gcc_unreachable ();
9610 return false;
9613 /* Handle the argument CPU_STR to the cpu= target attribute. */
9615 static bool
9616 aarch64_handle_attr_cpu (const char *str)
9618 const struct processor *tmp_cpu = NULL;
9619 enum aarch64_parse_opt_result parse_res
9620 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9622 if (parse_res == AARCH64_PARSE_OK)
9624 gcc_assert (tmp_cpu);
9625 selected_tune = tmp_cpu;
9626 explicit_tune_core = selected_tune->ident;
9628 selected_arch = &all_architectures[tmp_cpu->arch];
9629 explicit_arch = selected_arch->arch;
9630 return true;
9633 switch (parse_res)
9635 case AARCH64_PARSE_MISSING_ARG:
9636 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
9637 break;
9638 case AARCH64_PARSE_INVALID_ARG:
9639 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
9640 aarch64_print_hint_for_core (str);
9641 break;
9642 case AARCH64_PARSE_INVALID_FEATURE:
9643 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9644 break;
9645 default:
9646 gcc_unreachable ();
9649 return false;
9652 /* Handle the argument STR to the tune= target attribute. */
9654 static bool
9655 aarch64_handle_attr_tune (const char *str)
9657 const struct processor *tmp_tune = NULL;
9658 enum aarch64_parse_opt_result parse_res
9659 = aarch64_parse_tune (str, &tmp_tune);
9661 if (parse_res == AARCH64_PARSE_OK)
9663 gcc_assert (tmp_tune);
9664 selected_tune = tmp_tune;
9665 explicit_tune_core = selected_tune->ident;
9666 return true;
9669 switch (parse_res)
9671 case AARCH64_PARSE_INVALID_ARG:
9672 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
9673 aarch64_print_hint_for_core (str);
9674 break;
9675 default:
9676 gcc_unreachable ();
9679 return false;
9682 /* Parse an architecture extensions target attribute string specified in STR.
9683 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9684 if successful. Update aarch64_isa_flags to reflect the ISA features
9685 modified. */
9687 static bool
9688 aarch64_handle_attr_isa_flags (char *str)
9690 enum aarch64_parse_opt_result parse_res;
9691 unsigned long isa_flags = aarch64_isa_flags;
9693 /* We allow "+nothing" in the beginning to clear out all architectural
9694 features if the user wants to handpick specific features. */
9695 if (strncmp ("+nothing", str, 8) == 0)
9697 isa_flags = 0;
9698 str += 8;
9701 parse_res = aarch64_parse_extension (str, &isa_flags);
9703 if (parse_res == AARCH64_PARSE_OK)
9705 aarch64_isa_flags = isa_flags;
9706 return true;
9709 switch (parse_res)
9711 case AARCH64_PARSE_MISSING_ARG:
9712 error ("missing value in %<target()%> pragma or attribute");
9713 break;
9715 case AARCH64_PARSE_INVALID_FEATURE:
9716 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9717 break;
9719 default:
9720 gcc_unreachable ();
9723 return false;
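/* Illustrative example of the handling above (for exposition only): an
   attribute string such as "+nothing+simd" first clears isa_flags via the
   "+nothing" prefix and then re-enables only the named features, leaving the
   function with just the ISA flags implied by +simd.  */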
9726 /* The target attributes that we support. On top of these we also support just
9727 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9728 handled explicitly in aarch64_process_one_target_attr. */
9730 static const struct aarch64_attribute_info aarch64_attributes[] =
9732 { "general-regs-only", aarch64_attr_mask, false, NULL,
9733 OPT_mgeneral_regs_only },
9734 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9735 OPT_mfix_cortex_a53_835769 },
9736 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9737 OPT_mfix_cortex_a53_843419 },
9738 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9739 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9740 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9741 OPT_momit_leaf_frame_pointer },
9742 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9743 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9744 OPT_march_ },
9745 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9746 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9747 OPT_mtune_ },
9748 { "sign-return-address", aarch64_attr_enum, false, NULL,
9749 OPT_msign_return_address_ },
9750 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9753 /* Parse ARG_STR which contains the definition of one target attribute.
9754 Show appropriate errors, if any, and return true if the attribute is valid. */
9756 static bool
9757 aarch64_process_one_target_attr (char *arg_str)
9759 bool invert = false;
9761 size_t len = strlen (arg_str);
9763 if (len == 0)
9765 error ("malformed %<target()%> pragma or attribute");
9766 return false;
9769 char *str_to_check = (char *) alloca (len + 1);
9770 strcpy (str_to_check, arg_str);
9772 /* Skip leading whitespace. */
9773 while (*str_to_check == ' ' || *str_to_check == '\t')
9774 str_to_check++;
9776 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9777 It is easier to detect and handle it explicitly here rather than going
9778 through the machinery for the rest of the target attributes in this
9779 function. */
9780 if (*str_to_check == '+')
9781 return aarch64_handle_attr_isa_flags (str_to_check);
9783 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9785 invert = true;
9786 str_to_check += 3;
9788 char *arg = strchr (str_to_check, '=');
9790 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9791 and point ARG to "foo". */
9792 if (arg)
9794 *arg = '\0';
9795 arg++;
9797 const struct aarch64_attribute_info *p_attr;
9798 bool found = false;
9799 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9801 /* If the names don't match up, or the user has given an argument
9802 to an attribute that doesn't accept one, or didn't give an argument
9803 to an attribute that expects one, fail to match. */
9804 if (strcmp (str_to_check, p_attr->name) != 0)
9805 continue;
9807 found = true;
9808 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9809 || p_attr->attr_type == aarch64_attr_enum;
9811 if (attr_need_arg_p ^ (arg != NULL))
9813 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
9814 return false;
9817 /* If the name matches but the attribute does not allow "no-" versions
9818 then we can't match. */
9819 if (invert && !p_attr->allow_neg)
9821 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
9822 return false;
9825 switch (p_attr->attr_type)
9827 /* Has a custom handler registered.
9828 For example, cpu=, arch=, tune=. */
9829 case aarch64_attr_custom:
9830 gcc_assert (p_attr->handler);
9831 if (!p_attr->handler (arg))
9832 return false;
9833 break;
9835 /* Either set or unset a boolean option. */
9836 case aarch64_attr_bool:
9838 struct cl_decoded_option decoded;
9840 generate_option (p_attr->opt_num, NULL, !invert,
9841 CL_TARGET, &decoded);
9842 aarch64_handle_option (&global_options, &global_options_set,
9843 &decoded, input_location);
9844 break;
9846 /* Set or unset a bit in the target_flags. aarch64_handle_option
9847 should know what mask to apply given the option number. */
9848 case aarch64_attr_mask:
9850 struct cl_decoded_option decoded;
9851 /* We only need to specify the option number.
9852 aarch64_handle_option will know which mask to apply. */
9853 decoded.opt_index = p_attr->opt_num;
9854 decoded.value = !invert;
9855 aarch64_handle_option (&global_options, &global_options_set,
9856 &decoded, input_location);
9857 break;
9859 /* Use the option setting machinery to set an option to an enum. */
9860 case aarch64_attr_enum:
9862 gcc_assert (arg);
9863 bool valid;
9864 int value;
9865 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9866 &value, CL_TARGET);
9867 if (valid)
9869 set_option (&global_options, NULL, p_attr->opt_num, value,
9870 NULL, DK_UNSPECIFIED, input_location,
9871 global_dc);
9873 else
9875 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
9877 break;
9879 default:
9880 gcc_unreachable ();
9884 /* If we reached here we either have found an attribute and validated
9885 it or didn't match any. If we matched an attribute but its arguments
9886 were malformed we will have returned false already. */
9887 return found;
9890 /* Count how many times the character C appears in
9891 NULL-terminated string STR. */
9893 static unsigned int
9894 num_occurences_in_str (char c, char *str)
9896 unsigned int res = 0;
9897 while (*str != '\0')
9899 if (*str == c)
9900 res++;
9902 str++;
9905 return res;
9908 /* Parse the tree in ARGS that contains the target attribute information
9909 and update the global target options space. */
9911 bool
9912 aarch64_process_target_attr (tree args)
9914 if (TREE_CODE (args) == TREE_LIST)
9918 tree head = TREE_VALUE (args);
9919 if (head)
9921 if (!aarch64_process_target_attr (head))
9922 return false;
9924 args = TREE_CHAIN (args);
9925 } while (args);
9927 return true;
9930 if (TREE_CODE (args) != STRING_CST)
9932 error ("attribute %<target%> argument not a string");
9933 return false;
9936 size_t len = strlen (TREE_STRING_POINTER (args));
9937 char *str_to_check = (char *) alloca (len + 1);
9938 strcpy (str_to_check, TREE_STRING_POINTER (args));
9940 if (len == 0)
9942 error ("malformed %<target()%> pragma or attribute");
9943 return false;
9946 /* Used to catch empty strings between commas, e.g.
9947 attribute ((target ("attr1,,attr2"))). */
9948 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9950 /* Handle multiple target attributes separated by ','. */
9951 char *token = strtok (str_to_check, ",");
9953 unsigned int num_attrs = 0;
9954 while (token)
9956 num_attrs++;
9957 if (!aarch64_process_one_target_attr (token))
9959 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
9960 return false;
9963 token = strtok (NULL, ",");
9966 if (num_attrs != num_commas + 1)
9968 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
9969 return false;
9972 return true;
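/* Illustrative example of the processing above (for exposition only): an
   attribute such as
   __attribute__ ((target ("arch=armv8-a,no-omit-leaf-frame-pointer")))
   is split on ',' by the strtok loop into two tokens, each processed by
   aarch64_process_one_target_attr, while the num_commas check rejects
   ill-formed strings with empty tokens such as "attr1,,attr2".  */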
9975 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9976 process attribute ((target ("..."))). */
9978 static bool
9979 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9981 struct cl_target_option cur_target;
9982 bool ret;
9983 tree old_optimize;
9984 tree new_target, new_optimize;
9985 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9987 /* If what we're processing is the current pragma string then the
9988 target option node is already stored in target_option_current_node
9989 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9990 having to re-parse the string. This is especially useful to keep
9991 arm_neon.h compile times down since that header contains a lot
9992 of intrinsics enclosed in pragmas. */
9993 if (!existing_target && args == current_target_pragma)
9995 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9996 return true;
9998 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10000 old_optimize = build_optimization_node (&global_options);
10001 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10003 /* If the function changed the optimization levels as well as setting
10004 target options, start with the optimizations specified. */
10005 if (func_optimize && func_optimize != old_optimize)
10006 cl_optimization_restore (&global_options,
10007 TREE_OPTIMIZATION (func_optimize));
10009 /* Save the current target options to restore at the end. */
10010 cl_target_option_save (&cur_target, &global_options);
10012 /* If fndecl already has some target attributes applied to it, unpack
10013 them so that we add this attribute on top of them, rather than
10014 overwriting them. */
10015 if (existing_target)
10017 struct cl_target_option *existing_options
10018 = TREE_TARGET_OPTION (existing_target);
10020 if (existing_options)
10021 cl_target_option_restore (&global_options, existing_options);
10023 else
10024 cl_target_option_restore (&global_options,
10025 TREE_TARGET_OPTION (target_option_current_node));
10027 ret = aarch64_process_target_attr (args);
10029 /* Set up any additional state. */
10030 if (ret)
10032 aarch64_override_options_internal (&global_options);
10033 /* Initialize SIMD builtins if we haven't already.
10034 Set current_target_pragma to NULL for the duration so that
10035 the builtin initialization code doesn't try to tag the functions
10036 being built with the attributes specified by any current pragma, thus
10037 going into an infinite recursion. */
10038 if (TARGET_SIMD)
10040 tree saved_current_target_pragma = current_target_pragma;
10041 current_target_pragma = NULL;
10042 aarch64_init_simd_builtins ();
10043 current_target_pragma = saved_current_target_pragma;
10045 new_target = build_target_option_node (&global_options);
10047 else
10048 new_target = NULL;
10050 new_optimize = build_optimization_node (&global_options);
10052 if (fndecl && ret)
10054 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10056 if (old_optimize != new_optimize)
10057 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10060 cl_target_option_restore (&global_options, &cur_target);
10062 if (old_optimize != new_optimize)
10063 cl_optimization_restore (&global_options,
10064 TREE_OPTIMIZATION (old_optimize));
10065 return ret;
10068 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10069 tri-bool options (yes, no, don't care) and the default value is
10070 DEF, determine whether to reject inlining. */
10072 static bool
10073 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10074 int dont_care, int def)
10076 /* If the callee doesn't care, always allow inlining. */
10077 if (callee == dont_care)
10078 return true;
10080 /* If the caller doesn't care, always allow inlining. */
10081 if (caller == dont_care)
10082 return true;
10084 /* Otherwise, allow inlining if either the callee and caller values
10085 agree, or if the callee is using the default value. */
10086 return (callee == caller || callee == def);
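/* Illustrative example (not part of the original sources): with
   DONT_CARE == 2 and DEF == 1, caller == 1 / callee == 0 rejects inlining
   (the callee neither matches the caller nor uses the default), whereas
   caller == 0 / callee == 1 is accepted because the callee uses the
   default value.  */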
10089 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10090 to inline CALLEE into CALLER based on target-specific info.
10091 Make sure that the caller and callee have compatible architectural
10092 features. Then go through the other possible target attributes
10093 and see if they can block inlining. Try not to reject always_inline
10094 callees unless they are incompatible architecturally. */
10096 static bool
10097 aarch64_can_inline_p (tree caller, tree callee)
10099 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10100 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10102 /* If callee has no option attributes, then it is ok to inline. */
10103 if (!callee_tree)
10104 return true;
10106 struct cl_target_option *caller_opts
10107 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10108 : target_option_default_node);
10110 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10113 /* Callee's ISA flags should be a subset of the caller's. */
10114 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10115 != callee_opts->x_aarch64_isa_flags)
10116 return false;
10118 /* Allow non-strict-align functions to be inlined into strict-align
10119 ones, but not the other way around. */
10120 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10121 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10122 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10123 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10124 return false;
10126 bool always_inline = lookup_attribute ("always_inline",
10127 DECL_ATTRIBUTES (callee));
10129 /* If the architectural features match up and the callee is always_inline
10130 then the other attributes don't matter. */
10131 if (always_inline)
10132 return true;
10134 if (caller_opts->x_aarch64_cmodel_var
10135 != callee_opts->x_aarch64_cmodel_var)
10136 return false;
10138 if (caller_opts->x_aarch64_tls_dialect
10139 != callee_opts->x_aarch64_tls_dialect)
10140 return false;
10142 /* Honour explicit requests to work around errata. */
10143 if (!aarch64_tribools_ok_for_inlining_p (
10144 caller_opts->x_aarch64_fix_a53_err835769,
10145 callee_opts->x_aarch64_fix_a53_err835769,
10146 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10147 return false;
10149 if (!aarch64_tribools_ok_for_inlining_p (
10150 caller_opts->x_aarch64_fix_a53_err843419,
10151 callee_opts->x_aarch64_fix_a53_err843419,
10152 2, TARGET_FIX_ERR_A53_843419))
10153 return false;
10155 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10156 caller and callee and they don't match up, reject inlining. */
10157 if (!aarch64_tribools_ok_for_inlining_p (
10158 caller_opts->x_flag_omit_leaf_frame_pointer,
10159 callee_opts->x_flag_omit_leaf_frame_pointer,
10160 2, 1))
10161 return false;
10163 /* If the callee has specific tuning overrides, respect them. */
10164 if (callee_opts->x_aarch64_override_tune_string != NULL
10165 && caller_opts->x_aarch64_override_tune_string == NULL)
10166 return false;
10168 /* If the user specified tuning override strings for the
10169 caller and callee and they don't match up, reject inlining.
10170 We just do a string compare here, we don't analyze the meaning
10171 of the string, as it would be too costly for little gain. */
10172 if (callee_opts->x_aarch64_override_tune_string
10173 && caller_opts->x_aarch64_override_tune_string
10174 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10175 caller_opts->x_aarch64_override_tune_string) != 0))
10176 return false;
10178 return true;
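/* Illustrative example (not part of the original sources, assuming the
   "+crc" extension is given through the target attribute): a callee declared
   with __attribute__ ((target ("+crc"))) carries AARCH64_FL_CRC in its
   x_aarch64_isa_flags, so it is rejected for inlining into a caller built
   without +crc, because the callee's ISA flags are then not a subset of the
   caller's.  */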
10181 /* Return true if SYMBOL_REF X binds locally. */
10183 static bool
10184 aarch64_symbol_binds_local_p (const_rtx x)
10186 return (SYMBOL_REF_DECL (x)
10187 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10188 : SYMBOL_REF_LOCAL_P (x));
10191 /* Return true if SYMBOL_REF X is thread local. */
10192 static bool
10193 aarch64_tls_symbol_p (rtx x)
10195 if (! TARGET_HAVE_TLS)
10196 return false;
10198 if (GET_CODE (x) != SYMBOL_REF)
10199 return false;
10201 return SYMBOL_REF_TLS_MODEL (x) != 0;
10204 /* Classify a TLS symbol into one of the TLS kinds. */
10205 enum aarch64_symbol_type
10206 aarch64_classify_tls_symbol (rtx x)
10208 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10210 switch (tls_kind)
10212 case TLS_MODEL_GLOBAL_DYNAMIC:
10213 case TLS_MODEL_LOCAL_DYNAMIC:
10214 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10216 case TLS_MODEL_INITIAL_EXEC:
10217 switch (aarch64_cmodel)
10219 case AARCH64_CMODEL_TINY:
10220 case AARCH64_CMODEL_TINY_PIC:
10221 return SYMBOL_TINY_TLSIE;
10222 default:
10223 return SYMBOL_SMALL_TLSIE;
10226 case TLS_MODEL_LOCAL_EXEC:
10227 if (aarch64_tls_size == 12)
10228 return SYMBOL_TLSLE12;
10229 else if (aarch64_tls_size == 24)
10230 return SYMBOL_TLSLE24;
10231 else if (aarch64_tls_size == 32)
10232 return SYMBOL_TLSLE32;
10233 else if (aarch64_tls_size == 48)
10234 return SYMBOL_TLSLE48;
10235 else
10236 gcc_unreachable ();
10238 case TLS_MODEL_EMULATED:
10239 case TLS_MODEL_NONE:
10240 return SYMBOL_FORCE_TO_MEM;
10242 default:
10243 gcc_unreachable ();
10247 /* Return the method that should be used to access SYMBOL_REF or
10248 LABEL_REF X. */
10250 enum aarch64_symbol_type
10251 aarch64_classify_symbol (rtx x, rtx offset)
10253 if (GET_CODE (x) == LABEL_REF)
10255 switch (aarch64_cmodel)
10257 case AARCH64_CMODEL_LARGE:
10258 return SYMBOL_FORCE_TO_MEM;
10260 case AARCH64_CMODEL_TINY_PIC:
10261 case AARCH64_CMODEL_TINY:
10262 return SYMBOL_TINY_ABSOLUTE;
10264 case AARCH64_CMODEL_SMALL_SPIC:
10265 case AARCH64_CMODEL_SMALL_PIC:
10266 case AARCH64_CMODEL_SMALL:
10267 return SYMBOL_SMALL_ABSOLUTE;
10269 default:
10270 gcc_unreachable ();
10274 if (GET_CODE (x) == SYMBOL_REF)
10276 if (aarch64_tls_symbol_p (x))
10277 return aarch64_classify_tls_symbol (x);
10279 switch (aarch64_cmodel)
10281 case AARCH64_CMODEL_TINY:
10282 /* When we retrieve symbol + offset address, we have to make sure
10283 the offset does not cause overflow of the final address. But
10284 we have no way of knowing the address of symbol at compile time
10285 so we can't accurately say if the distance between the PC and
10286 symbol + offset is outside the addressable range of +/-1M in the
10287 TINY code model. So we rely on images not being greater than
10288 1M and cap the offset at 1M and anything beyond 1M will have to
10289 be loaded using an alternative mechanism. Furthermore if the
10290 symbol is a weak reference to something that isn't known to
10291 resolve to a symbol in this module, then force to memory. */
10292 if ((SYMBOL_REF_WEAK (x)
10293 && !aarch64_symbol_binds_local_p (x))
10294 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10295 return SYMBOL_FORCE_TO_MEM;
10296 return SYMBOL_TINY_ABSOLUTE;
10298 case AARCH64_CMODEL_SMALL:
10299 /* Same reasoning as the tiny code model, but the offset cap here is
10300 4G. */
10301 if ((SYMBOL_REF_WEAK (x)
10302 && !aarch64_symbol_binds_local_p (x))
10303 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10304 HOST_WIDE_INT_C (4294967264)))
10305 return SYMBOL_FORCE_TO_MEM;
10306 return SYMBOL_SMALL_ABSOLUTE;
10308 case AARCH64_CMODEL_TINY_PIC:
10309 if (!aarch64_symbol_binds_local_p (x))
10310 return SYMBOL_TINY_GOT;
10311 return SYMBOL_TINY_ABSOLUTE;
10313 case AARCH64_CMODEL_SMALL_SPIC:
10314 case AARCH64_CMODEL_SMALL_PIC:
10315 if (!aarch64_symbol_binds_local_p (x))
10316 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10317 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10318 return SYMBOL_SMALL_ABSOLUTE;
10320 case AARCH64_CMODEL_LARGE:
10321 /* This is alright even in PIC code as the constant
10322 pool reference is always PC relative and within
10323 the same translation unit. */
10324 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10325 return SYMBOL_SMALL_ABSOLUTE;
10326 else
10327 return SYMBOL_FORCE_TO_MEM;
10329 default:
10330 gcc_unreachable ();
10334 /* By default push everything into the constant pool. */
10335 return SYMBOL_FORCE_TO_MEM;
10338 bool
10339 aarch64_constant_address_p (rtx x)
10341 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10344 bool
10345 aarch64_legitimate_pic_operand_p (rtx x)
10347 if (GET_CODE (x) == SYMBOL_REF
10348 || (GET_CODE (x) == CONST
10349 && GET_CODE (XEXP (x, 0)) == PLUS
10350 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10351 return false;
10353 return true;
10356 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
10357 that should be rematerialized rather than spilled. */
10359 static bool
10360 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10362 /* Support CSE and rematerialization of common constants. */
10363 if (CONST_INT_P (x) || CONST_DOUBLE_P (x) || GET_CODE (x) == CONST_VECTOR)
10364 return true;
10366 /* Do not allow vector struct mode constants. We could support
10367 0 and -1 easily, but they need support in aarch64-simd.md. */
10368 if (aarch64_vect_struct_mode_p (mode))
10369 return false;
10371 /* Do not allow wide int constants - this requires support in movti. */
10372 if (CONST_WIDE_INT_P (x))
10373 return false;
10375 /* Do not allow const (plus (anchor_symbol, const_int)). */
10376 if (GET_CODE (x) == CONST)
10378 rtx offset;
10380 split_const (x, &x, &offset);
10382 if (SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
10383 return false;
10386 if (GET_CODE (x) == HIGH)
10387 x = XEXP (x, 0);
10389 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10390 so spilling them is better than rematerialization. */
10391 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10392 return true;
10394 /* Label references are always constant. */
10395 if (GET_CODE (x) == LABEL_REF)
10396 return true;
10398 return false;
10402 aarch64_load_tp (rtx target)
10404 if (!target
10405 || GET_MODE (target) != Pmode
10406 || !register_operand (target, Pmode))
10407 target = gen_reg_rtx (Pmode);
10409 /* Can return in any reg. */
10410 emit_insn (gen_aarch64_load_tp_hard (target));
10411 return target;
10414 /* On AAPCS systems, this is the "struct __va_list". */
10415 static GTY(()) tree va_list_type;
10417 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10418 Return the type to use as __builtin_va_list.
10420 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10422 struct __va_list
10424 void *__stack;
10425 void *__gr_top;
10426 void *__vr_top;
10427 int __gr_offs;
10428 int __vr_offs;
10429 }; */
10431 static tree
10432 aarch64_build_builtin_va_list (void)
10434 tree va_list_name;
10435 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10437 /* Create the type. */
10438 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10439 /* Give it the required name. */
10440 va_list_name = build_decl (BUILTINS_LOCATION,
10441 TYPE_DECL,
10442 get_identifier ("__va_list"),
10443 va_list_type);
10444 DECL_ARTIFICIAL (va_list_name) = 1;
10445 TYPE_NAME (va_list_type) = va_list_name;
10446 TYPE_STUB_DECL (va_list_type) = va_list_name;
10448 /* Create the fields. */
10449 f_stack = build_decl (BUILTINS_LOCATION,
10450 FIELD_DECL, get_identifier ("__stack"),
10451 ptr_type_node);
10452 f_grtop = build_decl (BUILTINS_LOCATION,
10453 FIELD_DECL, get_identifier ("__gr_top"),
10454 ptr_type_node);
10455 f_vrtop = build_decl (BUILTINS_LOCATION,
10456 FIELD_DECL, get_identifier ("__vr_top"),
10457 ptr_type_node);
10458 f_groff = build_decl (BUILTINS_LOCATION,
10459 FIELD_DECL, get_identifier ("__gr_offs"),
10460 integer_type_node);
10461 f_vroff = build_decl (BUILTINS_LOCATION,
10462 FIELD_DECL, get_identifier ("__vr_offs"),
10463 integer_type_node);
10465 /* Tell tree-stdarg pass about our internal offset fields.
10466 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10467 purposes, to identify whether the code is updating the va_list internal
10468 offset fields in an irregular way. */
10469 va_list_gpr_counter_field = f_groff;
10470 va_list_fpr_counter_field = f_vroff;
10472 DECL_ARTIFICIAL (f_stack) = 1;
10473 DECL_ARTIFICIAL (f_grtop) = 1;
10474 DECL_ARTIFICIAL (f_vrtop) = 1;
10475 DECL_ARTIFICIAL (f_groff) = 1;
10476 DECL_ARTIFICIAL (f_vroff) = 1;
10478 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10479 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10480 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10481 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10482 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10484 TYPE_FIELDS (va_list_type) = f_stack;
10485 DECL_CHAIN (f_stack) = f_grtop;
10486 DECL_CHAIN (f_grtop) = f_vrtop;
10487 DECL_CHAIN (f_vrtop) = f_groff;
10488 DECL_CHAIN (f_groff) = f_vroff;
10490 /* Compute its layout. */
10491 layout_type (va_list_type);
10493 return va_list_type;
10496 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10497 static void
10498 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10500 const CUMULATIVE_ARGS *cum;
10501 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10502 tree stack, grtop, vrtop, groff, vroff;
10503 tree t;
10504 int gr_save_area_size = cfun->va_list_gpr_size;
10505 int vr_save_area_size = cfun->va_list_fpr_size;
10506 int vr_offset;
10508 cum = &crtl->args.info;
10509 if (cfun->va_list_gpr_size)
10510 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10511 cfun->va_list_gpr_size);
10512 if (cfun->va_list_fpr_size)
10513 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10514 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10516 if (!TARGET_FLOAT)
10518 gcc_assert (cum->aapcs_nvrn == 0);
10519 vr_save_area_size = 0;
10522 f_stack = TYPE_FIELDS (va_list_type_node);
10523 f_grtop = DECL_CHAIN (f_stack);
10524 f_vrtop = DECL_CHAIN (f_grtop);
10525 f_groff = DECL_CHAIN (f_vrtop);
10526 f_vroff = DECL_CHAIN (f_groff);
10528 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10529 NULL_TREE);
10530 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10531 NULL_TREE);
10532 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10533 NULL_TREE);
10534 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10535 NULL_TREE);
10536 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10537 NULL_TREE);
10539 /* Emit code to initialize STACK, which points to the next varargs stack
10540 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10541 by named arguments. STACK is 8-byte aligned. */
10542 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10543 if (cum->aapcs_stack_size > 0)
10544 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10545 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10546 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10548 /* Emit code to initialize GRTOP, the top of the GR save area.
10549 virtual_incoming_args_rtx should have been 16 byte aligned. */
10550 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10551 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10552 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10554 /* Emit code to initialize VRTOP, the top of the VR save area.
10555 This address is gr_save_area_bytes below GRTOP, rounded
10556 down to the next 16-byte boundary. */
10557 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10558 vr_offset = ROUND_UP (gr_save_area_size,
10559 STACK_BOUNDARY / BITS_PER_UNIT);
10561 if (vr_offset)
10562 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10563 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10564 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10566 /* Emit code to initialize GROFF, the offset from GRTOP of the
10567 next GPR argument. */
10568 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10569 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10570 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10572 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10573 of the next VR argument. */
10574 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10575 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10576 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
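/* Worked example (illustrative, assuming the tree-stdarg pass has not
   narrowed cfun->va_list_gpr_size/va_list_fpr_size): for

     void f (int n, ...);

   the named argument consumes only X0, so gr_save_area_size is
   7 * UNITS_PER_WORD == 56 and, with TARGET_FLOAT, vr_save_area_size is
   8 * UNITS_PER_VREG == 128.  __gr_offs is therefore initialized to -56 and
   __vr_offs to -128, while __gr_top and __vr_top point at the tops of the
   respective save areas.  */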
10579 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10581 static tree
10582 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10583 gimple_seq *post_p ATTRIBUTE_UNUSED)
10585 tree addr;
10586 bool indirect_p;
10587 bool is_ha; /* is HFA or HVA. */
10588 bool dw_align; /* double-word align. */
10589 machine_mode ag_mode = VOIDmode;
10590 int nregs;
10591 machine_mode mode;
10593 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10594 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10595 HOST_WIDE_INT size, rsize, adjust, align;
10596 tree t, u, cond1, cond2;
10598 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10599 if (indirect_p)
10600 type = build_pointer_type (type);
10602 mode = TYPE_MODE (type);
10604 f_stack = TYPE_FIELDS (va_list_type_node);
10605 f_grtop = DECL_CHAIN (f_stack);
10606 f_vrtop = DECL_CHAIN (f_grtop);
10607 f_groff = DECL_CHAIN (f_vrtop);
10608 f_vroff = DECL_CHAIN (f_groff);
10610 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10611 f_stack, NULL_TREE);
10612 size = int_size_in_bytes (type);
10613 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10615 dw_align = false;
10616 adjust = 0;
10617 if (aarch64_vfp_is_call_or_return_candidate (mode,
10618 type,
10619 &ag_mode,
10620 &nregs,
10621 &is_ha))
10623 /* TYPE passed in fp/simd registers. */
10624 if (!TARGET_FLOAT)
10625 aarch64_err_no_fpadvsimd (mode, "varargs");
10627 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10628 unshare_expr (valist), f_vrtop, NULL_TREE);
10629 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10630 unshare_expr (valist), f_vroff, NULL_TREE);
10632 rsize = nregs * UNITS_PER_VREG;
10634 if (is_ha)
10636 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10637 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10639 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10640 && size < UNITS_PER_VREG)
10642 adjust = UNITS_PER_VREG - size;
10645 else
10647 /* TYPE passed in general registers. */
10648 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10649 unshare_expr (valist), f_grtop, NULL_TREE);
10650 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10651 unshare_expr (valist), f_groff, NULL_TREE);
10652 rsize = ROUND_UP (size, UNITS_PER_WORD);
10653 nregs = rsize / UNITS_PER_WORD;
10655 if (align > 8)
10656 dw_align = true;
10658 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10659 && size < UNITS_PER_WORD)
10661 adjust = UNITS_PER_WORD - size;
10665 /* Get a local temporary for the field value. */
10666 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10668 /* Emit code to branch if off >= 0. */
10669 t = build2 (GE_EXPR, boolean_type_node, off,
10670 build_int_cst (TREE_TYPE (off), 0));
10671 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10673 if (dw_align)
10675 /* Emit: offs = (offs + 15) & -16. */
10676 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10677 build_int_cst (TREE_TYPE (off), 15));
10678 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10679 build_int_cst (TREE_TYPE (off), -16));
10680 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10682 else
10683 roundup = NULL;
10685 /* Update ap.__[g|v]r_offs */
10686 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10687 build_int_cst (TREE_TYPE (off), rsize));
10688 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10690 /* String up. */
10691 if (roundup)
10692 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10694 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10695 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10696 build_int_cst (TREE_TYPE (f_off), 0));
10697 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10699 /* String up: make sure the assignment happens before the use. */
10700 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10701 COND_EXPR_ELSE (cond1) = t;
10703 /* Prepare the trees handling the argument that is passed on the stack;
10704 the top level node will be stored in ON_STACK. */
10705 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10706 if (align > 8)
10708 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10709 t = fold_convert (intDI_type_node, arg);
10710 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10711 build_int_cst (TREE_TYPE (t), 15));
10712 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10713 build_int_cst (TREE_TYPE (t), -16));
10714 t = fold_convert (TREE_TYPE (arg), t);
10715 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10717 else
10718 roundup = NULL;
10719 /* Advance ap.__stack */
10720 t = fold_convert (intDI_type_node, arg);
10721 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10722 build_int_cst (TREE_TYPE (t), size + 7));
10723 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10724 build_int_cst (TREE_TYPE (t), -8));
10725 t = fold_convert (TREE_TYPE (arg), t);
10726 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10727 /* String up roundup and advance. */
10728 if (roundup)
10729 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10730 /* String up with arg */
10731 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10732 /* Big-endianness related address adjustment. */
10733 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10734 && size < UNITS_PER_WORD)
10736 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10737 size_int (UNITS_PER_WORD - size));
10738 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10741 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10742 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10744 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10745 t = off;
10746 if (adjust)
10747 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10748 build_int_cst (TREE_TYPE (off), adjust));
10750 t = fold_convert (sizetype, t);
10751 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10753 if (is_ha)
10755 /* type ha; // treat as "struct {ftype field[n];}"
10756 ... [computing offs]
10757 for (i = 0; i <nregs; ++i, offs += 16)
10758 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10759 return ha; */
10760 int i;
10761 tree tmp_ha, field_t, field_ptr_t;
10763 /* Declare a local variable. */
10764 tmp_ha = create_tmp_var_raw (type, "ha");
10765 gimple_add_tmp_var (tmp_ha);
10767 /* Establish the base type. */
10768 switch (ag_mode)
10770 case E_SFmode:
10771 field_t = float_type_node;
10772 field_ptr_t = float_ptr_type_node;
10773 break;
10774 case E_DFmode:
10775 field_t = double_type_node;
10776 field_ptr_t = double_ptr_type_node;
10777 break;
10778 case E_TFmode:
10779 field_t = long_double_type_node;
10780 field_ptr_t = long_double_ptr_type_node;
10781 break;
10782 case E_HFmode:
10783 field_t = aarch64_fp16_type_node;
10784 field_ptr_t = aarch64_fp16_ptr_type_node;
10785 break;
10786 case E_V2SImode:
10787 case E_V4SImode:
10789 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10790 field_t = build_vector_type_for_mode (innertype, ag_mode);
10791 field_ptr_t = build_pointer_type (field_t);
10793 break;
10794 default:
10795 gcc_assert (0);
10798 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
10799 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10800 addr = t;
10801 t = fold_convert (field_ptr_t, addr);
10802 t = build2 (MODIFY_EXPR, field_t,
10803 build1 (INDIRECT_REF, field_t, tmp_ha),
10804 build1 (INDIRECT_REF, field_t, t));
10806 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10807 for (i = 1; i < nregs; ++i)
10809 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10810 u = fold_convert (field_ptr_t, addr);
10811 u = build2 (MODIFY_EXPR, field_t,
10812 build2 (MEM_REF, field_t, tmp_ha,
10813 build_int_cst (field_ptr_t,
10814 (i *
10815 int_size_in_bytes (field_t)))),
10816 build1 (INDIRECT_REF, field_t, u));
10817 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10820 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10821 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10824 COND_EXPR_ELSE (cond2) = t;
10825 addr = fold_convert (build_pointer_type (type), cond1);
10826 addr = build_va_arg_indirect_ref (addr);
10828 if (indirect_p)
10829 addr = build_va_arg_indirect_ref (addr);
10831 return addr;
10834 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10836 static void
10837 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10838 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10839 int no_rtl)
10841 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10842 CUMULATIVE_ARGS local_cum;
10843 int gr_saved = cfun->va_list_gpr_size;
10844 int vr_saved = cfun->va_list_fpr_size;
10846 /* The caller has advanced CUM up to, but not beyond, the last named
10847 argument. Advance a local copy of CUM past the last "real" named
10848 argument, to find out how many registers are left over. */
10849 local_cum = *cum;
10850 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10852 /* Find out how many registers we need to save.
10853 Honor the tree-stdarg analysis results. */
10854 if (cfun->va_list_gpr_size)
10855 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10856 cfun->va_list_gpr_size / UNITS_PER_WORD);
10857 if (cfun->va_list_fpr_size)
10858 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10859 cfun->va_list_fpr_size / UNITS_PER_VREG);
10861 if (!TARGET_FLOAT)
10863 gcc_assert (local_cum.aapcs_nvrn == 0);
10864 vr_saved = 0;
10867 if (!no_rtl)
10869 if (gr_saved > 0)
10871 rtx ptr, mem;
10873 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10874 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10875 - gr_saved * UNITS_PER_WORD);
10876 mem = gen_frame_mem (BLKmode, ptr);
10877 set_mem_alias_set (mem, get_varargs_alias_set ());
10879 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10880 mem, gr_saved);
10882 if (vr_saved > 0)
10884 /* We can't use move_block_from_reg, because it will use
10885 the wrong mode, storing D regs only. */
10886 machine_mode mode = TImode;
10887 int off, i, vr_start;
10889 /* Set OFF to the offset from virtual_incoming_args_rtx of
10890 the first vector register. The VR save area lies below
10891 the GR one, and is aligned to 16 bytes. */
10892 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10893 STACK_BOUNDARY / BITS_PER_UNIT);
10894 off -= vr_saved * UNITS_PER_VREG;
10896 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10897 for (i = 0; i < vr_saved; ++i)
10899 rtx ptr, mem;
10901 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10902 mem = gen_frame_mem (mode, ptr);
10903 set_mem_alias_set (mem, get_varargs_alias_set ());
10904 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10905 off += UNITS_PER_VREG;
10910 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10911 any complication of having crtl->args.pretend_args_size changed. */
10912 cfun->machine->frame.saved_varargs_size
10913 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10914 STACK_BOUNDARY / BITS_PER_UNIT)
10915 + vr_saved * UNITS_PER_VREG);
10918 static void
10919 aarch64_conditional_register_usage (void)
10921 int i;
10922 if (!TARGET_FLOAT)
10924 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10926 fixed_regs[i] = 1;
10927 call_used_regs[i] = 1;
10932 /* Walk down the type tree of TYPE counting consecutive base elements.
10933 If *MODEP is VOIDmode, then set it to the first valid floating point
10934 type. If a non-floating point type is found, or if a floating point
10935 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10936 otherwise return the count in the sub-tree. */
10937 static int
10938 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10940 machine_mode mode;
10941 HOST_WIDE_INT size;
10943 switch (TREE_CODE (type))
10945 case REAL_TYPE:
10946 mode = TYPE_MODE (type);
10947 if (mode != DFmode && mode != SFmode
10948 && mode != TFmode && mode != HFmode)
10949 return -1;
10951 if (*modep == VOIDmode)
10952 *modep = mode;
10954 if (*modep == mode)
10955 return 1;
10957 break;
10959 case COMPLEX_TYPE:
10960 mode = TYPE_MODE (TREE_TYPE (type));
10961 if (mode != DFmode && mode != SFmode
10962 && mode != TFmode && mode != HFmode)
10963 return -1;
10965 if (*modep == VOIDmode)
10966 *modep = mode;
10968 if (*modep == mode)
10969 return 2;
10971 break;
10973 case VECTOR_TYPE:
10974 /* Use V2SImode and V4SImode as representatives of all 64-bit
10975 and 128-bit vector types. */
10976 size = int_size_in_bytes (type);
10977 switch (size)
10979 case 8:
10980 mode = V2SImode;
10981 break;
10982 case 16:
10983 mode = V4SImode;
10984 break;
10985 default:
10986 return -1;
10989 if (*modep == VOIDmode)
10990 *modep = mode;
10992 /* Vector modes are considered to be opaque: two vectors are
10993 equivalent for the purposes of being homogeneous aggregates
10994 if they are the same size. */
10995 if (*modep == mode)
10996 return 1;
10998 break;
11000 case ARRAY_TYPE:
11002 int count;
11003 tree index = TYPE_DOMAIN (type);
11005 /* Can't handle incomplete types nor sizes that are not
11006 fixed. */
11007 if (!COMPLETE_TYPE_P (type)
11008 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11009 return -1;
11011 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11012 if (count == -1
11013 || !index
11014 || !TYPE_MAX_VALUE (index)
11015 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11016 || !TYPE_MIN_VALUE (index)
11017 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11018 || count < 0)
11019 return -1;
11021 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11022 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11024 /* There must be no padding. */
11025 if (wi::to_wide (TYPE_SIZE (type))
11026 != count * GET_MODE_BITSIZE (*modep))
11027 return -1;
11029 return count;
11032 case RECORD_TYPE:
11034 int count = 0;
11035 int sub_count;
11036 tree field;
11038 /* Can't handle incomplete types nor sizes that are not
11039 fixed. */
11040 if (!COMPLETE_TYPE_P (type)
11041 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11042 return -1;
11044 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11046 if (TREE_CODE (field) != FIELD_DECL)
11047 continue;
11049 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11050 if (sub_count < 0)
11051 return -1;
11052 count += sub_count;
11055 /* There must be no padding. */
11056 if (wi::to_wide (TYPE_SIZE (type))
11057 != count * GET_MODE_BITSIZE (*modep))
11058 return -1;
11060 return count;
11063 case UNION_TYPE:
11064 case QUAL_UNION_TYPE:
11066 /* These aren't very interesting except in a degenerate case. */
11067 int count = 0;
11068 int sub_count;
11069 tree field;
11071 /* Can't handle incomplete types nor sizes that are not
11072 fixed. */
11073 if (!COMPLETE_TYPE_P (type)
11074 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11075 return -1;
11077 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11079 if (TREE_CODE (field) != FIELD_DECL)
11080 continue;
11082 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11083 if (sub_count < 0)
11084 return -1;
11085 count = count > sub_count ? count : sub_count;
11088 /* There must be no padding. */
11089 if (wi::to_wide (TYPE_SIZE (type))
11090 != count * GET_MODE_BITSIZE (*modep))
11091 return -1;
11093 return count;
11096 default:
11097 break;
11100 return -1;
11103 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11104 type as described in AAPCS64 \S 4.1.2.
11106 See the comment above aarch64_composite_type_p for the notes on MODE. */
11108 static bool
11109 aarch64_short_vector_p (const_tree type,
11110 machine_mode mode)
11112 HOST_WIDE_INT size = -1;
11114 if (type && TREE_CODE (type) == VECTOR_TYPE)
11115 size = int_size_in_bytes (type);
11116 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11117 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11118 size = GET_MODE_SIZE (mode);
11120 return (size == 8 || size == 16);
11123 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11124 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11125 array types. The C99 floating-point complex types are also considered
11126 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11127 types, which are GCC extensions and out of the scope of AAPCS64, are
11128 treated as composite types here as well.
11130 Note that MODE itself is not sufficient in determining whether a type
11131 is such a composite type or not. This is because
11132 stor-layout.c:compute_record_mode may have already changed the MODE
11133 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11134 structure with only one field may have its MODE set to the mode of the
11135 field. Also an integer mode whose size matches the size of the
11136 RECORD_TYPE type may be used to substitute the original mode
11137 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11138 solely relied on. */
11140 static bool
11141 aarch64_composite_type_p (const_tree type,
11142 machine_mode mode)
11144 if (aarch64_short_vector_p (type, mode))
11145 return false;
11147 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11148 return true;
11150 if (mode == BLKmode
11151 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11152 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11153 return true;
11155 return false;
11158 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11159 shall be passed or returned in simd/fp register(s) (providing these
11160 parameter passing registers are available).
11162 Upon successful return, *COUNT returns the number of needed registers,
11163 *BASE_MODE returns the mode of the individual register and when IS_HA
11164 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11165 floating-point aggregate or a homogeneous short-vector aggregate. */
11167 static bool
11168 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11169 const_tree type,
11170 machine_mode *base_mode,
11171 int *count,
11172 bool *is_ha)
11174 machine_mode new_mode = VOIDmode;
11175 bool composite_p = aarch64_composite_type_p (type, mode);
11177 if (is_ha != NULL) *is_ha = false;
11179 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11180 || aarch64_short_vector_p (type, mode))
11182 *count = 1;
11183 new_mode = mode;
11185 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11187 if (is_ha != NULL) *is_ha = true;
11188 *count = 2;
11189 new_mode = GET_MODE_INNER (mode);
11191 else if (type && composite_p)
11193 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11195 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11197 if (is_ha != NULL) *is_ha = true;
11198 *count = ag_count;
11200 else
11201 return false;
11203 else
11204 return false;
11206 *base_mode = new_mode;
11207 return true;
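/* Illustrative example (not part of the original sources): for

     struct hfa { double a; double b; double c; };

   aapcs_vfp_sub_candidate reports three DFmode elements, which is within
   HA_MAX_NUM_FLDS, so this function returns true with *base_mode == DFmode,
   *count == 3 and *is_ha set; the argument is then passed in three
   consecutive FP/SIMD registers when they are available.  */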
11210 /* Implement TARGET_STRUCT_VALUE_RTX. */
11212 static rtx
11213 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11214 int incoming ATTRIBUTE_UNUSED)
11216 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11219 /* Implements target hook vector_mode_supported_p. */
11220 static bool
11221 aarch64_vector_mode_supported_p (machine_mode mode)
11223 if (TARGET_SIMD
11224 && (mode == V4SImode || mode == V8HImode
11225 || mode == V16QImode || mode == V2DImode
11226 || mode == V2SImode || mode == V4HImode
11227 || mode == V8QImode || mode == V2SFmode
11228 || mode == V4SFmode || mode == V2DFmode
11229 || mode == V4HFmode || mode == V8HFmode
11230 || mode == V1DFmode))
11231 return true;
11233 return false;
11236 /* Return appropriate SIMD container
11237 for MODE within a vector of WIDTH bits. */
11238 static machine_mode
11239 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11241 gcc_assert (width == 64 || width == 128);
11242 if (TARGET_SIMD)
11244 if (width == 128)
11245 switch (mode)
11247 case E_DFmode:
11248 return V2DFmode;
11249 case E_SFmode:
11250 return V4SFmode;
11251 case E_HFmode:
11252 return V8HFmode;
11253 case E_SImode:
11254 return V4SImode;
11255 case E_HImode:
11256 return V8HImode;
11257 case E_QImode:
11258 return V16QImode;
11259 case E_DImode:
11260 return V2DImode;
11261 default:
11262 break;
11264 else
11265 switch (mode)
11267 case E_SFmode:
11268 return V2SFmode;
11269 case E_HFmode:
11270 return V4HFmode;
11271 case E_SImode:
11272 return V2SImode;
11273 case E_HImode:
11274 return V4HImode;
11275 case E_QImode:
11276 return V8QImode;
11277 default:
11278 break;
11281 return word_mode;
11284 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11285 static machine_mode
11286 aarch64_preferred_simd_mode (scalar_mode mode)
11288 return aarch64_simd_container_mode (mode, 128);
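/* For example (illustrative): with TARGET_SIMD enabled,
   aarch64_preferred_simd_mode (SImode) returns V4SImode and
   aarch64_preferred_simd_mode (DFmode) returns V2DFmode; without SIMD the
   fallback is word_mode.  */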
11291 /* Return the bitmask of possible vector sizes for the vectorizer
11292 to iterate over. */
11293 static unsigned int
11294 aarch64_autovectorize_vector_sizes (void)
11296 return (16 | 8);
11299 /* Implement TARGET_MANGLE_TYPE. */
11301 static const char *
11302 aarch64_mangle_type (const_tree type)
11304 /* The AArch64 ABI documents say that "__va_list" has to be
11305 mangled as if it is in the "std" namespace. */
11306 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11307 return "St9__va_list";
11309 /* Half-precision float. */
11310 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11311 return "Dh";
11313 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11314 builtin types. */
11315 if (TYPE_NAME (type) != NULL)
11316 return aarch64_mangle_builtin_type (type);
11318 /* Use the default mangling. */
11319 return NULL;
11322 /* Find the first rtx_insn before insn that will generate an assembly
11323 instruction. */
11325 static rtx_insn *
11326 aarch64_prev_real_insn (rtx_insn *insn)
11328 if (!insn)
11329 return NULL;
11333 insn = prev_real_insn (insn);
11335 while (insn && recog_memoized (insn) < 0);
11337 return insn;
11340 static bool
11341 is_madd_op (enum attr_type t1)
11343 unsigned int i;
11344 /* A number of these may be AArch32 only. */
11345 enum attr_type mlatypes[] = {
11346 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11347 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11348 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11351 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11353 if (t1 == mlatypes[i])
11354 return true;
11357 return false;
11360 /* Check if there is a register dependency between a load and the insn
11361 for which we hold recog_data. */
11363 static bool
11364 dep_between_memop_and_curr (rtx memop)
11366 rtx load_reg;
11367 int opno;
11369 gcc_assert (GET_CODE (memop) == SET);
11371 if (!REG_P (SET_DEST (memop)))
11372 return false;
11374 load_reg = SET_DEST (memop);
11375 for (opno = 1; opno < recog_data.n_operands; opno++)
11377 rtx operand = recog_data.operand[opno];
11378 if (REG_P (operand)
11379 && reg_overlap_mentioned_p (load_reg, operand))
11380 return true;
11383 return false;
11387 /* When working around the Cortex-A53 erratum 835769,
11388 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11389 instruction and has a preceding memory instruction such that a NOP
11390 should be inserted between them. */
11392 bool
11393 aarch64_madd_needs_nop (rtx_insn* insn)
11395 enum attr_type attr_type;
11396 rtx_insn *prev;
11397 rtx body;
11399 if (!TARGET_FIX_ERR_A53_835769)
11400 return false;
11402 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11403 return false;
11405 attr_type = get_attr_type (insn);
11406 if (!is_madd_op (attr_type))
11407 return false;
11409 prev = aarch64_prev_real_insn (insn);
11410 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11411 Restore recog state to INSN to avoid state corruption. */
11412 extract_constrain_insn_cached (insn);
11414 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11415 return false;
11417 body = single_set (prev);
11419 /* If the previous insn is a memory op and there is no dependency between
11420 it and the DImode madd, emit a NOP between them. If body is NULL then we
11421 have a complex memory operation, probably a load/store pair.
11422 Be conservative for now and emit a NOP. */
11423 if (GET_MODE (recog_data.operand[0]) == DImode
11424 && (!body || !dep_between_memop_and_curr (body)))
11425 return true;
11427 return false;
11432 /* Implement FINAL_PRESCAN_INSN. */
11434 void
11435 aarch64_final_prescan_insn (rtx_insn *insn)
11437 if (aarch64_madd_needs_nop (insn))
11438 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
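/* Illustrative example (not part of the original sources): with
   -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that directly
   follows a memory operation, e.g.

       ldr  x1, [x2]
       madd x0, x3, x4, x5

   is printed with a padding NOP between the two instructions:

       ldr  x1, [x2]
       nop // between mem op and mult-accumulate
       madd x0, x3, x4, x5  */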
11442 /* Return the equivalent letter for size. */
11443 static char
11444 sizetochar (int size)
11446 switch (size)
11448 case 64: return 'd';
11449 case 32: return 's';
11450 case 16: return 'h';
11451 case 8 : return 'b';
11452 default: gcc_unreachable ();
11456 /* Return true iff x is a uniform vector of floating-point
11457 constants, and the constant can be represented in
11458 quarter-precision form. Note, as aarch64_float_const_representable_p
11459 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11460 static bool
11461 aarch64_vect_float_const_representable_p (rtx x)
11463 rtx elt;
11464 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11465 && const_vec_duplicate_p (x, &elt)
11466 && aarch64_float_const_representable_p (elt));
11469 /* Return true for valid and false for invalid. */
11470 bool
11471 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11472 struct simd_immediate_info *info,
11473 enum simd_immediate_check which)
11475 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11476 matches = 1; \
11477 for (i = 0; i < idx; i += (STRIDE)) \
11478 if (!(TEST)) \
11479 matches = 0; \
11480 if (matches) \
11482 immtype = (CLASS); \
11483 elsize = (ELSIZE); \
11484 eshift = (SHIFT); \
11485 emvn = (NEG); \
11486 break; \
11489 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11490 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11491 unsigned char bytes[16];
11492 int immtype = -1, matches;
11493 unsigned int invmask = inverse ? 0xff : 0;
11494 int eshift, emvn;
11496 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11498 if (! (aarch64_simd_imm_zero_p (op, mode)
11499 || aarch64_vect_float_const_representable_p (op)))
11500 return false;
11502 if (info)
11504 rtx elt = CONST_VECTOR_ELT (op, 0);
11505 scalar_float_mode elt_mode
11506 = as_a <scalar_float_mode> (GET_MODE (elt));
11508 info->value = elt;
11509 info->element_width = GET_MODE_BITSIZE (elt_mode);
11510 info->mvn = false;
11511 info->shift = 0;
11514 return true;
11517 /* Splat vector constant out into a byte vector. */
11518 for (i = 0; i < n_elts; i++)
11520 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11521 it must be laid out in the vector register in reverse order. */
11522 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11523 unsigned HOST_WIDE_INT elpart;
11525 gcc_assert (CONST_INT_P (el));
11526 elpart = INTVAL (el);
11528 for (unsigned int byte = 0; byte < innersize; byte++)
11530 bytes[idx++] = (elpart & 0xff) ^ invmask;
11531 elpart >>= BITS_PER_UNIT;
11536 /* Sanity check. */
11537 gcc_assert (idx == GET_MODE_SIZE (mode));
11541 if (which & AARCH64_CHECK_ORR)
11543 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11544 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11546 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11547 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11549 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11550 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11552 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11553 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11555 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11557 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11560 if (which & AARCH64_CHECK_BIC)
11562 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11563 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11565 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11566 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11568 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11569 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11571 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11572 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11574 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11576 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11579 /* Shifting ones / 8-bit / 64-bit variants are only checked
11580 for 'ALL' (MOVI/MVNI). */
11581 if (which == AARCH64_CHECK_MOV)
11583 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11584 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11586 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11587 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11589 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11590 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11592 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11593 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11595 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11597 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11598 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11601 while (0);
11603 if (immtype == -1)
11604 return false;
11606 if (info)
11608 info->element_width = elsize;
11609 info->mvn = emvn != 0;
11610 info->shift = eshift;
11612 unsigned HOST_WIDE_INT imm = 0;
11614 if (immtype >= 12 && immtype <= 15)
11615 info->msl = true;
11617 /* Un-invert bytes of recognized vector, if necessary. */
11618 if (invmask != 0)
11619 for (i = 0; i < idx; i++)
11620 bytes[i] ^= invmask;
11622 if (immtype == 17)
11624 /* FIXME: Broken on 32-bit H_W_I hosts. */
11625 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11627 for (i = 0; i < 8; i++)
11628 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11629 << (i * BITS_PER_UNIT);
11632 info->value = GEN_INT (imm);
11634 else
11636 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11637 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11639 /* Construct 'abcdefgh' because the assembler cannot handle
11640 generic constants. */
11641 if (info->mvn)
11642 imm = ~imm;
11643 imm = (imm >> info->shift) & 0xff;
11644 info->value = GEN_INT (imm);
11648 return true;
11649 #undef CHECK
11652 /* Check if immediate shift constants are within range. */
11653 bool
11654 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11656 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11657 if (left)
11658 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11659 else
11660 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
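/* For example (illustrative): in V4SImode the element width is 32 bits, so
   an immediate left-shift count must lie in [0, 31] and an immediate
   right-shift count in [1, 32].  */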
11663 /* Return true if X is a uniform vector where all elements
11664 are either the floating-point constant 0.0 or the
11665 integer constant 0. */
11666 bool
11667 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11669 return x == CONST0_RTX (mode);
11673 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11674 operation of width WIDTH at bit position POS. */
11677 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11679 gcc_assert (CONST_INT_P (width));
11680 gcc_assert (CONST_INT_P (pos));
11682 unsigned HOST_WIDE_INT mask
11683 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11684 return GEN_INT (mask << UINTVAL (pos));
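/* For example (illustrative): WIDTH == 8 and POS == 16 give the mask
   ((unsigned HOST_WIDE_INT) 0xff) << 16 == 0xff0000, i.e. exactly the bits
   selected by the zero extract.  */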
11687 bool
11688 aarch64_mov_operand_p (rtx x, machine_mode mode)
11690 if (GET_CODE (x) == HIGH
11691 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11692 return true;
11694 if (CONST_INT_P (x))
11695 return true;
11697 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11698 return true;
11700 return aarch64_classify_symbolic_expression (x)
11701 == SYMBOL_TINY_ABSOLUTE;
11704 /* Return a const_int vector of VAL. */
11706 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11708 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
11709 return gen_const_vec_duplicate (mode, c);
11712 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11714 bool
11715 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11717 machine_mode vmode;
11719 vmode = aarch64_preferred_simd_mode (mode);
11720 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11721 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11724 /* Construct and return a PARALLEL RTX vector with elements numbering the
11725 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11726 the vector - from the perspective of the architecture. This does not
11727 line up with GCC's perspective on lane numbers, so we end up with
11728 different masks depending on our target endianness. The diagram
11729 below may help. We must draw the distinction when building masks
11730 which select one half of the vector. An instruction selecting
11731 architectural low-lanes for a big-endian target, must be described using
11732 a mask selecting GCC high-lanes.
11734 Big-Endian Little-Endian
11736 GCC 0 1 2 3 3 2 1 0
11737 | x | x | x | x | | x | x | x | x |
11738 Architecture 3 2 1 0 3 2 1 0
11740 Low Mask: { 2, 3 } { 0, 1 }
11741 High Mask: { 0, 1 } { 2, 3 }
11743 MODE Is the mode of the vector and NUNITS is the number of units in it. */
11746 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
11748 rtvec v = rtvec_alloc (nunits / 2);
11749 int high_base = nunits / 2;
11750 int low_base = 0;
11751 int base;
11752 rtx t1;
11753 int i;
11755 if (BYTES_BIG_ENDIAN)
11756 base = high ? low_base : high_base;
11757 else
11758 base = high ? high_base : low_base;
11760 for (i = 0; i < nunits / 2; i++)
11761 RTVEC_ELT (v, i) = GEN_INT (base + i);
11763 t1 = gen_rtx_PARALLEL (mode, v);
11764 return t1;
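/* Illustrative example (not part of the original sources): for V4SImode,
   aarch64_simd_vect_par_cnst_half (V4SImode, 4, false) produces
   (parallel [(const_int 2) (const_int 3)]) on big-endian and
   (parallel [(const_int 0) (const_int 1)]) on little-endian, matching the
   "Low Mask" row in the diagram above.  */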
11767 /* Check OP for validity as a PARALLEL RTX vector with elements
11768 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11769 from the perspective of the architecture. See the diagram above
11770 aarch64_simd_vect_par_cnst_half for more details. */
11772 bool
11773 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11774 bool high)
11776 if (!VECTOR_MODE_P (mode))
11777 return false;
11779 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, GET_MODE_NUNITS (mode),
11780 high);
11781 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11782 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11783 int i = 0;
11785 if (count_op != count_ideal)
11786 return false;
11788 for (i = 0; i < count_ideal; i++)
11790 rtx elt_op = XVECEXP (op, 0, i);
11791 rtx elt_ideal = XVECEXP (ideal, 0, i);
11793 if (!CONST_INT_P (elt_op)
11794 || INTVAL (elt_ideal) != INTVAL (elt_op))
11795 return false;
11797 return true;
11800 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11801 HIGH (exclusive). */
11802 void
11803 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11804 const_tree exp)
11806 HOST_WIDE_INT lane;
11807 gcc_assert (CONST_INT_P (operand));
11808 lane = INTVAL (operand);
11810 if (lane < low || lane >= high)
11812 if (exp)
11813 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11814 else
11815 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11819 /* Perform endian correction on lane number N, which indexes a vector
11820 of mode MODE, and return the result as an SImode rtx. */
11823 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
11825 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
11828 /* Return TRUE if OP is a valid vector addressing mode. */
11829 bool
11830 aarch64_simd_mem_operand_p (rtx op)
11832 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11833 || REG_P (XEXP (op, 0)));
11836 /* Emit a register copy from operand to operand, taking care not to
11837 early-clobber source registers in the process.
11839 COUNT is the number of components into which the copy needs to be
11840 decomposed. */
11841 void
11842 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11843 unsigned int count)
11845 unsigned int i;
11846 int rdest = REGNO (operands[0]);
11847 int rsrc = REGNO (operands[1]);
11849 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11850 || rdest < rsrc)
11851 for (i = 0; i < count; i++)
11852 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11853 gen_rtx_REG (mode, rsrc + i));
11854 else
11855 for (i = 0; i < count; i++)
11856 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11857 gen_rtx_REG (mode, rsrc + count - i - 1));
11860 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11861 one of the VSTRUCT modes: OI, CI, or XI. */
11863 aarch64_simd_attr_length_rglist (machine_mode mode)
11865 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
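/* Worked example (assuming UNITS_PER_VREG == 16): OImode is two vector
   registers (32 bytes), so the length is 2 * 4 == 8 bytes, i.e. two
   4-byte instructions; CImode gives 12 and XImode 16.  */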
11868 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11869 alignment of a vector to 128 bits. */
11870 static HOST_WIDE_INT
11871 aarch64_simd_vector_alignment (const_tree type)
11873 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11874 return MIN (align, 128);
11877 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11878 static bool
11879 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11881 if (is_packed)
11882 return false;
11884 /* We guarantee alignment for vectors up to 128-bits. */
11885 if (tree_int_cst_compare (TYPE_SIZE (type),
11886 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11887 return false;
11889 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11890 return true;
11893 /* Return true if the vector misalignment factor is supported by the
11894 target. */
11895 static bool
11896 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11897 const_tree type, int misalignment,
11898 bool is_packed)
11900 if (TARGET_SIMD && STRICT_ALIGNMENT)
11902 /* Return false if the movmisalign pattern is not supported for this mode. */
11903 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11904 return false;
11906 /* Misalignment factor is unknown at compile time. */
11907 if (misalignment == -1)
11908 return false;
11910 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11911 is_packed);
11914 /* If VALS is a vector constant that can be loaded into a register
11915 using DUP, generate instructions to do so and return an RTX to
11916 assign to the register. Otherwise return NULL_RTX. */
11917 static rtx
11918 aarch64_simd_dup_constant (rtx vals)
11920 machine_mode mode = GET_MODE (vals);
11921 machine_mode inner_mode = GET_MODE_INNER (mode);
11922 rtx x;
11924 if (!const_vec_duplicate_p (vals, &x))
11925 return NULL_RTX;
11927 /* We can load this constant by using DUP and a constant in a
11928 single general-purpose register. This will be cheaper than a vector
11929 load. */
11930 x = copy_to_mode_reg (inner_mode, x);
11931 return gen_vec_duplicate (mode, x);
11935 /* Generate code to load VALS, which is a PARALLEL containing only
11936 constants (for vec_init) or CONST_VECTOR, efficiently into a
11937 register. Returns an RTX to copy into the register, or NULL_RTX
11938 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11939 static rtx
11940 aarch64_simd_make_constant (rtx vals)
11942 machine_mode mode = GET_MODE (vals);
11943 rtx const_dup;
11944 rtx const_vec = NULL_RTX;
11945 int n_elts = GET_MODE_NUNITS (mode);
11946 int n_const = 0;
11947 int i;
11949 if (GET_CODE (vals) == CONST_VECTOR)
11950 const_vec = vals;
11951 else if (GET_CODE (vals) == PARALLEL)
11953 /* A CONST_VECTOR must contain only CONST_INTs and
11954 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11955 Only store valid constants in a CONST_VECTOR. */
11956 for (i = 0; i < n_elts; ++i)
11958 rtx x = XVECEXP (vals, 0, i);
11959 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11960 n_const++;
11962 if (n_const == n_elts)
11963 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11965 else
11966 gcc_unreachable ();
11968 if (const_vec != NULL_RTX
11969 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11970 /* Load using MOVI/MVNI. */
11971 return const_vec;
11972 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11973 /* Loaded using DUP. */
11974 return const_dup;
11975 else if (const_vec != NULL_RTX)
11976 /* Load from constant pool. We cannot take advantage of single-cycle
11977 LD1 because we need a PC-relative addressing mode. */
11978 return const_vec;
11979 else
11980 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11981 We cannot construct an initializer. */
11982 return NULL_RTX;
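/* Illustrative decision tree: a V4SImode vector of {1, 1, 1, 1} is a
   valid AdvSIMD immediate and is returned as-is (MOVI); a duplicate of a
   value that is not a valid immediate (say 0x12345678 in every lane) is
   expected to take the DUP path; {1, 2, 3, 4} ends up as a constant-pool
   load; and a PARALLEL containing, e.g., a SYMBOL_REF yields NULL_RTX.  */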
11985 /* Expand a vector initialisation sequence, such that TARGET is
11986 initialised to contain VALS. */
11988 void
11989 aarch64_expand_vector_init (rtx target, rtx vals)
11991 machine_mode mode = GET_MODE (target);
11992 scalar_mode inner_mode = GET_MODE_INNER (mode);
11993 /* The number of vector elements. */
11994 int n_elts = GET_MODE_NUNITS (mode);
11995 /* The number of vector elements which are not constant. */
11996 int n_var = 0;
11997 rtx any_const = NULL_RTX;
11998 /* The first element of vals. */
11999 rtx v0 = XVECEXP (vals, 0, 0);
12000 bool all_same = true;
12002 /* Count the number of variable elements to initialise. */
12003 for (int i = 0; i < n_elts; ++i)
12005 rtx x = XVECEXP (vals, 0, i);
12006 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12007 ++n_var;
12008 else
12009 any_const = x;
12011 all_same &= rtx_equal_p (x, v0);
12014 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12015 how best to handle this. */
12016 if (n_var == 0)
12018 rtx constant = aarch64_simd_make_constant (vals);
12019 if (constant != NULL_RTX)
12021 emit_move_insn (target, constant);
12022 return;
12026 /* Splat a single non-constant element if we can. */
12027 if (all_same)
12029 rtx x = copy_to_mode_reg (inner_mode, v0);
12030 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
12031 return;
12034 enum insn_code icode = optab_handler (vec_set_optab, mode);
12035 gcc_assert (icode != CODE_FOR_nothing);
12037 /* If there are only variable elements, try to optimize
12038 the insertion using dup for the most common element
12039 followed by insertions. */
12041 /* The algorithm will fill matches[*][0] with the earliest matching element,
12042 and matches[X][1] with the count of duplicate elements (if X is the
12043 earliest element which has duplicates). */
12045 if (n_var == n_elts && n_elts <= 16)
12047 int matches[16][2] = {0};
12048 for (int i = 0; i < n_elts; i++)
12050 for (int j = 0; j <= i; j++)
12052 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12054 matches[i][0] = j;
12055 matches[j][1]++;
12056 break;
12060 int maxelement = 0;
12061 int maxv = 0;
12062 for (int i = 0; i < n_elts; i++)
12063 if (matches[i][1] > maxv)
12065 maxelement = i;
12066 maxv = matches[i][1];
12069 /* Create a duplicate of the most common element. */
12070 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12071 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
12073 /* Insert the rest. */
12074 for (int i = 0; i < n_elts; i++)
12076 rtx x = XVECEXP (vals, 0, i);
12077 if (matches[i][0] == maxelement)
12078 continue;
12079 x = copy_to_mode_reg (inner_mode, x);
12080 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12082 return;
12085 /* Initialise a vector which is part-variable. We want to first try
12086 to build those lanes which are constant in the most efficient way we
12087 can. */
12088 if (n_var != n_elts)
12090 rtx copy = copy_rtx (vals);
12092 /* Load constant part of vector. We really don't care what goes into the
12093 parts we will overwrite, but we're more likely to be able to load the
12094 constant efficiently if it has fewer, larger, repeating parts
12095 (see aarch64_simd_valid_immediate). */
12096 for (int i = 0; i < n_elts; i++)
12098 rtx x = XVECEXP (vals, 0, i);
12099 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12100 continue;
12101 rtx subst = any_const;
12102 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12104 /* Look in the copied vector, as more elements are const. */
12105 rtx test = XVECEXP (copy, 0, i ^ bit);
12106 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12108 subst = test;
12109 break;
12112 XVECEXP (copy, 0, i) = subst;
12114 aarch64_expand_vector_init (target, copy);
12117 /* Insert the variable lanes directly. */
12118 for (int i = 0; i < n_elts; i++)
12120 rtx x = XVECEXP (vals, 0, i);
12121 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12122 continue;
12123 x = copy_to_mode_reg (inner_mode, x);
12124 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
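/* Illustrative examples of the paths above: initialising a V4SImode
   vector from registers {a, b, a, a} (all variable) duplicates the most
   common element A across the vector and then inserts B into lane 1 with
   a single vec_set; a part-constant vector such as {a, 0, 0, 0} first
   builds the constant part (here a zero vector) recursively and then
   inserts A into lane 0.  */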
12128 static unsigned HOST_WIDE_INT
12129 aarch64_shift_truncation_mask (machine_mode mode)
12131 return
12132 (!SHIFT_COUNT_TRUNCATED
12133 || aarch64_vector_mode_supported_p (mode)
12134 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
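/* Reading of the expression above: if shift counts are not truncated, or
   MODE is a vector or vector-struct mode, the mask is 0 (no truncation is
   promised to the middle end); otherwise a scalar DImode shift gets the
   mask 63 and SImode gets 31.  Whether SHIFT_COUNT_TRUNCATED holds is a
   separate target-level setting.  */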
12137 /* Select a format to encode pointers in exception handling data. */
12139 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12141 int type;
12142 switch (aarch64_cmodel)
12144 case AARCH64_CMODEL_TINY:
12145 case AARCH64_CMODEL_TINY_PIC:
12146 case AARCH64_CMODEL_SMALL:
12147 case AARCH64_CMODEL_SMALL_PIC:
12148 case AARCH64_CMODEL_SMALL_SPIC:
12149 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12150 for everything. */
12151 type = DW_EH_PE_sdata4;
12152 break;
12153 default:
12154 /* No assumptions here. 8-byte relocs required. */
12155 type = DW_EH_PE_sdata8;
12156 break;
12158 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12161 /* The last .arch and .tune assembly strings that we printed. */
12162 static std::string aarch64_last_printed_arch_string;
12163 static std::string aarch64_last_printed_tune_string;
12165 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12166 by the function fndecl. */
12168 void
12169 aarch64_declare_function_name (FILE *stream, const char* name,
12170 tree fndecl)
12172 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12174 struct cl_target_option *targ_options;
12175 if (target_parts)
12176 targ_options = TREE_TARGET_OPTION (target_parts);
12177 else
12178 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12179 gcc_assert (targ_options);
12181 const struct processor *this_arch
12182 = aarch64_get_arch (targ_options->x_explicit_arch);
12184 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12185 std::string extension
12186 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12187 this_arch->flags);
12188 /* Only update the assembler .arch string if it is distinct from the last
12189 such string we printed. */
12190 std::string to_print = this_arch->name + extension;
12191 if (to_print != aarch64_last_printed_arch_string)
12193 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12194 aarch64_last_printed_arch_string = to_print;
12197 /* Print the cpu name we're tuning for in the comments; it might be
12198 useful to readers of the generated asm. Do it only when it changes
12199 from function to function and verbose assembly is requested. */
12200 const struct processor *this_tune
12201 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12203 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12205 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12206 this_tune->name);
12207 aarch64_last_printed_tune_string = this_tune->name;
12210 /* Don't forget the type directive for ELF. */
12211 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12212 ASM_OUTPUT_LABEL (stream, name);
12215 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12217 static void
12218 aarch64_start_file (void)
12220 struct cl_target_option *default_options
12221 = TREE_TARGET_OPTION (target_option_default_node);
12223 const struct processor *default_arch
12224 = aarch64_get_arch (default_options->x_explicit_arch);
12225 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12226 std::string extension
12227 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12228 default_arch->flags);
12230 aarch64_last_printed_arch_string = default_arch->name + extension;
12231 aarch64_last_printed_tune_string = "";
12232 asm_fprintf (asm_out_file, "\t.arch %s\n",
12233 aarch64_last_printed_arch_string.c_str ());
12235 default_file_start ();
12238 /* Emit load exclusive. */
12240 static void
12241 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12242 rtx mem, rtx model_rtx)
12244 rtx (*gen) (rtx, rtx, rtx);
12246 switch (mode)
12248 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12249 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12250 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12251 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12252 default:
12253 gcc_unreachable ();
12256 emit_insn (gen (rval, mem, model_rtx));
12259 /* Emit store exclusive. */
12261 static void
12262 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12263 rtx rval, rtx mem, rtx model_rtx)
12265 rtx (*gen) (rtx, rtx, rtx, rtx);
12267 switch (mode)
12269 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12270 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12271 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12272 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12273 default:
12274 gcc_unreachable ();
12277 emit_insn (gen (bval, rval, mem, model_rtx));
12280 /* Emit the jump INSN and mark it as unlikely to be taken. */
12282 static void
12283 aarch64_emit_unlikely_jump (rtx insn)
12285 rtx_insn *jump = emit_jump_insn (insn);
12286 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12289 /* Expand a compare and swap pattern. */
12291 void
12292 aarch64_expand_compare_and_swap (rtx operands[])
12294 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12295 machine_mode mode, cmp_mode;
12296 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12297 int idx;
12298 gen_cas_fn gen;
12299 const gen_cas_fn split_cas[] =
12301 gen_aarch64_compare_and_swapqi,
12302 gen_aarch64_compare_and_swaphi,
12303 gen_aarch64_compare_and_swapsi,
12304 gen_aarch64_compare_and_swapdi
12306 const gen_cas_fn atomic_cas[] =
12308 gen_aarch64_compare_and_swapqi_lse,
12309 gen_aarch64_compare_and_swaphi_lse,
12310 gen_aarch64_compare_and_swapsi_lse,
12311 gen_aarch64_compare_and_swapdi_lse
12314 bval = operands[0];
12315 rval = operands[1];
12316 mem = operands[2];
12317 oldval = operands[3];
12318 newval = operands[4];
12319 is_weak = operands[5];
12320 mod_s = operands[6];
12321 mod_f = operands[7];
12322 mode = GET_MODE (mem);
12323 cmp_mode = mode;
12325 /* Normally the succ memory model must be stronger than fail, but in the
12326 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12327 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12329 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12330 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12331 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12333 switch (mode)
12335 case E_QImode:
12336 case E_HImode:
12337 /* For short modes, we're going to perform the comparison in SImode,
12338 so do the zero-extension now. */
12339 cmp_mode = SImode;
12340 rval = gen_reg_rtx (SImode);
12341 oldval = convert_modes (SImode, mode, oldval, true);
12342 /* Fall through. */
12344 case E_SImode:
12345 case E_DImode:
12346 /* Force the value into a register if needed. */
12347 if (!aarch64_plus_operand (oldval, mode))
12348 oldval = force_reg (cmp_mode, oldval);
12349 break;
12351 default:
12352 gcc_unreachable ();
12355 switch (mode)
12357 case E_QImode: idx = 0; break;
12358 case E_HImode: idx = 1; break;
12359 case E_SImode: idx = 2; break;
12360 case E_DImode: idx = 3; break;
12361 default:
12362 gcc_unreachable ();
12364 if (TARGET_LSE)
12365 gen = atomic_cas[idx];
12366 else
12367 gen = split_cas[idx];
12369 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12371 if (mode == QImode || mode == HImode)
12372 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12374 x = gen_rtx_REG (CCmode, CC_REGNUM);
12375 x = gen_rtx_EQ (SImode, x, const0_rtx);
12376 emit_insn (gen_rtx_SET (bval, x));
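/* Illustrative expansion: a 32-bit __atomic_compare_exchange with
   seq-cst ordering selects idx == 2; with TARGET_LSE the
   aarch64_compare_and_swapsi_lse pattern is used (a single CAS-family
   instruction), otherwise aarch64_compare_and_swapsi, which is later
   split into an LDXR/STXR loop by aarch64_split_compare_and_swap below.
   QImode/HImode operands are zero-extended first so the comparison
   itself is done in SImode.  */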
12379 /* Test whether the target supports using an atomic load-operate instruction
12380    for operation CODE.  Returns FALSE if the operation isn't supported by the
12381    architecture. */
12385 bool
12386 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12388 if (!TARGET_LSE)
12389 return false;
12391 switch (code)
12393 case SET:
12394 case AND:
12395 case IOR:
12396 case XOR:
12397 case MINUS:
12398 case PLUS:
12399 return true;
12400 default:
12401 return false;
12405 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12406 sequence implementing an atomic operation. */
12408 static void
12409 aarch64_emit_post_barrier (enum memmodel model)
12411 const enum memmodel base_model = memmodel_base (model);
12413 if (is_mm_sync (model)
12414 && (base_model == MEMMODEL_ACQUIRE
12415 || base_model == MEMMODEL_ACQ_REL
12416 || base_model == MEMMODEL_SEQ_CST))
12418 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12422 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12423 for the data in memory. EXPECTED is the value expected to be in memory.
12424 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12425 is the memory ordering to use. */
12427 void
12428 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12429 rtx expected, rtx desired,
12430 rtx model)
12432 rtx (*gen) (rtx, rtx, rtx, rtx);
12433 machine_mode mode;
12435 mode = GET_MODE (mem);
12437 switch (mode)
12439 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12440 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12441 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12442 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12443 default:
12444 gcc_unreachable ();
12447 /* Move the expected value into the CAS destination register. */
12448 emit_insn (gen_rtx_SET (rval, expected));
12450 /* Emit the CAS. */
12451 emit_insn (gen (rval, mem, desired, model));
12453 /* Compare the expected value with the value loaded by the CAS, to establish
12454 whether the swap was made. */
12455 aarch64_gen_compare_reg (EQ, rval, expected);
12458 /* Split a compare and swap pattern. */
12460 void
12461 aarch64_split_compare_and_swap (rtx operands[])
12463 rtx rval, mem, oldval, newval, scratch;
12464 machine_mode mode;
12465 bool is_weak;
12466 rtx_code_label *label1, *label2;
12467 rtx x, cond;
12468 enum memmodel model;
12469 rtx model_rtx;
12471 rval = operands[0];
12472 mem = operands[1];
12473 oldval = operands[2];
12474 newval = operands[3];
12475 is_weak = (operands[4] != const0_rtx);
12476 model_rtx = operands[5];
12477 scratch = operands[7];
12478 mode = GET_MODE (mem);
12479 model = memmodel_from_int (INTVAL (model_rtx));
12481 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12482 loop:
12483 .label1:
12484 LD[A]XR rval, [mem]
12485 CBNZ rval, .label2
12486 ST[L]XR scratch, newval, [mem]
12487 CBNZ scratch, .label1
12488 .label2:
12489 CMP rval, 0. */
12490 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12492 label1 = NULL;
12493 if (!is_weak)
12495 label1 = gen_label_rtx ();
12496 emit_label (label1);
12498 label2 = gen_label_rtx ();
12500 /* The initial load can be relaxed for a __sync operation since a final
12501 barrier will be emitted to stop code hoisting. */
12502 if (is_mm_sync (model))
12503 aarch64_emit_load_exclusive (mode, rval, mem,
12504 GEN_INT (MEMMODEL_RELAXED));
12505 else
12506 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12508 if (strong_zero_p)
12510 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12511 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12512 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12513 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12515 else
12517 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12518 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12519 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12520 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12521 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12524 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12526 if (!is_weak)
12528 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12529 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12530 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12531 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12533 else
12535 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12536 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12537 emit_insn (gen_rtx_SET (cond, x));
12540 emit_label (label2);
12541 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12542 to set the condition flags. If this is not used it will be removed by
12543 later passes. */
12544 if (strong_zero_p)
12546 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12547 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12548 emit_insn (gen_rtx_SET (cond, x));
12550 /* Emit any final barrier needed for a __sync operation. */
12551 if (is_mm_sync (model))
12552 aarch64_emit_post_barrier (model);
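/* For the general strong case (OLDVAL not known to be zero) the sequence
   emitted above is roughly:
   .label1:
   LD[A]XR rval, [mem]
   CMP rval, oldval
   B.NE .label2
   ST[L]XR scratch, newval, [mem]
   CBNZ scratch, .label1
   .label2:
   with a final CMP against RVAL only added in the strong-zero variant
   shown in the comment above.  */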
12555 /* Emit a BIC instruction. */
12557 static void
12558 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12560 rtx shift_rtx = GEN_INT (shift);
12561 rtx (*gen) (rtx, rtx, rtx, rtx);
12563 switch (mode)
12565 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12566 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12567 default:
12568 gcc_unreachable ();
12571 emit_insn (gen (dst, s2, shift_rtx, s1));
12574 /* Emit an atomic swap. */
12576 static void
12577 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12578 rtx mem, rtx model)
12580 rtx (*gen) (rtx, rtx, rtx, rtx);
12582 switch (mode)
12584 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12585 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12586 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12587 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12588 default:
12589 gcc_unreachable ();
12592 emit_insn (gen (dst, mem, value, model));
12595 /* Operations supported by aarch64_emit_atomic_load_op. */
12597 enum aarch64_atomic_load_op_code
12599 AARCH64_LDOP_PLUS, /* A + B */
12600 AARCH64_LDOP_XOR, /* A ^ B */
12601 AARCH64_LDOP_OR, /* A | B */
12602 AARCH64_LDOP_BIC /* A & ~B */
12605 /* Emit an atomic load-operate. */
12607 static void
12608 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12609 machine_mode mode, rtx dst, rtx src,
12610 rtx mem, rtx model)
12612 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12613 const aarch64_atomic_load_op_fn plus[] =
12615 gen_aarch64_atomic_loadaddqi,
12616 gen_aarch64_atomic_loadaddhi,
12617 gen_aarch64_atomic_loadaddsi,
12618 gen_aarch64_atomic_loadadddi
12620 const aarch64_atomic_load_op_fn eor[] =
12622 gen_aarch64_atomic_loadeorqi,
12623 gen_aarch64_atomic_loadeorhi,
12624 gen_aarch64_atomic_loadeorsi,
12625 gen_aarch64_atomic_loadeordi
12627 const aarch64_atomic_load_op_fn ior[] =
12629 gen_aarch64_atomic_loadsetqi,
12630 gen_aarch64_atomic_loadsethi,
12631 gen_aarch64_atomic_loadsetsi,
12632 gen_aarch64_atomic_loadsetdi
12634 const aarch64_atomic_load_op_fn bic[] =
12636 gen_aarch64_atomic_loadclrqi,
12637 gen_aarch64_atomic_loadclrhi,
12638 gen_aarch64_atomic_loadclrsi,
12639 gen_aarch64_atomic_loadclrdi
12641 aarch64_atomic_load_op_fn gen;
12642 int idx = 0;
12644 switch (mode)
12646 case E_QImode: idx = 0; break;
12647 case E_HImode: idx = 1; break;
12648 case E_SImode: idx = 2; break;
12649 case E_DImode: idx = 3; break;
12650 default:
12651 gcc_unreachable ();
12654 switch (code)
12656 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12657 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12658 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12659 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12660 default:
12661 gcc_unreachable ();
12664 emit_insn (gen (dst, mem, src, model));
12667 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12668 location to store the data read from memory. OUT_RESULT is the location to
12669 store the result of the operation. MEM is the memory location to read and
12670 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12671 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12672 be NULL. */
12674 void
12675 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12676 rtx mem, rtx value, rtx model_rtx)
12678 machine_mode mode = GET_MODE (mem);
12679 machine_mode wmode = (mode == DImode ? DImode : SImode);
12680 const bool short_mode = (mode < SImode);
12681 aarch64_atomic_load_op_code ldop_code;
12682 rtx src;
12683 rtx x;
12685 if (out_data)
12686 out_data = gen_lowpart (mode, out_data);
12688 if (out_result)
12689 out_result = gen_lowpart (mode, out_result);
12691 /* Make sure the value is in a register, putting it into a destination
12692 register if it needs to be manipulated. */
12693 if (!register_operand (value, mode)
12694 || code == AND || code == MINUS)
12696 src = out_result ? out_result : out_data;
12697 emit_move_insn (src, gen_lowpart (mode, value));
12699 else
12700 src = value;
12701 gcc_assert (register_operand (src, mode));
12703 /* Preprocess the data for the operation as necessary. If the operation is
12704 a SET then emit a swap instruction and finish. */
12705 switch (code)
12707 case SET:
12708 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12709 return;
12711 case MINUS:
12712 /* Negate the value and treat it as a PLUS. */
12714 rtx neg_src;
12716 /* Resize the value if necessary. */
12717 if (short_mode)
12718 src = gen_lowpart (wmode, src);
12720 neg_src = gen_rtx_NEG (wmode, src);
12721 emit_insn (gen_rtx_SET (src, neg_src));
12723 if (short_mode)
12724 src = gen_lowpart (mode, src);
12726 /* Fall-through. */
12727 case PLUS:
12728 ldop_code = AARCH64_LDOP_PLUS;
12729 break;
12731 case IOR:
12732 ldop_code = AARCH64_LDOP_OR;
12733 break;
12735 case XOR:
12736 ldop_code = AARCH64_LDOP_XOR;
12737 break;
12739 case AND:
12741 rtx not_src;
12743 /* Resize the value if necessary. */
12744 if (short_mode)
12745 src = gen_lowpart (wmode, src);
12747 not_src = gen_rtx_NOT (wmode, src);
12748 emit_insn (gen_rtx_SET (src, not_src));
12750 if (short_mode)
12751 src = gen_lowpart (mode, src);
12753 ldop_code = AARCH64_LDOP_BIC;
12754 break;
12756 default:
12757 /* The operation can't be done with atomic instructions. */
12758 gcc_unreachable ();
12761 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12763 /* If necessary, calculate the data in memory after the update by redoing the
12764 operation from values in registers. */
12765 if (!out_result)
12766 return;
12768 if (short_mode)
12770 src = gen_lowpart (wmode, src);
12771 out_data = gen_lowpart (wmode, out_data);
12772 out_result = gen_lowpart (wmode, out_result);
12775 x = NULL_RTX;
12777 switch (code)
12779 case MINUS:
12780 case PLUS:
12781 x = gen_rtx_PLUS (wmode, out_data, src);
12782 break;
12783 case IOR:
12784 x = gen_rtx_IOR (wmode, out_data, src);
12785 break;
12786 case XOR:
12787 x = gen_rtx_XOR (wmode, out_data, src);
12788 break;
12789 case AND:
12790 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12791 return;
12792 default:
12793 gcc_unreachable ();
12796 emit_set_insn (out_result, x);
12798 return;
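/* Illustrative example: __atomic_fetch_and on an LSE target complements
   VALUE in a register and emits an atomic load-clear (the
   AARCH64_LDOP_BIC entry, LDCLR family), since A & B == A & ~(~B); if
   the caller also wants the post-operation value it is rebuilt afterwards
   as OUT_DATA & ~SRC via aarch64_emit_bic.  A fetch_sub is handled the
   same way by negating VALUE and using the LDADD family.  */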
12801 /* Split an atomic operation. */
12803 void
12804 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12805 rtx value, rtx model_rtx, rtx cond)
12807 machine_mode mode = GET_MODE (mem);
12808 machine_mode wmode = (mode == DImode ? DImode : SImode);
12809 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12810 const bool is_sync = is_mm_sync (model);
12811 rtx_code_label *label;
12812 rtx x;
12814 /* Split the atomic operation into a sequence. */
12815 label = gen_label_rtx ();
12816 emit_label (label);
12818 if (new_out)
12819 new_out = gen_lowpart (wmode, new_out);
12820 if (old_out)
12821 old_out = gen_lowpart (wmode, old_out);
12822 else
12823 old_out = new_out;
12824 value = simplify_gen_subreg (wmode, value, mode, 0);
12826 /* The initial load can be relaxed for a __sync operation since a final
12827 barrier will be emitted to stop code hoisting. */
12828 if (is_sync)
12829 aarch64_emit_load_exclusive (mode, old_out, mem,
12830 GEN_INT (MEMMODEL_RELAXED));
12831 else
12832 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12834 switch (code)
12836 case SET:
12837 new_out = value;
12838 break;
12840 case NOT:
12841 x = gen_rtx_AND (wmode, old_out, value);
12842 emit_insn (gen_rtx_SET (new_out, x));
12843 x = gen_rtx_NOT (wmode, new_out);
12844 emit_insn (gen_rtx_SET (new_out, x));
12845 break;
12847 case MINUS:
12848 if (CONST_INT_P (value))
12850 value = GEN_INT (-INTVAL (value));
12851 code = PLUS;
12853 /* Fall through. */
12855 default:
12856 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12857 emit_insn (gen_rtx_SET (new_out, x));
12858 break;
12861 aarch64_emit_store_exclusive (mode, cond, mem,
12862 gen_lowpart (mode, new_out), model_rtx);
12864 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12865 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12866 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12867 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12869 /* Emit any final barrier needed for a __sync operation. */
12870 if (is_sync)
12871 aarch64_emit_post_barrier (model);
12874 static void
12875 aarch64_init_libfuncs (void)
12877 /* Half-precision float operations. The compiler handles all operations
12878 with NULL libfuncs by converting to SFmode. */
12880 /* Conversions. */
12881 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12882 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12884 /* Arithmetic. */
12885 set_optab_libfunc (add_optab, HFmode, NULL);
12886 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12887 set_optab_libfunc (smul_optab, HFmode, NULL);
12888 set_optab_libfunc (neg_optab, HFmode, NULL);
12889 set_optab_libfunc (sub_optab, HFmode, NULL);
12891 /* Comparisons. */
12892 set_optab_libfunc (eq_optab, HFmode, NULL);
12893 set_optab_libfunc (ne_optab, HFmode, NULL);
12894 set_optab_libfunc (lt_optab, HFmode, NULL);
12895 set_optab_libfunc (le_optab, HFmode, NULL);
12896 set_optab_libfunc (ge_optab, HFmode, NULL);
12897 set_optab_libfunc (gt_optab, HFmode, NULL);
12898 set_optab_libfunc (unord_optab, HFmode, NULL);
12901 /* Target hook for c_mode_for_suffix. */
12902 static machine_mode
12903 aarch64_c_mode_for_suffix (char suffix)
12905 if (suffix == 'q')
12906 return TFmode;
12908 return VOIDmode;
12911 /* We can only represent floating point constants which will fit in
12912 "quarter-precision" values. These values are characterised by
12913 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given the form:
12916 (-1)^s * (n/16) * 2^r
12918 Where:
12919 's' is the sign bit.
12920 'n' is an integer in the range 16 <= n <= 31.
12921 'r' is an integer in the range -3 <= r <= 4. */
12923 /* Return true iff X can be represented by a quarter-precision
12924 floating point immediate operand.  Note, we cannot represent 0.0. */
12925 bool
12926 aarch64_float_const_representable_p (rtx x)
12928 /* This represents our current view of how many bits
12929 make up the mantissa. */
12930 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12931 int exponent;
12932 unsigned HOST_WIDE_INT mantissa, mask;
12933 REAL_VALUE_TYPE r, m;
12934 bool fail;
12936 if (!CONST_DOUBLE_P (x))
12937 return false;
12939 /* We don't support HFmode constants yet. */
12940 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12941 return false;
12943 r = *CONST_DOUBLE_REAL_VALUE (x);
12945 /* We cannot represent infinities, NaNs or +/-zero. We won't
12946 know if we have +zero until we analyse the mantissa, but we
12947 can reject the other invalid values. */
12948 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12949 || REAL_VALUE_MINUS_ZERO (r))
12950 return false;
12952 /* Extract exponent. */
12953 r = real_value_abs (&r);
12954 exponent = REAL_EXP (&r);
12956 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12957 highest (sign) bit, with a fixed binary point at bit point_pos.
12958 The low half of the result holds the low part of the mantissa, the high half the high part.
12959 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12960 bits for the mantissa, this can fail (low bits will be lost). */
12961 real_ldexp (&m, &r, point_pos - exponent);
12962 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12964 /* If the low part of the mantissa has bits set we cannot represent
12965 the value. */
12966 if (w.ulow () != 0)
12967 return false;
12968 /* We have rejected the lower HOST_WIDE_INT, so update our
12969 understanding of how many bits lie in the mantissa and
12970 look only at the high HOST_WIDE_INT. */
12971 mantissa = w.elt (1);
12972 point_pos -= HOST_BITS_PER_WIDE_INT;
12974 /* We can only represent values with a mantissa of the form 1.xxxx. */
12975 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12976 if ((mantissa & mask) != 0)
12977 return false;
12979 /* Having filtered unrepresentable values, we may now remove all
12980 but the highest 5 bits. */
12981 mantissa >>= point_pos - 5;
12983 /* We cannot represent the value 0.0, so reject it. This is handled
12984 elsewhere. */
12985 if (mantissa == 0)
12986 return false;
12988 /* Then, as bit 4 is always set, we can mask it off, leaving
12989 the mantissa in the range [0, 15]. */
12990 mantissa &= ~(1 << 4);
12991 gcc_assert (mantissa <= 15);
12993 /* GCC internally does not use IEEE754-like encoding (where normalized
12994 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12995 Our mantissa values are shifted 4 places to the left relative to
12996 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12997 by 5 places to correct for GCC's representation. */
12998 exponent = 5 - exponent;
13000 return (exponent >= 0 && exponent <= 7);
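/* Worked examples of the format above: 0.25 == (-1)^0 * (16/16) * 2^-2
   and 31.0 == (-1)^0 * (31/16) * 2^4 are both representable (and hence
   usable as FMOV-style immediates), while 0.1 (not a dyadic fraction with
   a 4-bit mantissa) and 129.0 (outside the [0.125, 31.0] range) are not.  */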
13003 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
13004 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
13005 output MOVI/MVNI, ORR or BIC immediate. */
13006 char*
13007 aarch64_output_simd_mov_immediate (rtx const_vector,
13008 machine_mode mode,
13009 unsigned width,
13010 enum simd_immediate_check which)
13012 bool is_valid;
13013 static char templ[40];
13014 const char *mnemonic;
13015 const char *shift_op;
13016 unsigned int lane_count = 0;
13017 char element_char;
13019 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13021 /* This will return true to show const_vector is legal for use as either
13022 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
13023 It will also update INFO to show how the immediate should be generated.
13024 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
13025 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false,
13026 &info, which);
13027 gcc_assert (is_valid);
13029 element_char = sizetochar (info.element_width);
13030 lane_count = width / info.element_width;
13032 mode = GET_MODE_INNER (mode);
13033 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13035 gcc_assert (info.shift == 0 && ! info.mvn);
13036 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13037 move immediate path. */
13038 if (aarch64_float_const_zero_rtx_p (info.value))
13039 info.value = GEN_INT (0);
13040 else
13042 const unsigned int buf_size = 20;
13043 char float_buf[buf_size] = {'\0'};
13044 real_to_decimal_for_mode (float_buf,
13045 CONST_DOUBLE_REAL_VALUE (info.value),
13046 buf_size, buf_size, 1, mode);
13048 if (lane_count == 1)
13049 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13050 else
13051 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13052 lane_count, element_char, float_buf);
13053 return templ;
13057 gcc_assert (CONST_INT_P (info.value));
13059 if (which == AARCH64_CHECK_MOV)
13061 mnemonic = info.mvn ? "mvni" : "movi";
13062 shift_op = info.msl ? "msl" : "lsl";
13063 if (lane_count == 1)
13064 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13065 mnemonic, UINTVAL (info.value));
13066 else if (info.shift)
13067 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13068 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
13069 element_char, UINTVAL (info.value), shift_op, info.shift);
13070 else
13071 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13072 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
13073 element_char, UINTVAL (info.value));
13075 else
13077 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
13078 mnemonic = info.mvn ? "bic" : "orr";
13079 if (info.shift)
13080 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13081 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
13082 element_char, UINTVAL (info.value), "lsl", info.shift);
13083 else
13084 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13085 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
13086 element_char, UINTVAL (info.value));
13088 return templ;
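/* Illustrative outputs (format only, not taken from a real compile): a
   V8QImode vector with every byte 0x55 should produce a template along
   the lines of "movi\t%0.8b, 0x55"; a V4HImode vector of 0xab00 in each
   lane would use the shifted form, e.g. "movi\t%0.4h, 0xab, lsl 8"; for
   the ORR/BIC checks the immediate is printed in decimal with a '#'
   prefix instead.  */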
13091 char*
13092 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13095 /* If a floating point number was passed and we desire to use it in an
13096 integer mode, do the conversion to integer. */
13097 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13099 unsigned HOST_WIDE_INT ival;
13100 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13101 gcc_unreachable ();
13102 immediate = gen_int_mode (ival, mode);
13105 machine_mode vmode;
13106 /* Use a 64-bit mode for everything except DImode/DFmode, where we use
13107    a 128-bit vector mode. */
13108 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13110 vmode = aarch64_simd_container_mode (mode, width);
13111 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13112 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13115 /* Split operands into moves from op[1] + op[2] into op[0]. */
13117 void
13118 aarch64_split_combinev16qi (rtx operands[3])
13120 unsigned int dest = REGNO (operands[0]);
13121 unsigned int src1 = REGNO (operands[1]);
13122 unsigned int src2 = REGNO (operands[2]);
13123 machine_mode halfmode = GET_MODE (operands[1]);
13124 unsigned int halfregs = REG_NREGS (operands[1]);
13125 rtx destlo, desthi;
13127 gcc_assert (halfmode == V16QImode);
13129 if (src1 == dest && src2 == dest + halfregs)
13131 /* No-op move. Can't split to nothing; emit something. */
13132 emit_note (NOTE_INSN_DELETED);
13133 return;
13136 /* Preserve register attributes for variable tracking. */
13137 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13138 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13139 GET_MODE_SIZE (halfmode));
13141 /* Special case of reversed high/low parts. */
13142 if (reg_overlap_mentioned_p (operands[2], destlo)
13143 && reg_overlap_mentioned_p (operands[1], desthi))
13145 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13146 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13147 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13149 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13151 /* Try to avoid unnecessary moves if part of the result
13152 is in the right place already. */
13153 if (src1 != dest)
13154 emit_move_insn (destlo, operands[1]);
13155 if (src2 != dest + halfregs)
13156 emit_move_insn (desthi, operands[2]);
13158 else
13160 if (src2 != dest + halfregs)
13161 emit_move_insn (desthi, operands[2]);
13162 if (src1 != dest)
13163 emit_move_insn (destlo, operands[1]);
13167 /* vec_perm support. */
13169 #define MAX_VECT_LEN 16
13171 struct expand_vec_perm_d
13173 rtx target, op0, op1;
13174 auto_vec_perm_indices perm;
13175 machine_mode vmode;
13176 bool one_vector_p;
13177 bool testing_p;
13180 /* Generate a variable permutation. */
13182 static void
13183 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13185 machine_mode vmode = GET_MODE (target);
13186 bool one_vector_p = rtx_equal_p (op0, op1);
13188 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13189 gcc_checking_assert (GET_MODE (op0) == vmode);
13190 gcc_checking_assert (GET_MODE (op1) == vmode);
13191 gcc_checking_assert (GET_MODE (sel) == vmode);
13192 gcc_checking_assert (TARGET_SIMD);
13194 if (one_vector_p)
13196 if (vmode == V8QImode)
13198 /* Expand the argument to a V16QI mode by duplicating it. */
13199 rtx pair = gen_reg_rtx (V16QImode);
13200 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13201 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13203 else
13205 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13208 else
13210 rtx pair;
13212 if (vmode == V8QImode)
13214 pair = gen_reg_rtx (V16QImode);
13215 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13216 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13218 else
13220 pair = gen_reg_rtx (OImode);
13221 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13222 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13227 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
13228 NELT is the number of elements in the vector. */
13230 void
13231 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
13232 unsigned int nelt)
13234 machine_mode vmode = GET_MODE (target);
13235 bool one_vector_p = rtx_equal_p (op0, op1);
13236 rtx mask;
13238 /* The TBL instruction does not use a modulo index, so we must take care
13239 of that ourselves. */
13240 mask = aarch64_simd_gen_const_vector_dup (vmode,
13241 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13242 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13244 /* For big-endian, we also need to reverse the index within the vector
13245 (but not which vector). */
13246 if (BYTES_BIG_ENDIAN)
13248 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13249 if (!one_vector_p)
13250 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13251 sel = expand_simple_binop (vmode, XOR, sel, mask,
13252 NULL, 0, OPTAB_LIB_WIDEN);
13254 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
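/* Illustrative example: with two V16QImode inputs (NELT == 16), a
   selector element of 35 is first reduced to 35 & 31 == 3 by the AND,
   because TBL does not wrap out-of-range indices the way vec_perm
   semantics require; on big-endian the index is additionally XORed with
   15 so that it still names the same architectural byte within its
   input vector.  */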
13257 /* Recognize patterns suitable for the TRN instructions. */
13258 static bool
13259 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13261 unsigned int i, odd, mask, nelt = d->perm.length ();
13262 rtx out, in0, in1, x;
13263 machine_mode vmode = d->vmode;
13265 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13266 return false;
13268 /* Note that these are little-endian tests.
13269 We correct for big-endian later. */
13270 if (d->perm[0] == 0)
13271 odd = 0;
13272 else if (d->perm[0] == 1)
13273 odd = 1;
13274 else
13275 return false;
13276 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13278 for (i = 0; i < nelt; i += 2)
13280 if (d->perm[i] != i + odd)
13281 return false;
13282 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13283 return false;
13286 /* Success! */
13287 if (d->testing_p)
13288 return true;
13290 in0 = d->op0;
13291 in1 = d->op1;
13292 if (BYTES_BIG_ENDIAN)
13294 x = in0, in0 = in1, in1 = x;
13295 odd = !odd;
13297 out = d->target;
13299 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13300 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
13301 return true;
13304 /* Recognize patterns suitable for the UZP instructions. */
13305 static bool
13306 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13308 unsigned int i, odd, mask, nelt = d->perm.length ();
13309 rtx out, in0, in1, x;
13310 machine_mode vmode = d->vmode;
13312 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13313 return false;
13315 /* Note that these are little-endian tests.
13316 We correct for big-endian later. */
13317 if (d->perm[0] == 0)
13318 odd = 0;
13319 else if (d->perm[0] == 1)
13320 odd = 1;
13321 else
13322 return false;
13323 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13325 for (i = 0; i < nelt; i++)
13327 unsigned elt = (i * 2 + odd) & mask;
13328 if (d->perm[i] != elt)
13329 return false;
13332 /* Success! */
13333 if (d->testing_p)
13334 return true;
13336 in0 = d->op0;
13337 in1 = d->op1;
13338 if (BYTES_BIG_ENDIAN)
13340 x = in0, in0 = in1, in1 = x;
13341 odd = !odd;
13343 out = d->target;
13345 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13346 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
13347 return true;
13350 /* Recognize patterns suitable for the ZIP instructions. */
13351 static bool
13352 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13354 unsigned int i, high, mask, nelt = d->perm.length ();
13355 rtx out, in0, in1, x;
13356 machine_mode vmode = d->vmode;
13358 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13359 return false;
13361 /* Note that these are little-endian tests.
13362 We correct for big-endian later. */
13363 high = nelt / 2;
13364 if (d->perm[0] == high)
13365 /* Do Nothing. */
13367 else if (d->perm[0] == 0)
13368 high = 0;
13369 else
13370 return false;
13371 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13373 for (i = 0; i < nelt / 2; i++)
13375 unsigned elt = (i + high) & mask;
13376 if (d->perm[i * 2] != elt)
13377 return false;
13378 elt = (elt + nelt) & mask;
13379 if (d->perm[i * 2 + 1] != elt)
13380 return false;
13383 /* Success! */
13384 if (d->testing_p)
13385 return true;
13387 in0 = d->op0;
13388 in1 = d->op1;
13389 if (BYTES_BIG_ENDIAN)
13391 x = in0, in0 = in1, in1 = x;
13392 high = !high;
13394 out = d->target;
13396 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13397 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
13398 return true;
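/* Illustrative example: for V4SImode the permutation {0, 4, 1, 5}
   interleaves the low halves of the two inputs and is matched as ZIP1,
   while {2, 6, 3, 7} is matched as ZIP2; on big-endian the two inputs
   are swapped and the UNSPEC flipped so that the same architectural
   instruction is used.  */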
13401 /* Recognize patterns for the EXT insn. */
13403 static bool
13404 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13406 unsigned int i, nelt = d->perm.length ();
13407 rtx offset;
13409 unsigned int location = d->perm[0]; /* Always < nelt. */
13411 /* Check if the extracted indices are increasing by one. */
13412 for (i = 1; i < nelt; i++)
13414 unsigned int required = location + i;
13415 if (d->one_vector_p)
13417 /* We'll pass the same vector in twice, so allow indices to wrap. */
13418 required &= (nelt - 1);
13420 if (d->perm[i] != required)
13421 return false;
13424 /* Success! */
13425 if (d->testing_p)
13426 return true;
13428 /* The case where (location == 0) is a no-op for both big- and little-endian,
13429 and is removed by the mid-end at optimization levels -O1 and higher. */
13431 if (BYTES_BIG_ENDIAN && (location != 0))
13433 /* After setup, we want the high elements of the first vector (stored
13434 at the LSB end of the register), and the low elements of the second
13435 vector (stored at the MSB end of the register). So swap. */
13436 std::swap (d->op0, d->op1);
13437 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13438 location = nelt - location;
13441 offset = GEN_INT (location);
13442 emit_set_insn (d->target,
13443 gen_rtx_UNSPEC (d->vmode,
13444 gen_rtvec (3, d->op0, d->op1, offset),
13445 UNSPEC_EXT));
13446 return true;
13449 /* Recognize patterns for the REV insns. */
13451 static bool
13452 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13454 unsigned int i, j, diff, size, unspec, nelt = d->perm.length ();
13456 if (!d->one_vector_p)
13457 return false;
13459 diff = d->perm[0];
13460 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
13461 if (size == 8)
13462 unspec = UNSPEC_REV64;
13463 else if (size == 4)
13464 unspec = UNSPEC_REV32;
13465 else if (size == 2)
13466 unspec = UNSPEC_REV16;
13467 else
13468 return false;
13470 for (i = 0; i < nelt ; i += diff + 1)
13471 for (j = 0; j <= diff; j += 1)
13473 /* This is guaranteed to be true as the value of diff
12474 is 7, 3 or 1 and we should have enough elements in the
13475 queue to generate this. Getting a vector mask with a
13476 value of diff other than these values implies that
13477 something is wrong by the time we get here. */
13478 gcc_assert (i + j < nelt);
13479 if (d->perm[i + j] != i + diff - j)
13480 return false;
13483 /* Success! */
13484 if (d->testing_p)
13485 return true;
13487 emit_set_insn (d->target, gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0),
13488 unspec));
13489 return true;
13492 static bool
13493 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13495 rtx out = d->target;
13496 rtx in0;
13497 machine_mode vmode = d->vmode;
13498 unsigned int i, elt, nelt = d->perm.length ();
13499 rtx lane;
13501 elt = d->perm[0];
13502 for (i = 1; i < nelt; i++)
13504 if (elt != d->perm[i])
13505 return false;
13508 /* The generic preparation in aarch64_expand_vec_perm_const_1
13509 swaps the operand order and the permute indices if it finds
13510 d->perm[0] to be in the second operand. Thus, we can always
13511 use d->op0 and need not do any extra arithmetic to get the
13512 correct lane number. */
13513 in0 = d->op0;
13514 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13516 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
13517 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
13518 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
13519 return true;
13522 static bool
13523 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13525 rtx rperm[MAX_VECT_LEN], sel;
13526 machine_mode vmode = d->vmode;
13527 unsigned int i, nelt = d->perm.length ();
13529 if (d->testing_p)
13530 return true;
13532 /* Generic code will try constant permutation twice: once with the
13533 original mode and again with the elements lowered to QImode.
13534 So wait and don't do the selector expansion ourselves. */
13535 if (vmode != V8QImode && vmode != V16QImode)
13536 return false;
13538 for (i = 0; i < nelt; ++i)
13540 int nunits = GET_MODE_NUNITS (vmode);
13542 /* If big-endian and two vectors we end up with a weird mixed-endian
13543 mode on NEON. Reverse the index within each word but not the word
13544 itself. */
13545 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13546 : d->perm[i]);
13548 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13549 sel = force_reg (vmode, sel);
13551 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13552 return true;
13555 static bool
13556 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13558 /* The pattern matching functions above are written to look for a small
13559 number to begin the sequence (0, 1, N/2). If we begin with an index
13560 from the second operand, we can swap the operands. */
13561 unsigned int nelt = d->perm.length ();
13562 if (d->perm[0] >= nelt)
13564 gcc_assert (nelt == (nelt & -nelt));
13565 for (unsigned int i = 0; i < nelt; ++i)
13566 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13568 std::swap (d->op0, d->op1);
13571 if (TARGET_SIMD && nelt > 1)
13573 if (aarch64_evpc_rev (d))
13574 return true;
13575 else if (aarch64_evpc_ext (d))
13576 return true;
13577 else if (aarch64_evpc_dup (d))
13578 return true;
13579 else if (aarch64_evpc_zip (d))
13580 return true;
13581 else if (aarch64_evpc_uzp (d))
13582 return true;
13583 else if (aarch64_evpc_trn (d))
13584 return true;
13585 return aarch64_evpc_tbl (d);
13587 return false;
13590 /* Expand a vec_perm_const pattern with the operands given by TARGET,
13591 OP0, OP1 and SEL. NELT is the number of elements in the vector. */
13593 bool
13594 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel,
13595 unsigned int nelt)
13597 struct expand_vec_perm_d d;
13598 unsigned int i, which;
13600 d.target = target;
13601 d.op0 = op0;
13602 d.op1 = op1;
13604 d.vmode = GET_MODE (target);
13605 gcc_assert (VECTOR_MODE_P (d.vmode));
13606 d.testing_p = false;
13608 d.perm.reserve (nelt);
13609 for (i = which = 0; i < nelt; ++i)
13611 rtx e = XVECEXP (sel, 0, i);
13612 unsigned int ei = INTVAL (e) & (2 * nelt - 1);
13613 which |= (ei < nelt ? 1 : 2);
13614 d.perm.quick_push (ei);
13617 switch (which)
13619 default:
13620 gcc_unreachable ();
13622 case 3:
13623 d.one_vector_p = false;
13624 if (!rtx_equal_p (op0, op1))
13625 break;
13627 /* The elements of PERM do not suggest that only the first operand
13628 is used, but both operands are identical. Allow easier matching
13629 of the permutation by folding the permutation into the single
13630 input vector. */
13631 /* Fall Through. */
13632 case 2:
13633 for (i = 0; i < nelt; ++i)
13634 d.perm[i] &= nelt - 1;
13635 d.op0 = op1;
13636 d.one_vector_p = true;
13637 break;
13639 case 1:
13640 d.op1 = op0;
13641 d.one_vector_p = true;
13642 break;
13645 return aarch64_expand_vec_perm_const_1 (&d);
13648 static bool
13649 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
13651 struct expand_vec_perm_d d;
13652 unsigned int i, nelt, which;
13653 bool ret;
13655 d.vmode = vmode;
13656 d.testing_p = true;
13657 d.perm.safe_splice (sel);
13659 /* Calculate whether all elements are in one vector. */
13660 nelt = sel.length ();
13661 for (i = which = 0; i < nelt; ++i)
13663 unsigned int e = d.perm[i];
13664 gcc_assert (e < 2 * nelt);
13665 which |= (e < nelt ? 1 : 2);
13668 /* If all elements are from the second vector, reindex as if from the
13669 first vector. */
13670 if (which == 2)
13671 for (i = 0; i < nelt; ++i)
13672 d.perm[i] -= nelt;
13674 /* Check whether the mask can be applied to a single vector. */
13675 d.one_vector_p = (which != 3);
13677 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13678 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13679 if (!d.one_vector_p)
13680 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13682 start_sequence ();
13683 ret = aarch64_expand_vec_perm_const_1 (&d);
13684 end_sequence ();
13686 return ret;
13689 /* Generate a byte permute mask for a register of mode MODE,
13690 which has NUNITS units. */
13693 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
13695 /* We have to reverse each vector because we don't have
13696 a permuted load that can reverse-load according to ABI rules. */
13697 rtx mask;
13698 rtvec v = rtvec_alloc (16);
13699 unsigned int i, j;
13700 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
13702 gcc_assert (BYTES_BIG_ENDIAN);
13703 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13705 for (i = 0; i < nunits; i++)
13706 for (j = 0; j < usize; j++)
13707 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13708 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13709 return force_reg (V16QImode, mask);
13712 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13713 true. However due to issues with register allocation it is preferable
13714 to avoid tying integer scalar and FP scalar modes. Executing integer
13715 operations in general registers is better than treating them as scalar
13716 vector operations. This reduces latency and avoids redundant int<->FP
13717 moves. So tie modes if they are either the same class, or vector modes
13718 with other vector modes, vector structs or any scalar mode. */
13720 static bool
13721 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13723 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13724 return true;
13726 /* We specifically want to allow elements of "structure" modes to
13727 be tieable to the structure. This more general condition allows
13728 other rarer situations too. */
13729 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13730 return true;
13732 /* Also allow any scalar modes with vectors. */
13733 if (aarch64_vector_mode_supported_p (mode1)
13734 || aarch64_vector_mode_supported_p (mode2))
13735 return true;
13737 return false;
13740 /* Return a new RTX holding the result of moving POINTER forward by
13741 AMOUNT bytes. */
13743 static rtx
13744 aarch64_move_pointer (rtx pointer, int amount)
13746 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13748 return adjust_automodify_address (pointer, GET_MODE (pointer),
13749 next, amount);
13752 /* Return a new RTX holding the result of moving POINTER forward by the
13753 size of the mode it points to. */
13755 static rtx
13756 aarch64_progress_pointer (rtx pointer)
13758 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13760 return aarch64_move_pointer (pointer, amount);
13763 /* Copy one MODE-sized block from SRC to DST, then progress SRC and DST by
13764    the size of MODE in bytes. */
13766 static void
13767 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13768 machine_mode mode)
13770 rtx reg = gen_reg_rtx (mode);
13772 /* "Cast" the pointers to the correct mode. */
13773 *src = adjust_address (*src, mode, 0);
13774 *dst = adjust_address (*dst, mode, 0);
13775 /* Emit the memcpy. */
13776 emit_move_insn (reg, *src);
13777 emit_move_insn (*dst, reg);
13778 /* Move the pointers forward. */
13779 *src = aarch64_progress_pointer (*src);
13780 *dst = aarch64_progress_pointer (*dst);
13783 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13784 we succeed, otherwise return false. */
13786 bool
13787 aarch64_expand_movmem (rtx *operands)
13789 unsigned int n;
13790 rtx dst = operands[0];
13791 rtx src = operands[1];
13792 rtx base;
13793 bool speed_p = !optimize_function_for_size_p (cfun);
13795 /* When optimizing for size, give a better estimate of the length of a
13796 memcpy call, but use the default otherwise. */
13797 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13799 /* We can't do anything smart if the amount to copy is not constant. */
13800 if (!CONST_INT_P (operands[2]))
13801 return false;
13803 n = UINTVAL (operands[2]);
13805 /* Try to keep the number of instructions low. For cases below 16 bytes we
13806 need to make at most two moves. For cases above 16 bytes it will be one
13807 move for each 16 byte chunk, then at most two additional moves. */
13808 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13809 return false;
13811 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13812 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13814 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13815 src = adjust_automodify_address (src, VOIDmode, base, 0);
13817 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13818 1-byte chunk. */
13819 if (n < 4)
13821 if (n >= 2)
13823 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13824 n -= 2;
13827 if (n == 1)
13828 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13830 return true;
13833 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13834 4-byte chunk, partially overlapping with the previously copied chunk. */
13835 if (n < 8)
13837 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13838 n -= 4;
13839 if (n > 0)
13841 int move = n - 4;
13843 src = aarch64_move_pointer (src, move);
13844 dst = aarch64_move_pointer (dst, move);
13845 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13847 return true;
13850 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13851 them, then (if applicable) an 8-byte chunk. */
13852 while (n >= 8)
13854 if (n / 16)
13856 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13857 n -= 16;
13859 else
13861 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13862 n -= 8;
13866 /* Finish the final bytes of the copy. We can always do this in one
13867 instruction. We either copy the exact amount we need, or partially
13868 overlap with the previous chunk we copied and copy 8 bytes. */
13869 if (n == 0)
13870 return true;
13871 else if (n == 1)
13872 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13873 else if (n == 2)
13874 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13875 else if (n == 4)
13876 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13877 else
13879 if (n == 3)
13881 src = aarch64_move_pointer (src, -1);
13882 dst = aarch64_move_pointer (dst, -1);
13883 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13885 else
13887 int move = n - 8;
13889 src = aarch64_move_pointer (src, move);
13890 dst = aarch64_move_pointer (dst, move);
13891 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13895 return true;
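/* As a concrete illustration of the strategy above: a 14-byte copy is
   expanded as one 8-byte block for bytes 0-7 followed by a second,
   overlapping 8-byte block for bytes 6-13, i.e. roughly two LDR/STR
   pairs and no byte-by-byte tail.  */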
13898 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13899 SImode stores. Handle the case when the constant has identical
13900 bottom and top halves. This is beneficial when the two stores can be
13901 merged into an STP and we avoid synthesising potentially expensive
13902 immediates twice. Return true if such a split is possible. */
13904 bool
13905 aarch64_split_dimode_const_store (rtx dst, rtx src)
13907 rtx lo = gen_lowpart (SImode, src);
13908 rtx hi = gen_highpart_mode (SImode, DImode, src);
13910 bool size_p = optimize_function_for_size_p (cfun);
13912 if (!rtx_equal_p (lo, hi))
13913 return false;
13915 unsigned int orig_cost
13916 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13917 unsigned int lo_cost
13918 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13920 /* We want to transform:
13921 MOV x1, 49370
13922 MOVK x1, 0x140, lsl 16
13923 MOVK x1, 0xc0da, lsl 32
13924 MOVK x1, 0x140, lsl 48
13925 STR x1, [x0]
13926 into:
13927 MOV w1, 49370
13928 MOVK w1, 0x140, lsl 16
13929 STP w1, w1, [x0]
13930 So we want to perform this only when we save two instructions
13931 or more. When optimizing for size, however, accept any code size
13932 savings we can. */
13933 if (size_p && orig_cost <= lo_cost)
13934 return false;
13936 if (!size_p
13937 && (orig_cost <= lo_cost + 1))
13938 return false;
13940 rtx mem_lo = adjust_address (dst, SImode, 0);
13941 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13942 return false;
13944 rtx tmp_reg = gen_reg_rtx (SImode);
13945 aarch64_expand_mov_immediate (tmp_reg, lo);
13946 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13947 /* Don't emit an explicit store pair as this may not be always profitable.
13948 Let the sched-fusion logic decide whether to merge them. */
13949 emit_move_insn (mem_lo, tmp_reg);
13950 emit_move_insn (mem_hi, tmp_reg);
13952 return true;
13955 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13957 static unsigned HOST_WIDE_INT
13958 aarch64_asan_shadow_offset (void)
13960 return (HOST_WIDE_INT_1 << 36);
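/* That is, with the default shadow scale of 3, the shadow byte for an
   address A lives at roughly (A >> 3) + (1 << 36).  */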
13963 static bool
13964 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13965 unsigned int align,
13966 enum by_pieces_operation op,
13967 bool speed_p)
13969 /* STORE_BY_PIECES can be used when copying a constant string, but
13970 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13971 For now we always fail this and let the move_by_pieces code copy
13972 the string from read-only memory. */
13973 if (op == STORE_BY_PIECES)
13974 return false;
13976 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
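/* Implement TARGET_GEN_CCMP_FIRST.  Emit the RTL for the first compare of
   a conditional-compare chain into *PREP_SEQ/*GEN_SEQ and return the
   comparison of the CC register against zero, or NULL_RTX if the
   comparison cannot be handled.  */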
13979 static rtx
13980 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13981 int code, tree treeop0, tree treeop1)
13983 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13984 rtx op0, op1;
13985 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13986 insn_code icode;
13987 struct expand_operand ops[4];
13989 start_sequence ();
13990 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13992 op_mode = GET_MODE (op0);
13993 if (op_mode == VOIDmode)
13994 op_mode = GET_MODE (op1);
13996 switch (op_mode)
13998 case E_QImode:
13999 case E_HImode:
14000 case E_SImode:
14001 cmp_mode = SImode;
14002 icode = CODE_FOR_cmpsi;
14003 break;
14005 case E_DImode:
14006 cmp_mode = DImode;
14007 icode = CODE_FOR_cmpdi;
14008 break;
14010 case E_SFmode:
14011 cmp_mode = SFmode;
14012 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14013 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14014 break;
14016 case E_DFmode:
14017 cmp_mode = DFmode;
14018 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14019 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14020 break;
14022 default:
14023 end_sequence ();
14024 return NULL_RTX;
14027 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14028 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14029 if (!op0 || !op1)
14031 end_sequence ();
14032 return NULL_RTX;
14034 *prep_seq = get_insns ();
14035 end_sequence ();
14037 create_fixed_operand (&ops[0], op0);
14038 create_fixed_operand (&ops[1], op1);
14040 start_sequence ();
14041 if (!maybe_expand_insn (icode, 2, ops))
14043 end_sequence ();
14044 return NULL_RTX;
14046 *gen_seq = get_insns ();
14047 end_sequence ();
14049 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14050 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
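/* Implement TARGET_GEN_CCMP_NEXT.  Emit the next, conditional compare of
   the chain as a CCMP/FCCMP.  For a combined condition such as
   (a == b && c == d) the resulting code is roughly:
       cmp   w0, w1
       ccmp  w2, w3, #0, eq
       b.eq  <target>  */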
14053 static rtx
14054 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14055 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14057 rtx op0, op1, target;
14058 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14059 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14060 insn_code icode;
14061 struct expand_operand ops[6];
14062 int aarch64_cond;
14064 push_to_sequence (*prep_seq);
14065 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14067 op_mode = GET_MODE (op0);
14068 if (op_mode == VOIDmode)
14069 op_mode = GET_MODE (op1);
14071 switch (op_mode)
14073 case E_QImode:
14074 case E_HImode:
14075 case E_SImode:
14076 cmp_mode = SImode;
14077 icode = CODE_FOR_ccmpsi;
14078 break;
14080 case E_DImode:
14081 cmp_mode = DImode;
14082 icode = CODE_FOR_ccmpdi;
14083 break;
14085 case E_SFmode:
14086 cmp_mode = SFmode;
14087 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14088 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14089 break;
14091 case E_DFmode:
14092 cmp_mode = DFmode;
14093 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14094 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14095 break;
14097 default:
14098 end_sequence ();
14099 return NULL_RTX;
14102 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14103 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14104 if (!op0 || !op1)
14106 end_sequence ();
14107 return NULL_RTX;
14109 *prep_seq = get_insns ();
14110 end_sequence ();
14112 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14113 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14115 if (bit_code != AND)
14117 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14118 GET_MODE (XEXP (prev, 0))),
14119 VOIDmode, XEXP (prev, 0), const0_rtx);
14120 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14123 create_fixed_operand (&ops[0], XEXP (prev, 0));
14124 create_fixed_operand (&ops[1], target);
14125 create_fixed_operand (&ops[2], op0);
14126 create_fixed_operand (&ops[3], op1);
14127 create_fixed_operand (&ops[4], prev);
14128 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14130 push_to_sequence (*gen_seq);
14131 if (!maybe_expand_insn (icode, 6, ops))
14133 end_sequence ();
14134 return NULL_RTX;
14137 *gen_seq = get_insns ();
14138 end_sequence ();
14140 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14143 #undef TARGET_GEN_CCMP_FIRST
14144 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14146 #undef TARGET_GEN_CCMP_NEXT
14147 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14149 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14150 instruction fusion of some sort. */
14152 static bool
14153 aarch64_macro_fusion_p (void)
14155 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14159 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14160 should be kept together during scheduling. */
14162 static bool
14163 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14165 rtx set_dest;
14166 rtx prev_set = single_set (prev);
14167 rtx curr_set = single_set (curr);
14168 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14169 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14171 if (!aarch64_macro_fusion_p ())
14172 return false;
14174 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14176 /* We are trying to match:
14177 prev (mov) == (set (reg r0) (const_int imm16))
14178 curr (movk) == (set (zero_extract (reg r0)
14179 (const_int 16)
14180 (const_int 16))
14181 (const_int imm16_1)) */
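/* In assembly terms this is, e.g., "mov w0, #0x1234" immediately
   followed by "movk w0, #0x5678, lsl 16".  */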
14183 set_dest = SET_DEST (curr_set);
14185 if (GET_CODE (set_dest) == ZERO_EXTRACT
14186 && CONST_INT_P (SET_SRC (curr_set))
14187 && CONST_INT_P (SET_SRC (prev_set))
14188 && CONST_INT_P (XEXP (set_dest, 2))
14189 && INTVAL (XEXP (set_dest, 2)) == 16
14190 && REG_P (XEXP (set_dest, 0))
14191 && REG_P (SET_DEST (prev_set))
14192 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14194 return true;
14198 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14201 /* We're trying to match:
14202 prev (adrp) == (set (reg r1)
14203 (high (symbol_ref ("SYM"))))
14204 curr (add) == (set (reg r0)
14205 (lo_sum (reg r1)
14206 (symbol_ref ("SYM"))))
14207 Note that r0 need not necessarily be the same as r1, especially
14208 during pre-regalloc scheduling. */
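/* E.g. "adrp x1, sym" immediately followed by "add x0, x1, :lo12:sym".  */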
14210 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14211 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14213 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14214 && REG_P (XEXP (SET_SRC (curr_set), 0))
14215 && REGNO (XEXP (SET_SRC (curr_set), 0))
14216 == REGNO (SET_DEST (prev_set))
14217 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14218 XEXP (SET_SRC (curr_set), 1)))
14219 return true;
14223 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14226 /* We're trying to match:
14227 prev (movk) == (set (zero_extract (reg r0)
14228 (const_int 16)
14229 (const_int 32))
14230 (const_int imm16_1))
14231 curr (movk) == (set (zero_extract (reg r0)
14232 (const_int 16)
14233 (const_int 48))
14234 (const_int imm16_2)) */
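/* E.g. "movk x0, #0x1234, lsl 32" immediately followed by
   "movk x0, #0x5678, lsl 48".  */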
14236 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14237 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14238 && REG_P (XEXP (SET_DEST (prev_set), 0))
14239 && REG_P (XEXP (SET_DEST (curr_set), 0))
14240 && REGNO (XEXP (SET_DEST (prev_set), 0))
14241 == REGNO (XEXP (SET_DEST (curr_set), 0))
14242 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14243 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14244 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14245 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14246 && CONST_INT_P (SET_SRC (prev_set))
14247 && CONST_INT_P (SET_SRC (curr_set)))
14248 return true;
14251 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14253 /* We're trying to match:
14254 prev (adrp) == (set (reg r0)
14255 (high (symbol_ref ("SYM"))))
14256 curr (ldr) == (set (reg r1)
14257 (mem (lo_sum (reg r0)
14258 (symbol_ref ("SYM")))))
14260 curr (ldr) == (set (reg r1)
14261 (zero_extend (mem
14262 (lo_sum (reg r0)
14263 (symbol_ref ("SYM")))))) */
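/* E.g. "adrp x0, sym" immediately followed by "ldr x1, [x0, :lo12:sym]".  */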
14264 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14265 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14267 rtx curr_src = SET_SRC (curr_set);
14269 if (GET_CODE (curr_src) == ZERO_EXTEND)
14270 curr_src = XEXP (curr_src, 0);
14272 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14273 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14274 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14275 == REGNO (SET_DEST (prev_set))
14276 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14277 XEXP (SET_SRC (prev_set), 0)))
14278 return true;
14282 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14283 && aarch_crypto_can_dual_issue (prev, curr))
14284 return true;
14286 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14287 && any_condjump_p (curr))
14289 enum attr_type prev_type = get_attr_type (prev);
14291 unsigned int condreg1, condreg2;
14292 rtx cc_reg_1;
14293 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14294 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14296 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14297 && prev
14298 && modified_in_p (cc_reg_1, prev))
14300 /* FIXME: this misses some cases that are considered simple arithmetic
14301 instructions for ThunderX. Simple shifts are missed here. */
14302 if (prev_type == TYPE_ALUS_SREG
14303 || prev_type == TYPE_ALUS_IMM
14304 || prev_type == TYPE_LOGICS_REG
14305 || prev_type == TYPE_LOGICS_IMM)
14306 return true;
14310 if (prev_set
14311 && curr_set
14312 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14313 && any_condjump_p (curr))
14315 /* We're trying to match:
14316 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14317 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14318 (const_int 0))
14319 (label_ref ("SYM"))
14320 (pc)) */
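/* E.g. "add w0, w0, #1" immediately followed by "cbnz w0, label".  */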
14321 if (SET_DEST (curr_set) == (pc_rtx)
14322 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14323 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14324 && REG_P (SET_DEST (prev_set))
14325 && REGNO (SET_DEST (prev_set))
14326 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14328 /* Fuse ALU operations followed by conditional branch instruction. */
14329 switch (get_attr_type (prev))
14331 case TYPE_ALU_IMM:
14332 case TYPE_ALU_SREG:
14333 case TYPE_ADC_REG:
14334 case TYPE_ADC_IMM:
14335 case TYPE_ADCS_REG:
14336 case TYPE_ADCS_IMM:
14337 case TYPE_LOGIC_REG:
14338 case TYPE_LOGIC_IMM:
14339 case TYPE_CSEL:
14340 case TYPE_ADR:
14341 case TYPE_MOV_IMM:
14342 case TYPE_SHIFT_REG:
14343 case TYPE_SHIFT_IMM:
14344 case TYPE_BFM:
14345 case TYPE_RBIT:
14346 case TYPE_REV:
14347 case TYPE_EXTEND:
14348 return true;
14350 default:;
14355 return false;
14358 /* Return true iff the instruction fusion described by OP is enabled. */
14360 bool
14361 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14363 return (aarch64_tune_params.fusible_ops & op) != 0;
14366 /* If MEM is in the form of [base+offset], extract the two parts
14367 of the address and store them in BASE and OFFSET, otherwise return false
14368 after clearing BASE and OFFSET. */
14370 bool
14371 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14373 rtx addr;
14375 gcc_assert (MEM_P (mem));
14377 addr = XEXP (mem, 0);
14379 if (REG_P (addr))
14381 *base = addr;
14382 *offset = const0_rtx;
14383 return true;
14386 if (GET_CODE (addr) == PLUS
14387 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14389 *base = XEXP (addr, 0);
14390 *offset = XEXP (addr, 1);
14391 return true;
14394 *base = NULL_RTX;
14395 *offset = NULL_RTX;
14397 return false;
14400 /* Types for scheduling fusion. */
14401 enum sched_fusion_type
14403 SCHED_FUSION_NONE = 0,
14404 SCHED_FUSION_LD_SIGN_EXTEND,
14405 SCHED_FUSION_LD_ZERO_EXTEND,
14406 SCHED_FUSION_LD,
14407 SCHED_FUSION_ST,
14408 SCHED_FUSION_NUM
14411 /* If INSN is a load or store whose address is in the form of [base+offset],
14412 extract the two parts and store them in BASE and OFFSET. Return the
14413 scheduling fusion type of this INSN. */
14415 static enum sched_fusion_type
14416 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14418 rtx x, dest, src;
14419 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14421 gcc_assert (INSN_P (insn));
14422 x = PATTERN (insn);
14423 if (GET_CODE (x) != SET)
14424 return SCHED_FUSION_NONE;
14426 src = SET_SRC (x);
14427 dest = SET_DEST (x);
14429 machine_mode dest_mode = GET_MODE (dest);
14431 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14432 return SCHED_FUSION_NONE;
14434 if (GET_CODE (src) == SIGN_EXTEND)
14436 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14437 src = XEXP (src, 0);
14438 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14439 return SCHED_FUSION_NONE;
14441 else if (GET_CODE (src) == ZERO_EXTEND)
14443 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14444 src = XEXP (src, 0);
14445 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14446 return SCHED_FUSION_NONE;
14449 if (GET_CODE (src) == MEM && REG_P (dest))
14450 extract_base_offset_in_addr (src, base, offset);
14451 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14453 fusion = SCHED_FUSION_ST;
14454 extract_base_offset_in_addr (dest, base, offset);
14456 else
14457 return SCHED_FUSION_NONE;
14459 if (*base == NULL_RTX || *offset == NULL_RTX)
14460 fusion = SCHED_FUSION_NONE;
14462 return fusion;
14465 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14467 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14468 and PRI are only calculated for these instructions. For other instructions,
14469 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
14470 other instruction types can be added by returning different priorities.
14472 It's important that irrelevant instructions get the largest FUSION_PRI. */
14474 static void
14475 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14476 int *fusion_pri, int *pri)
14478 int tmp, off_val;
14479 rtx base, offset;
14480 enum sched_fusion_type fusion;
14482 gcc_assert (INSN_P (insn));
14484 tmp = max_pri - 1;
14485 fusion = fusion_load_store (insn, &base, &offset);
14486 if (fusion == SCHED_FUSION_NONE)
14488 *pri = tmp;
14489 *fusion_pri = tmp;
14490 return;
14493 /* Set FUSION_PRI according to fusion type and base register. */
14494 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14496 /* Calculate PRI. */
14497 tmp /= 2;
14499 /* INSN with smaller offset goes first. */
14500 off_val = (int)(INTVAL (offset));
14501 if (off_val >= 0)
14502 tmp -= (off_val & 0xfffff);
14503 else
14504 tmp += ((- off_val) & 0xfffff);
14506 *pri = tmp;
14507 return;
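/* For example, for the loads "ldr w0, [x1, 4]" and "ldr w1, [x1, 8]"
   both insns get the same FUSION_PRI (same fusion type and base
   register), while the load at offset 4 gets the larger PRI and is
   therefore scheduled first.  */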
14510 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14511 Adjust priority of sha1h instructions so they are scheduled before
14512 other SHA1 instructions. */
14514 static int
14515 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14517 rtx x = PATTERN (insn);
14519 if (GET_CODE (x) == SET)
14521 x = SET_SRC (x);
14523 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14524 return priority + 10;
14527 return priority;
14530 /* Given OPERANDS of consecutive load/store, check if we can merge
14531 them into ldp/stp. LOAD is true if they are load instructions.
14532 MODE is the mode of memory operands. */
14534 bool
14535 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14536 machine_mode mode)
14538 HOST_WIDE_INT offval_1, offval_2, msize;
14539 enum reg_class rclass_1, rclass_2;
14540 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14542 if (load)
14544 mem_1 = operands[1];
14545 mem_2 = operands[3];
14546 reg_1 = operands[0];
14547 reg_2 = operands[2];
14548 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14549 if (REGNO (reg_1) == REGNO (reg_2))
14550 return false;
14552 else
14554 mem_1 = operands[0];
14555 mem_2 = operands[2];
14556 reg_1 = operands[1];
14557 reg_2 = operands[3];
14560 /* The mems cannot be volatile. */
14561 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14562 return false;
14564 /* If we have SImode and slow unaligned ldp,
14565 check that the alignment is at least 8 bytes. */
14566 if (mode == SImode
14567 && (aarch64_tune_params.extra_tuning_flags
14568 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14569 && !optimize_size
14570 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14571 return false;
14573 /* Check if the addresses are in the form of [base+offset]. */
14574 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14575 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14576 return false;
14577 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14578 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14579 return false;
14581 /* Check if the bases are the same. */
14582 if (!rtx_equal_p (base_1, base_2))
14583 return false;
14585 offval_1 = INTVAL (offset_1);
14586 offval_2 = INTVAL (offset_2);
14587 msize = GET_MODE_SIZE (mode);
14588 /* Check if the offsets are consecutive. */
14589 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14590 return false;
14592 /* Check if the addresses are clobbered by load. */
14593 if (load)
14595 if (reg_mentioned_p (reg_1, mem_1))
14596 return false;
14598 /* In increasing order, the last load can clobber the address. */
14599 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14600 return false;
14603 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14604 rclass_1 = FP_REGS;
14605 else
14606 rclass_1 = GENERAL_REGS;
14608 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14609 rclass_2 = FP_REGS;
14610 else
14611 rclass_2 = GENERAL_REGS;
14613 /* Check if the registers are of the same class. */
14614 if (rclass_1 != rclass_2)
14615 return false;
14617 return true;
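/* For example, the loads "ldr w0, [x3]" and "ldr w1, [x3, 4]" satisfy
   all of the checks above, so the ldp/stp peepholes can merge them
   into "ldp w0, w1, [x3]".  */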
14620 /* Given OPERANDS of consecutive load/store, check if we can merge
14621 them into ldp/stp by adjusting the offset. LOAD is true if they
14622 are load instructions. MODE is the mode of memory operands.
14624 Given the following consecutive stores:
14626 str w1, [xb, 0x100]
14627 str w1, [xb, 0x104]
14628 str w1, [xb, 0x108]
14629 str w1, [xb, 0x10c]
14631 Though the offsets are out of the range supported by stp, we can
14632 still pair them after adjusting the offset, like:
14634 add scratch, xb, 0x100
14635 stp w1, w1, [scratch]
14636 stp w1, w1, [scratch, 0x8]
14638 The peephole patterns detecting this opportunity should guarantee
14639 the scratch register is available. */
14641 bool
14642 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14643 scalar_mode mode)
14645 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14646 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14647 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14648 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14650 if (load)
14652 reg_1 = operands[0];
14653 mem_1 = operands[1];
14654 reg_2 = operands[2];
14655 mem_2 = operands[3];
14656 reg_3 = operands[4];
14657 mem_3 = operands[5];
14658 reg_4 = operands[6];
14659 mem_4 = operands[7];
14660 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14661 && REG_P (reg_3) && REG_P (reg_4));
14662 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14663 return false;
14665 else
14667 mem_1 = operands[0];
14668 reg_1 = operands[1];
14669 mem_2 = operands[2];
14670 reg_2 = operands[3];
14671 mem_3 = operands[4];
14672 reg_3 = operands[5];
14673 mem_4 = operands[6];
14674 reg_4 = operands[7];
14676 /* Skip if the memory operand is by itself valid for ldp/stp. */
14677 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14678 return false;
14680 /* The mems cannot be volatile. */
14681 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14682 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14683 return false;
14685 /* Check if the addresses are in the form of [base+offset]. */
14686 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14687 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14688 return false;
14689 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14690 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14691 return false;
14692 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14693 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14694 return false;
14695 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14696 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14697 return false;
14699 /* Check if the bases are the same. */
14700 if (!rtx_equal_p (base_1, base_2)
14701 || !rtx_equal_p (base_2, base_3)
14702 || !rtx_equal_p (base_3, base_4))
14703 return false;
14705 offval_1 = INTVAL (offset_1);
14706 offval_2 = INTVAL (offset_2);
14707 offval_3 = INTVAL (offset_3);
14708 offval_4 = INTVAL (offset_4);
14709 msize = GET_MODE_SIZE (mode);
14710 /* Check if the offsets are consecutive. */
14711 if ((offval_1 != (offval_2 + msize)
14712 || offval_1 != (offval_3 + msize * 2)
14713 || offval_1 != (offval_4 + msize * 3))
14714 && (offval_4 != (offval_3 + msize)
14715 || offval_4 != (offval_2 + msize * 2)
14716 || offval_4 != (offval_1 + msize * 3)))
14717 return false;
14719 /* Check if the addresses are clobbered by load. */
14720 if (load)
14722 if (reg_mentioned_p (reg_1, mem_1)
14723 || reg_mentioned_p (reg_2, mem_2)
14724 || reg_mentioned_p (reg_3, mem_3))
14725 return false;
14727 /* In increasing order, the last load can clobber the address. */
14728 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14729 return false;
14732 /* If we have SImode and slow unaligned ldp,
14733 check that the alignment is at least 8 bytes. */
14734 if (mode == SImode
14735 && (aarch64_tune_params.extra_tuning_flags
14736 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14737 && !optimize_size
14738 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14739 return false;
14741 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14742 rclass_1 = FP_REGS;
14743 else
14744 rclass_1 = GENERAL_REGS;
14746 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14747 rclass_2 = FP_REGS;
14748 else
14749 rclass_2 = GENERAL_REGS;
14751 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14752 rclass_3 = FP_REGS;
14753 else
14754 rclass_3 = GENERAL_REGS;
14756 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14757 rclass_4 = FP_REGS;
14758 else
14759 rclass_4 = GENERAL_REGS;
14761 /* Check if the registers are of the same class. */
14762 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14763 return false;
14765 return true;
14768 /* Given OPERANDS of consecutive load/store, this function pairs them
14769 into ldp/stp after adjusting the offset. It depends on the fact
14770 that addresses of load/store instructions are in increasing order.
14771 MODE is the mode of memory operands. CODE is the rtl operator
14772 which should be applied to all memory operands; it is SIGN_EXTEND,
14773 ZERO_EXTEND or UNKNOWN. */
14775 bool
14776 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14777 scalar_mode mode, RTX_CODE code)
14779 rtx base, offset, t1, t2;
14780 rtx mem_1, mem_2, mem_3, mem_4;
14781 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14783 if (load)
14785 mem_1 = operands[1];
14786 mem_2 = operands[3];
14787 mem_3 = operands[5];
14788 mem_4 = operands[7];
14790 else
14792 mem_1 = operands[0];
14793 mem_2 = operands[2];
14794 mem_3 = operands[4];
14795 mem_4 = operands[6];
14796 gcc_assert (code == UNKNOWN);
14799 extract_base_offset_in_addr (mem_1, &base, &offset);
14800 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14802 /* Adjust the offset so it can fit in an ldp/stp instruction. */
14803 msize = GET_MODE_SIZE (mode);
14804 stp_off_limit = msize * 0x40;
14805 off_val = INTVAL (offset);
14806 abs_off = (off_val < 0) ? -off_val : off_val;
14807 new_off = abs_off % stp_off_limit;
14808 adj_off = abs_off - new_off;
14810 /* Further adjust to make sure all offsets are OK. */
14811 if ((new_off + msize * 2) >= stp_off_limit)
14813 adj_off += stp_off_limit;
14814 new_off -= stp_off_limit;
14817 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14818 if (adj_off >= 0x1000)
14819 return false;
14821 if (off_val < 0)
14823 adj_off = -adj_off;
14824 new_off = -new_off;
14827 /* Create new memory references. */
14828 mem_1 = change_address (mem_1, VOIDmode,
14829 plus_constant (DImode, operands[8], new_off));
14831 /* Check if the adjusted address is OK for ldp/stp. */
14832 if (!aarch64_mem_pair_operand (mem_1, mode))
14833 return false;
14835 msize = GET_MODE_SIZE (mode);
14836 mem_2 = change_address (mem_2, VOIDmode,
14837 plus_constant (DImode,
14838 operands[8],
14839 new_off + msize));
14840 mem_3 = change_address (mem_3, VOIDmode,
14841 plus_constant (DImode,
14842 operands[8],
14843 new_off + msize * 2));
14844 mem_4 = change_address (mem_4, VOIDmode,
14845 plus_constant (DImode,
14846 operands[8],
14847 new_off + msize * 3));
14849 if (code == ZERO_EXTEND)
14851 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14852 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14853 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14854 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14856 else if (code == SIGN_EXTEND)
14858 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14859 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14860 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14861 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14864 if (load)
14866 operands[1] = mem_1;
14867 operands[3] = mem_2;
14868 operands[5] = mem_3;
14869 operands[7] = mem_4;
14871 else
14873 operands[0] = mem_1;
14874 operands[2] = mem_2;
14875 operands[4] = mem_3;
14876 operands[6] = mem_4;
14879 /* Emit adjusting instruction. */
14880 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14881 /* Emit ldp/stp instructions. */
14882 t1 = gen_rtx_SET (operands[0], operands[1]);
14883 t2 = gen_rtx_SET (operands[2], operands[3]);
14884 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14885 t1 = gen_rtx_SET (operands[4], operands[5]);
14886 t2 = gen_rtx_SET (operands[6], operands[7]);
14887 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14888 return true;
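/* Working through the stp example quoted before
   aarch64_operands_adjust_ok_for_ldpstp: for SImode, msize is 4 and
   stp_off_limit is 0x100, so an original offset of 0x100 gives
   abs_off == 0x100, new_off == 0 and adj_off == 0x100.  The code above
   then emits "add scratch, xb, 0x100" followed by two stp instructions
   at [scratch] and [scratch, 8].  */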
14891 /* Return true if a pseudo register should be created and used to hold
14892 the GOT address for PIC code. */
14894 bool
14895 aarch64_use_pseudo_pic_reg (void)
14897 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14900 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14902 static int
14903 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14905 switch (XINT (x, 1))
14907 case UNSPEC_GOTSMALLPIC:
14908 case UNSPEC_GOTSMALLPIC28K:
14909 case UNSPEC_GOTTINYPIC:
14910 return 0;
14911 default:
14912 break;
14915 return default_unspec_may_trap_p (x, flags);
14919 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14920 return the log2 of that value. Otherwise return -1. */
14922 int
14923 aarch64_fpconst_pow_of_2 (rtx x)
14925 const REAL_VALUE_TYPE *r;
14927 if (!CONST_DOUBLE_P (x))
14928 return -1;
14930 r = CONST_DOUBLE_REAL_VALUE (x);
14932 if (REAL_VALUE_NEGATIVE (*r)
14933 || REAL_VALUE_ISNAN (*r)
14934 || REAL_VALUE_ISINF (*r)
14935 || !real_isinteger (r, DFmode))
14936 return -1;
14938 return exact_log2 (real_to_integer (r));
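/* E.g. 8.0 yields 3 and 1.0 yields 0, while 0.75, -4.0 and NaN all
   yield -1.  */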
14941 /* If X is a vector of equal CONST_DOUBLE values and that value is
14942 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14944 int
14945 aarch64_vec_fpconst_pow_of_2 (rtx x)
14947 if (GET_CODE (x) != CONST_VECTOR)
14948 return -1;
14950 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14951 return -1;
14953 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14954 if (firstval <= 0)
14955 return -1;
14957 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14958 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14959 return -1;
14961 return firstval;
14964 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14965 to float.
14967 __fp16 always promotes through this hook.
14968 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14969 through the generic excess precision logic rather than here. */
14971 static tree
14972 aarch64_promoted_type (const_tree t)
14974 if (SCALAR_FLOAT_TYPE_P (t)
14975 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14976 return float_type_node;
14978 return NULL_TREE;
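/* For example, given "__fp16 a, b;" the expression "a + b" is evaluated
   in float and only converted back to __fp16 if the result is stored in
   an __fp16 object.  */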
14981 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14983 static bool
14984 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14985 optimization_type opt_type)
14987 switch (op)
14989 case rsqrt_optab:
14990 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14992 default:
14993 return true;
14997 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14998 if MODE is HFmode, and punt to the generic implementation otherwise. */
15000 static bool
15001 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15003 return (mode == HFmode
15004 ? true
15005 : default_libgcc_floating_mode_supported_p (mode));
15008 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15009 if MODE is HFmode, and punt to the generic implementation otherwise. */
15011 static bool
15012 aarch64_scalar_mode_supported_p (scalar_mode mode)
15014 return (mode == HFmode
15015 ? true
15016 : default_scalar_mode_supported_p (mode));
15019 /* Set the value of FLT_EVAL_METHOD.
15020 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15022 0: evaluate all operations and constants, whose semantic type has at
15023 most the range and precision of type float, to the range and
15024 precision of float; evaluate all other operations and constants to
15025 the range and precision of the semantic type;
15027 N, where _FloatN is a supported interchange floating type
15028 evaluate all operations and constants, whose semantic type has at
15029 most the range and precision of _FloatN type, to the range and
15030 precision of the _FloatN type; evaluate all other operations and
15031 constants to the range and precision of the semantic type;
15033 If we have the ARMv8.2-A extensions then we support _Float16 in native
15034 precision, so we should set this to 16. Otherwise, we support the type,
15035 but want to evaluate expressions in float precision, so set this to
15036 0. */
15038 static enum flt_eval_method
15039 aarch64_excess_precision (enum excess_precision_type type)
15041 switch (type)
15043 case EXCESS_PRECISION_TYPE_FAST:
15044 case EXCESS_PRECISION_TYPE_STANDARD:
15045 /* We can calculate either in 16-bit range and precision or
15046 32-bit range and precision. Make that decision based on whether
15047 we have native support for the ARMv8.2-A 16-bit floating-point
15048 instructions or not. */
15049 return (TARGET_FP_F16INST
15050 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15051 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15052 case EXCESS_PRECISION_TYPE_IMPLICIT:
15053 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15054 default:
15055 gcc_unreachable ();
15057 return FLT_EVAL_METHOD_UNPREDICTABLE;
15060 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15061 scheduled for speculative execution. Reject the long-running division
15062 and square-root instructions. */
15064 static bool
15065 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15067 switch (get_attr_type (insn))
15069 case TYPE_SDIV:
15070 case TYPE_UDIV:
15071 case TYPE_FDIVS:
15072 case TYPE_FDIVD:
15073 case TYPE_FSQRTS:
15074 case TYPE_FSQRTD:
15075 case TYPE_NEON_FP_SQRT_S:
15076 case TYPE_NEON_FP_SQRT_D:
15077 case TYPE_NEON_FP_SQRT_S_Q:
15078 case TYPE_NEON_FP_SQRT_D_Q:
15079 case TYPE_NEON_FP_DIV_S:
15080 case TYPE_NEON_FP_DIV_D:
15081 case TYPE_NEON_FP_DIV_S_Q:
15082 case TYPE_NEON_FP_DIV_D_Q:
15083 return false;
15084 default:
15085 return true;
15089 /* Target-specific selftests. */
15091 #if CHECKING_P
15093 namespace selftest {
15095 /* Selftest for the RTL loader.
15096 Verify that the RTL loader copes with a dump from
15097 print_rtx_function. This is essentially just a test that class
15098 function_reader can handle a real dump, but it also verifies
15099 that lookup_reg_by_dump_name correctly handles hard regs.
15100 The presence of hard reg names in the dump means that the test is
15101 target-specific, hence it is in this file. */
15103 static void
15104 aarch64_test_loading_full_dump ()
15106 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15108 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15110 rtx_insn *insn_1 = get_insn_by_uid (1);
15111 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15113 rtx_insn *insn_15 = get_insn_by_uid (15);
15114 ASSERT_EQ (INSN, GET_CODE (insn_15));
15115 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15117 /* Verify crtl->return_rtx. */
15118 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15119 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15120 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15123 /* Run all target-specific selftests. */
15125 static void
15126 aarch64_run_selftests (void)
15128 aarch64_test_loading_full_dump ();
15131 } // namespace selftest
15133 #endif /* #if CHECKING_P */
15135 #undef TARGET_ADDRESS_COST
15136 #define TARGET_ADDRESS_COST aarch64_address_cost
15138 /* This hook determines whether unnamed bitfields affect the alignment
15139 of the containing structure. The hook returns true if the structure
15140 should inherit the alignment requirements of an unnamed bitfield's
15141 type. */
15142 #undef TARGET_ALIGN_ANON_BITFIELD
15143 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15145 #undef TARGET_ASM_ALIGNED_DI_OP
15146 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15148 #undef TARGET_ASM_ALIGNED_HI_OP
15149 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15151 #undef TARGET_ASM_ALIGNED_SI_OP
15152 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15154 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15155 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15156 hook_bool_const_tree_hwi_hwi_const_tree_true
15158 #undef TARGET_ASM_FILE_START
15159 #define TARGET_ASM_FILE_START aarch64_start_file
15161 #undef TARGET_ASM_OUTPUT_MI_THUNK
15162 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15164 #undef TARGET_ASM_SELECT_RTX_SECTION
15165 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15167 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15168 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15170 #undef TARGET_BUILD_BUILTIN_VA_LIST
15171 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15173 #undef TARGET_CALLEE_COPIES
15174 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15176 #undef TARGET_CAN_ELIMINATE
15177 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15179 #undef TARGET_CAN_INLINE_P
15180 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15182 #undef TARGET_CANNOT_FORCE_CONST_MEM
15183 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15185 #undef TARGET_CASE_VALUES_THRESHOLD
15186 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15188 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15189 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15191 /* Only the least significant bit is used for initialization guard
15192 variables. */
15193 #undef TARGET_CXX_GUARD_MASK_BIT
15194 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15196 #undef TARGET_C_MODE_FOR_SUFFIX
15197 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15199 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15200 #undef TARGET_DEFAULT_TARGET_FLAGS
15201 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15202 #endif
15204 #undef TARGET_CLASS_MAX_NREGS
15205 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15207 #undef TARGET_BUILTIN_DECL
15208 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15210 #undef TARGET_BUILTIN_RECIPROCAL
15211 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15213 #undef TARGET_C_EXCESS_PRECISION
15214 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15216 #undef TARGET_EXPAND_BUILTIN
15217 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15219 #undef TARGET_EXPAND_BUILTIN_VA_START
15220 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15222 #undef TARGET_FOLD_BUILTIN
15223 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15225 #undef TARGET_FUNCTION_ARG
15226 #define TARGET_FUNCTION_ARG aarch64_function_arg
15228 #undef TARGET_FUNCTION_ARG_ADVANCE
15229 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15231 #undef TARGET_FUNCTION_ARG_BOUNDARY
15232 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15234 #undef TARGET_FUNCTION_ARG_PADDING
15235 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15237 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15238 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15240 #undef TARGET_FUNCTION_VALUE
15241 #define TARGET_FUNCTION_VALUE aarch64_function_value
15243 #undef TARGET_FUNCTION_VALUE_REGNO_P
15244 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15246 #undef TARGET_GIMPLE_FOLD_BUILTIN
15247 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15249 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15250 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15252 #undef TARGET_INIT_BUILTINS
15253 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15255 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15256 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15257 aarch64_ira_change_pseudo_allocno_class
15259 #undef TARGET_LEGITIMATE_ADDRESS_P
15260 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15262 #undef TARGET_LEGITIMATE_CONSTANT_P
15263 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15265 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15266 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15267 aarch64_legitimize_address_displacement
15269 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15270 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15272 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15273 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15274 aarch64_libgcc_floating_mode_supported_p
15276 #undef TARGET_MANGLE_TYPE
15277 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15279 #undef TARGET_MEMORY_MOVE_COST
15280 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15282 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15283 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15285 #undef TARGET_MUST_PASS_IN_STACK
15286 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15288 /* This target hook should return true if accesses to volatile bitfields
15289 should use the narrowest mode possible. It should return false if these
15290 accesses should use the bitfield container type. */
15291 #undef TARGET_NARROW_VOLATILE_BITFIELD
15292 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15294 #undef TARGET_OPTION_OVERRIDE
15295 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15297 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15298 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15299 aarch64_override_options_after_change
15301 #undef TARGET_OPTION_SAVE
15302 #define TARGET_OPTION_SAVE aarch64_option_save
15304 #undef TARGET_OPTION_RESTORE
15305 #define TARGET_OPTION_RESTORE aarch64_option_restore
15307 #undef TARGET_OPTION_PRINT
15308 #define TARGET_OPTION_PRINT aarch64_option_print
15310 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15311 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15313 #undef TARGET_SET_CURRENT_FUNCTION
15314 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15316 #undef TARGET_PASS_BY_REFERENCE
15317 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15319 #undef TARGET_PREFERRED_RELOAD_CLASS
15320 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15322 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15323 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15325 #undef TARGET_PROMOTED_TYPE
15326 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15328 #undef TARGET_SECONDARY_RELOAD
15329 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15331 #undef TARGET_SHIFT_TRUNCATION_MASK
15332 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15334 #undef TARGET_SETUP_INCOMING_VARARGS
15335 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15337 #undef TARGET_STRUCT_VALUE_RTX
15338 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15340 #undef TARGET_REGISTER_MOVE_COST
15341 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15343 #undef TARGET_RETURN_IN_MEMORY
15344 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15346 #undef TARGET_RETURN_IN_MSB
15347 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15349 #undef TARGET_RTX_COSTS
15350 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15352 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15353 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15355 #undef TARGET_SCHED_ISSUE_RATE
15356 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15358 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15359 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15360 aarch64_sched_first_cycle_multipass_dfa_lookahead
15362 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15363 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15364 aarch64_first_cycle_multipass_dfa_lookahead_guard
15366 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15367 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15368 aarch64_get_separate_components
15370 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15371 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15372 aarch64_components_for_bb
15374 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15375 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15376 aarch64_disqualify_components
15378 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15379 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15380 aarch64_emit_prologue_components
15382 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15383 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15384 aarch64_emit_epilogue_components
15386 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15387 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15388 aarch64_set_handled_components
15390 #undef TARGET_TRAMPOLINE_INIT
15391 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15393 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15394 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15396 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15397 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15399 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15400 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15401 aarch64_builtin_support_vector_misalignment
15403 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15404 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15406 #undef TARGET_VECTORIZE_ADD_STMT_COST
15407 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15409 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15410 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15411 aarch64_builtin_vectorization_cost
15413 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15414 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15416 #undef TARGET_VECTORIZE_BUILTINS
15417 #define TARGET_VECTORIZE_BUILTINS
15419 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15420 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15421 aarch64_builtin_vectorized_function
15423 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15424 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15425 aarch64_autovectorize_vector_sizes
15427 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15428 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15429 aarch64_atomic_assign_expand_fenv
15431 /* Section anchor support. */
15433 #undef TARGET_MIN_ANCHOR_OFFSET
15434 #define TARGET_MIN_ANCHOR_OFFSET -256
15436 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15437 byte offset; we can do much more for larger data types, but have no way
15438 to determine the size of the access. We assume accesses are aligned. */
15439 #undef TARGET_MAX_ANCHOR_OFFSET
15440 #define TARGET_MAX_ANCHOR_OFFSET 4095
15442 #undef TARGET_VECTOR_ALIGNMENT
15443 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15445 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15446 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15447 aarch64_simd_vector_alignment_reachable
15449 /* vec_perm support. */
15451 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15452 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15453 aarch64_vectorize_vec_perm_const_ok
15455 #undef TARGET_INIT_LIBFUNCS
15456 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15458 #undef TARGET_FIXED_CONDITION_CODE_REGS
15459 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15461 #undef TARGET_FLAGS_REGNUM
15462 #define TARGET_FLAGS_REGNUM CC_REGNUM
15464 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15465 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15467 #undef TARGET_ASAN_SHADOW_OFFSET
15468 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15470 #undef TARGET_LEGITIMIZE_ADDRESS
15471 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15473 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15474 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15475 aarch64_use_by_pieces_infrastructure_p
15477 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15478 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15480 #undef TARGET_CAN_USE_DOLOOP_P
15481 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15483 #undef TARGET_SCHED_ADJUST_PRIORITY
15484 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15486 #undef TARGET_SCHED_MACRO_FUSION_P
15487 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15489 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15490 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15492 #undef TARGET_SCHED_FUSION_PRIORITY
15493 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15495 #undef TARGET_UNSPEC_MAY_TRAP_P
15496 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15498 #undef TARGET_USE_PSEUDO_PIC_REG
15499 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15501 #undef TARGET_PRINT_OPERAND
15502 #define TARGET_PRINT_OPERAND aarch64_print_operand
15504 #undef TARGET_PRINT_OPERAND_ADDRESS
15505 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15507 #undef TARGET_OPTAB_SUPPORTED_P
15508 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15510 #undef TARGET_OMIT_STRUCT_RETURN_REG
15511 #define TARGET_OMIT_STRUCT_RETURN_REG true
15513 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15514 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15515 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15517 #undef TARGET_HARD_REGNO_NREGS
15518 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15519 #undef TARGET_HARD_REGNO_MODE_OK
15520 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15522 #undef TARGET_MODES_TIEABLE_P
15523 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15525 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15526 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15527 aarch64_hard_regno_call_part_clobbered
15529 #undef TARGET_CONSTANT_ALIGNMENT
15530 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
15532 #if CHECKING_P
15533 #undef TARGET_RUN_TARGET_SELFTESTS
15534 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15535 #endif /* #if CHECKING_P */
15537 struct gcc_target targetm = TARGET_INITIALIZER;
15539 #include "gt-aarch64.h"