Turn HARD_REGNO_CALL_PART_CLOBBERED into a target hook
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob: ba48b28d1d54af52a0921e9f180af863b11e24ca
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
 97    ADDRESS_SYMBOLIC
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
 294    their cost higher than memmov_cost (the actual costs are 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
 376 /* Costs for vector insn classes for Cortex-A57. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
 415 /* Costs for vector insn classes for X-Gene 1. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
 435 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
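/* The XOR works because the enumeration above lists the codes in inverse
   pairs (EQ/NE, CS/CC, MI/PL, ...), so flipping the low bit maps each code
   to its inverse; e.g. AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) yields
   AARCH64_LT.  */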
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespectively of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
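/* A quick illustration, assuming the standard AArch64 DWARF numbering used
   by AARCH64_DWARF_R0 (0), AARCH64_DWARF_SP (31) and AARCH64_DWARF_V0 (64):
   x5 maps to 5, sp to 31 and v3 to 67; anything else (for example the
   condition flags) gets the "no DWARF register" value
   DWARF_FRAME_REGISTERS.  */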
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
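/* Roughly speaking, this is what gives arrays of 2-4 Advanced SIMD vectors
   the large integer modes (OImode, CImode, XImode) used for the ld2/st2,
   ld3/st3 and ld4/st4 style operations; see aarch64_vect_struct_mode_p
   above.  */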
1070 /* Implement HARD_REGNO_NREGS. */
1072 int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
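/* For example, with UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8 (the
   AArch64 values), a 16-byte TImode quantity needs a single FP/SIMD
   register but two general-purpose registers.  */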
1086 /* Implement HARD_REGNO_MODE_OK. */
1088 int
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return 1;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return
1110 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111 else
1112 return 1;
1115 return 0;
1118 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1119 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1120 clobbers the top 64 bits when restoring the bottom 64 bits. */
1122 static bool
1123 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1125 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
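/* This matches AAPCS64, where a callee preserves only the low 64 bits of
   V8-V15: any value wider than 8 bytes that lives in an FP/SIMD register
   must therefore be assumed partially clobbered across a call.  A minimal
   sketch of how the new hook is wired up, assuming the usual target-macro
   idiom used for the other hooks in this file:

     #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
     #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
       aarch64_hard_regno_call_part_clobbered  */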
1128 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1129 machine_mode
1130 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1131 machine_mode mode)
1133 /* Handle modes that fit within single registers. */
1134 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1136 if (GET_MODE_SIZE (mode) >= 4)
1137 return mode;
1138 else
1139 return SImode;
1141 /* Fall back to generic for multi-reg and very large modes. */
1142 else
1143 return choose_hard_reg_mode (regno, nregs, false);
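/* In other words: QImode and HImode values are widened to SImode when saved
   around a call, anything from 4 to 16 bytes that fits in one register
   keeps its own mode, and multi-register values fall back to the generic
   choice.  */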
1146 /* Return true if calls to DECL should be treated as
1147    long-calls (i.e. called via a register). */
1148 static bool
1149 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1151 return false;
1154 /* Return true if calls to symbol-ref SYM should be treated as
1155    long-calls (i.e. called via a register). */
1156 bool
1157 aarch64_is_long_call_p (rtx sym)
1159 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1162 /* Return true if calls to symbol-ref SYM should not go through
1163 plt stubs. */
1165 bool
1166 aarch64_is_noplt_call_p (rtx sym)
1168 const_tree decl = SYMBOL_REF_DECL (sym);
1170 if (flag_pic
1171 && decl
1172 && (!flag_plt
1173 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1174 && !targetm.binds_local_p (decl))
1175 return true;
1177 return false;
1180 /* Return true if the offsets to a zero/sign-extract operation
1181 represent an expression that matches an extend operation. The
1182    operands represent the parameters from
1184 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1185 bool
1186 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1187 rtx extract_imm)
1189 HOST_WIDE_INT mult_val, extract_val;
1191 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1192 return false;
1194 mult_val = INTVAL (mult_imm);
1195 extract_val = INTVAL (extract_imm);
1197 if (extract_val > 8
1198 && extract_val < GET_MODE_BITSIZE (mode)
1199 && exact_log2 (extract_val & ~7) > 0
1200 && (extract_val & 7) <= 4
1201 && mult_val == (1 << (extract_val & 7)))
1202 return true;
1204 return false;
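/* A worked example: MULT_IMM == 4 and EXTRACT_IMM == 34 pass all of the
   checks above, because extracting the low 34 bits of (reg * 4) is the
   same as zero-extending the low 32 bits of reg and shifting the result
   left by 2, i.e. exactly the extend-plus-shift form described in the
   comment above.  */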
1207 /* Emit an insn that's a simple single-set. Both the operands must be
1208 known to be valid. */
1209 inline static rtx_insn *
1210 emit_set_insn (rtx x, rtx y)
1212 return emit_insn (gen_rtx_SET (x, y));
1215 /* X and Y are two things to compare using CODE. Emit the compare insn and
1216 return the rtx for register 0 in the proper mode. */
1217 rtx
1218 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1220 machine_mode mode = SELECT_CC_MODE (code, x, y);
1221 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1223 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1224 return cc_reg;
1227 /* Build the SYMBOL_REF for __tls_get_addr. */
1229 static GTY(()) rtx tls_get_addr_libfunc;
1231 rtx
1232 aarch64_tls_get_addr (void)
1234 if (!tls_get_addr_libfunc)
1235 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1236 return tls_get_addr_libfunc;
1239 /* Return the TLS model to use for ADDR. */
1241 static enum tls_model
1242 tls_symbolic_operand_type (rtx addr)
1244 enum tls_model tls_kind = TLS_MODEL_NONE;
1245 rtx sym, addend;
1247 if (GET_CODE (addr) == CONST)
1249 split_const (addr, &sym, &addend);
1250 if (GET_CODE (sym) == SYMBOL_REF)
1251 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1253 else if (GET_CODE (addr) == SYMBOL_REF)
1254 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1256 return tls_kind;
1259 /* We allow LO_SUM rtxes in our legitimate addresses, so that combine
1260    can take care of combining addresses where necessary, but for
1261    generation purposes we generate the address
1262    as:
1263    RTL                                Absolute
1264    tmp = hi (symbol_ref);             adrp  x1, foo
1265    dest = lo_sum (tmp, symbol_ref);   add   dest, x1, :lo_12:foo
1268    PIC                                TLS
1269    adrp x1, :got:foo                  adrp  tmp, :tlsgd:foo
1270    ldr  x1, [:got_lo12:foo]           add   dest, tmp, :tlsgd_lo12:foo
1271                                       bl    __tls_get_addr
1274 Load TLS symbol, depending on TLS mechanism and TLS access model.
1276 Global Dynamic - Traditional TLS:
1277 adrp tmp, :tlsgd:imm
1278 add dest, tmp, #:tlsgd_lo12:imm
1279 bl __tls_get_addr
1281 Global Dynamic - TLS Descriptors:
1282 adrp dest, :tlsdesc:imm
1283 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1284 add dest, dest, #:tlsdesc_lo12:imm
1285 blr tmp
1286 mrs tp, tpidr_el0
1287 add dest, dest, tp
1289 Initial Exec:
1290 mrs tp, tpidr_el0
1291 adrp tmp, :gottprel:imm
1292 ldr dest, [tmp, #:gottprel_lo12:imm]
1293 add dest, dest, tp
1295 Local Exec:
1296 mrs tp, tpidr_el0
1297 add t0, tp, #:tprel_hi12:imm, lsl #12
1298 add t0, t0, #:tprel_lo12_nc:imm
1301 static void
1302 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1303 enum aarch64_symbol_type type)
1305 switch (type)
1307 case SYMBOL_SMALL_ABSOLUTE:
1309 /* In ILP32, the mode of dest can be either SImode or DImode. */
1310 rtx tmp_reg = dest;
1311 machine_mode mode = GET_MODE (dest);
1313 gcc_assert (mode == Pmode || mode == ptr_mode);
1315 if (can_create_pseudo_p ())
1316 tmp_reg = gen_reg_rtx (mode);
1318 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1319 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1320 return;
1323 case SYMBOL_TINY_ABSOLUTE:
1324 emit_insn (gen_rtx_SET (dest, imm));
1325 return;
1327 case SYMBOL_SMALL_GOT_28K:
1329 machine_mode mode = GET_MODE (dest);
1330 rtx gp_rtx = pic_offset_table_rtx;
1331 rtx insn;
1332 rtx mem;
1334 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1335    here before RTL expansion.  The tree IVOPTS pass will generate RTL
1336    patterns to decide rtx costs, in which case pic_offset_table_rtx is
1337    not initialized.  In that case there is no need to generate the first
1338    adrp instruction, as the final cost for global variable access is
1339    one instruction. */
1340 if (gp_rtx != NULL)
1342 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1343    use the page base as the GOT base, the first page may be wasted;
1344    in the worst scenario there is only 28K of space for the GOT).
1346    The generated instruction sequence for accessing a global variable is:
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1351    Only one instruction is needed.  But we must initialize
1352    pic_offset_table_rtx properly.  We generate an initialization insn for
1353    every global access, and allow CSE to remove all redundant ones.
1355    The final instruction sequence will look like the following
1356    for multiple global variable accesses.
1358 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1360 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1361 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1362 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1363 ... */
1365 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1366 crtl->uses_pic_offset_table = 1;
1367 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1369 if (mode != GET_MODE (gp_rtx))
1370 gp_rtx = gen_lowpart (mode, gp_rtx);
1374 if (mode == ptr_mode)
1376 if (mode == DImode)
1377 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1378 else
1379 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1381 mem = XVECEXP (SET_SRC (insn), 0, 0);
1383 else
1385 gcc_assert (mode == Pmode);
1387 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1388 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1391 /* The operand is expected to be a MEM.  Whenever the related insn
1392    pattern changes, the code above which calculates MEM should be
1393    updated. */
1394 gcc_assert (GET_CODE (mem) == MEM);
1395 MEM_READONLY_P (mem) = 1;
1396 MEM_NOTRAP_P (mem) = 1;
1397 emit_insn (insn);
1398 return;
1401 case SYMBOL_SMALL_GOT_4G:
1403 /* In ILP32, the mode of dest can be either SImode or DImode,
1404 while the got entry is always of SImode size. The mode of
1405 dest depends on how dest is used: if dest is assigned to a
1406 pointer (e.g. in the memory), it has SImode; it may have
1407    DImode if dest is dereferenced to access the memory.
1408 This is why we have to handle three different ldr_got_small
1409 patterns here (two patterns for ILP32). */
1411 rtx insn;
1412 rtx mem;
1413 rtx tmp_reg = dest;
1414 machine_mode mode = GET_MODE (dest);
1416 if (can_create_pseudo_p ())
1417 tmp_reg = gen_reg_rtx (mode);
1419 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1420 if (mode == ptr_mode)
1422 if (mode == DImode)
1423 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1424 else
1425 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1427 mem = XVECEXP (SET_SRC (insn), 0, 0);
1429 else
1431 gcc_assert (mode == Pmode);
1433 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1434 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1437 gcc_assert (GET_CODE (mem) == MEM);
1438 MEM_READONLY_P (mem) = 1;
1439 MEM_NOTRAP_P (mem) = 1;
1440 emit_insn (insn);
1441 return;
1444 case SYMBOL_SMALL_TLSGD:
1446 rtx_insn *insns;
1447 machine_mode mode = GET_MODE (dest);
1448 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1450 start_sequence ();
1451 if (TARGET_ILP32)
1452 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1453 else
1454 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1455 insns = get_insns ();
1456 end_sequence ();
1458 RTL_CONST_CALL_P (insns) = 1;
1459 emit_libcall_block (insns, dest, result, imm);
1460 return;
1463 case SYMBOL_SMALL_TLSDESC:
1465 machine_mode mode = GET_MODE (dest);
1466 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1467 rtx tp;
1469 gcc_assert (mode == Pmode || mode == ptr_mode);
1471 /* In ILP32, the got entry is always of SImode size. Unlike
1472 small GOT, the dest is fixed at reg 0. */
1473 if (TARGET_ILP32)
1474 emit_insn (gen_tlsdesc_small_si (imm));
1475 else
1476 emit_insn (gen_tlsdesc_small_di (imm));
1477 tp = aarch64_load_tp (NULL);
1479 if (mode != Pmode)
1480 tp = gen_lowpart (mode, tp);
1482 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1483 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1484 return;
1487 case SYMBOL_SMALL_TLSIE:
1489 /* In ILP32, the mode of dest can be either SImode or DImode,
1490 while the got entry is always of SImode size. The mode of
1491 dest depends on how dest is used: if dest is assigned to a
1492 pointer (e.g. in the memory), it has SImode; it may have
1493    DImode if dest is dereferenced to access the memory.
1494 This is why we have to handle three different tlsie_small
1495 patterns here (two patterns for ILP32). */
1496 machine_mode mode = GET_MODE (dest);
1497 rtx tmp_reg = gen_reg_rtx (mode);
1498 rtx tp = aarch64_load_tp (NULL);
1500 if (mode == ptr_mode)
1502 if (mode == DImode)
1503 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1504 else
1506 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1507 tp = gen_lowpart (mode, tp);
1510 else
1512 gcc_assert (mode == Pmode);
1513 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1516 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 case SYMBOL_TLSLE12:
1522 case SYMBOL_TLSLE24:
1523 case SYMBOL_TLSLE32:
1524 case SYMBOL_TLSLE48:
1526 machine_mode mode = GET_MODE (dest);
1527 rtx tp = aarch64_load_tp (NULL);
1529 if (mode != Pmode)
1530 tp = gen_lowpart (mode, tp);
1532 switch (type)
1534 case SYMBOL_TLSLE12:
1535 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1536 (dest, tp, imm));
1537 break;
1538 case SYMBOL_TLSLE24:
1539 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1540 (dest, tp, imm));
1541 break;
1542 case SYMBOL_TLSLE32:
1543 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1544 (dest, imm));
1545 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1546 (dest, dest, tp));
1547 break;
1548 case SYMBOL_TLSLE48:
1549 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1550 (dest, imm));
1551 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1552 (dest, dest, tp));
1553 break;
1554 default:
1555 gcc_unreachable ();
1558 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1559 return;
1562 case SYMBOL_TINY_GOT:
1563 emit_insn (gen_ldr_got_tiny (dest, imm));
1564 return;
1566 case SYMBOL_TINY_TLSIE:
1568 machine_mode mode = GET_MODE (dest);
1569 rtx tp = aarch64_load_tp (NULL);
1571 if (mode == ptr_mode)
1573 if (mode == DImode)
1574 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1575 else
1577 tp = gen_lowpart (mode, tp);
1578 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1581 else
1583 gcc_assert (mode == Pmode);
1584 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1587 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1588 return;
1591 default:
1592 gcc_unreachable ();
1596 /* Emit a move from SRC to DEST. Assume that the move expanders can
1597 handle all moves if !can_create_pseudo_p (). The distinction is
1598 important because, unlike emit_move_insn, the move expanders know
1599 how to force Pmode objects into the constant pool even when the
1600 constant pool address is not itself legitimate. */
1601 static rtx
1602 aarch64_emit_move (rtx dest, rtx src)
1604 return (can_create_pseudo_p ()
1605 ? emit_move_insn (dest, src)
1606 : emit_move_insn_1 (dest, src));
1609 /* Split a 128-bit move operation into two 64-bit move operations,
1610 taking care to handle partial overlap of register to register
1611 copies. Special cases are needed when moving between GP regs and
1612 FP regs. SRC can be a register, constant or memory; DST a register
1613 or memory. If either operand is memory it must not have any side
1614 effects. */
1615 void
1616 aarch64_split_128bit_move (rtx dst, rtx src)
1618 rtx dst_lo, dst_hi;
1619 rtx src_lo, src_hi;
1621 machine_mode mode = GET_MODE (dst);
1623 gcc_assert (mode == TImode || mode == TFmode);
1624 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1625 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1627 if (REG_P (dst) && REG_P (src))
1629 int src_regno = REGNO (src);
1630 int dst_regno = REGNO (dst);
1632 /* Handle FP <-> GP regs. */
1633 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1635 src_lo = gen_lowpart (word_mode, src);
1636 src_hi = gen_highpart (word_mode, src);
1638 if (mode == TImode)
1640 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1641 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1643 else
1645 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1646 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1648 return;
1650 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1652 dst_lo = gen_lowpart (word_mode, dst);
1653 dst_hi = gen_highpart (word_mode, dst);
1655 if (mode == TImode)
1657 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1658 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1660 else
1662 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1663 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1665 return;
1669 dst_lo = gen_lowpart (word_mode, dst);
1670 dst_hi = gen_highpart (word_mode, dst);
1671 src_lo = gen_lowpart (word_mode, src);
1672 src_hi = gen_highpart_mode (word_mode, mode, src);
1674 /* At most one pairing may overlap. */
1675 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1677 aarch64_emit_move (dst_hi, src_hi);
1678 aarch64_emit_move (dst_lo, src_lo);
1680 else
1682 aarch64_emit_move (dst_lo, src_lo);
1683 aarch64_emit_move (dst_hi, src_hi);
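/* Example of the overlap case handled above: splitting a TImode copy from
   {x0,x1} to {x1,x2} must move the high halves first, since dst_lo (x1)
   is the same register as src_hi (x1).  */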
1687 bool
1688 aarch64_split_128bit_move_p (rtx dst, rtx src)
1690 return (! REG_P (src)
1691 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1694 /* Split a complex SIMD combine. */
1696 void
1697 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1699 machine_mode src_mode = GET_MODE (src1);
1700 machine_mode dst_mode = GET_MODE (dst);
1702 gcc_assert (VECTOR_MODE_P (dst_mode));
1703 gcc_assert (register_operand (dst, dst_mode)
1704 && register_operand (src1, src_mode)
1705 && register_operand (src2, src_mode));
1707 rtx (*gen) (rtx, rtx, rtx);
1709 switch (src_mode)
1711 case E_V8QImode:
1712 gen = gen_aarch64_simd_combinev8qi;
1713 break;
1714 case E_V4HImode:
1715 gen = gen_aarch64_simd_combinev4hi;
1716 break;
1717 case E_V2SImode:
1718 gen = gen_aarch64_simd_combinev2si;
1719 break;
1720 case E_V4HFmode:
1721 gen = gen_aarch64_simd_combinev4hf;
1722 break;
1723 case E_V2SFmode:
1724 gen = gen_aarch64_simd_combinev2sf;
1725 break;
1726 case E_DImode:
1727 gen = gen_aarch64_simd_combinedi;
1728 break;
1729 case E_DFmode:
1730 gen = gen_aarch64_simd_combinedf;
1731 break;
1732 default:
1733 gcc_unreachable ();
1736 emit_insn (gen (dst, src1, src2));
1737 return;
1740 /* Split a complex SIMD move. */
1742 void
1743 aarch64_split_simd_move (rtx dst, rtx src)
1745 machine_mode src_mode = GET_MODE (src);
1746 machine_mode dst_mode = GET_MODE (dst);
1748 gcc_assert (VECTOR_MODE_P (dst_mode));
1750 if (REG_P (dst) && REG_P (src))
1752 rtx (*gen) (rtx, rtx);
1754 gcc_assert (VECTOR_MODE_P (src_mode));
1756 switch (src_mode)
1758 case E_V16QImode:
1759 gen = gen_aarch64_split_simd_movv16qi;
1760 break;
1761 case E_V8HImode:
1762 gen = gen_aarch64_split_simd_movv8hi;
1763 break;
1764 case E_V4SImode:
1765 gen = gen_aarch64_split_simd_movv4si;
1766 break;
1767 case E_V2DImode:
1768 gen = gen_aarch64_split_simd_movv2di;
1769 break;
1770 case E_V8HFmode:
1771 gen = gen_aarch64_split_simd_movv8hf;
1772 break;
1773 case E_V4SFmode:
1774 gen = gen_aarch64_split_simd_movv4sf;
1775 break;
1776 case E_V2DFmode:
1777 gen = gen_aarch64_split_simd_movv2df;
1778 break;
1779 default:
1780 gcc_unreachable ();
1783 emit_insn (gen (dst, src));
1784 return;
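/* Return true if X (of mode XMODE) equals the result of zero-extending
   constant Y from YMODE to XMODE.  */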
1788 bool
1789 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1790 machine_mode ymode, rtx y)
1792 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1793 gcc_assert (r != NULL);
1794 return rtx_equal_p (x, r);
1798 static rtx
1799 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1801 if (can_create_pseudo_p ())
1802 return force_reg (mode, value);
1803 else
1805 x = aarch64_emit_move (x, value);
1806 return x;
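/* Return an rtx for REG + OFFSET in MODE.  If OFFSET is not a valid
   plus-immediate, first load it into TEMP (via aarch64_force_temporary)
   and add that instead.  */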
1811 static rtx
1812 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1814 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1816 rtx high;
1817 /* Load the full offset into a register. This
1818 might be improvable in the future. */
1819 high = GEN_INT (offset);
1820 offset = 0;
1821 high = aarch64_force_temporary (mode, temp, high);
1822 reg = aarch64_force_temporary (mode, temp,
1823 gen_rtx_PLUS (mode, high, reg));
1825 return plus_constant (mode, reg, offset);
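/* Work out how many instructions are needed to move immediate IMM of mode
   MODE into DEST, emitting them when GENERATE is true; the return value is
   the instruction count.  For instance, 0x12345678 in DImode costs two
   instructions: a MOV of 0x5678 followed by a MOVK of 0x1234 shifted left
   by 16 (the "(val >> 32) == 0" path below).  */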
1828 static int
1829 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1830 machine_mode mode)
1832 int i;
1833 unsigned HOST_WIDE_INT val, val2, mask;
1834 int one_match, zero_match;
1835 int num_insns;
1837 val = INTVAL (imm);
1839 if (aarch64_move_imm (val, mode))
1841 if (generate)
1842 emit_insn (gen_rtx_SET (dest, imm));
1843 return 1;
1846 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1847 (with XXXX non-zero). In that case check to see if the move can be done in
1848 a smaller mode. */
1849 val2 = val & 0xffffffff;
1850 if (mode == DImode
1851 && aarch64_move_imm (val2, SImode)
1852 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1854 if (generate)
1855 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1857 /* Check if we have to emit a second instruction by checking to see
1858 if any of the upper 32 bits of the original DI mode value is set. */
1859 if (val == val2)
1860 return 1;
1862 i = (val >> 48) ? 48 : 32;
1864 if (generate)
1865 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1866 GEN_INT ((val >> i) & 0xffff)));
1868 return 2;
1871 if ((val >> 32) == 0 || mode == SImode)
1873 if (generate)
1875 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1876 if (mode == SImode)
1877 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1878 GEN_INT ((val >> 16) & 0xffff)));
1879 else
1880 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1881 GEN_INT ((val >> 16) & 0xffff)));
1883 return 2;
1886 /* Remaining cases are all for DImode. */
1888 mask = 0xffff;
1889 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1890 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1891 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1892 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1894 if (zero_match != 2 && one_match != 2)
1896 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1897 For a 64-bit bitmask try whether changing 16 bits to all ones or
1898 zeroes creates a valid bitmask. To check any repeated bitmask,
1899 try using 16 bits from the other 32-bit half of val. */
1901 for (i = 0; i < 64; i += 16, mask <<= 16)
1903 val2 = val & ~mask;
1904 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1905 break;
1906 val2 = val | mask;
1907 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1908 break;
1909 val2 = val2 & ~mask;
1910 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1911 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1912 break;
1914 if (i != 64)
1916 if (generate)
1918 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1919 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1920 GEN_INT ((val >> i) & 0xffff)));
1922 return 2;
1926 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1927 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1928 otherwise skip zero bits. */
1930 num_insns = 1;
1931 mask = 0xffff;
1932 val2 = one_match > zero_match ? ~val : val;
1933 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1935 if (generate)
1936 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1937 ? (val | ~(mask << i))
1938 : (val & (mask << i)))));
1939 for (i += 16; i < 64; i += 16)
1941 if ((val2 & (mask << i)) == 0)
1942 continue;
1943 if (generate)
1944 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1945 GEN_INT ((val >> i) & 0xffff)));
1946 num_insns ++;
1949 return num_insns;
1953 void
1954 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1956 machine_mode mode = GET_MODE (dest);
1958 gcc_assert (mode == SImode || mode == DImode);
1960 /* Check on what type of symbol it is. */
1961 if (GET_CODE (imm) == SYMBOL_REF
1962 || GET_CODE (imm) == LABEL_REF
1963 || GET_CODE (imm) == CONST)
1965 rtx mem, base, offset;
1966 enum aarch64_symbol_type sty;
1968 /* If we have (const (plus symbol offset)), separate out the offset
1969 before we start classifying the symbol. */
1970 split_const (imm, &base, &offset);
1972 sty = aarch64_classify_symbol (base, offset);
1973 switch (sty)
1975 case SYMBOL_FORCE_TO_MEM:
1976 if (offset != const0_rtx
1977 && targetm.cannot_force_const_mem (mode, imm))
1979 gcc_assert (can_create_pseudo_p ());
1980 base = aarch64_force_temporary (mode, dest, base);
1981 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1982 aarch64_emit_move (dest, base);
1983 return;
1986 mem = force_const_mem (ptr_mode, imm);
1987 gcc_assert (mem);
1989 /* If we aren't generating PC relative literals, then
1990 we need to expand the literal pool access carefully.
1991 This is something that needs to be done in a number
1992 of places, so could well live as a separate function. */
1993 if (!aarch64_pcrelative_literal_loads)
1995 gcc_assert (can_create_pseudo_p ());
1996 base = gen_reg_rtx (ptr_mode);
1997 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1998 if (ptr_mode != Pmode)
1999 base = convert_memory_address (Pmode, base);
2000 mem = gen_rtx_MEM (ptr_mode, base);
2003 if (mode != ptr_mode)
2004 mem = gen_rtx_ZERO_EXTEND (mode, mem);
2006 emit_insn (gen_rtx_SET (dest, mem));
2008 return;
2010 case SYMBOL_SMALL_TLSGD:
2011 case SYMBOL_SMALL_TLSDESC:
2012 case SYMBOL_SMALL_TLSIE:
2013 case SYMBOL_SMALL_GOT_28K:
2014 case SYMBOL_SMALL_GOT_4G:
2015 case SYMBOL_TINY_GOT:
2016 case SYMBOL_TINY_TLSIE:
2017 if (offset != const0_rtx)
2019 gcc_assert(can_create_pseudo_p ());
2020 base = aarch64_force_temporary (mode, dest, base);
2021 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2022 aarch64_emit_move (dest, base);
2023 return;
2025 /* FALLTHRU */
2027 case SYMBOL_SMALL_ABSOLUTE:
2028 case SYMBOL_TINY_ABSOLUTE:
2029 case SYMBOL_TLSLE12:
2030 case SYMBOL_TLSLE24:
2031 case SYMBOL_TLSLE32:
2032 case SYMBOL_TLSLE48:
2033 aarch64_load_symref_appropriately (dest, imm, sty);
2034 return;
2036 default:
2037 gcc_unreachable ();
2041 if (!CONST_INT_P (imm))
2043 if (GET_CODE (imm) == HIGH)
2044 emit_insn (gen_rtx_SET (dest, imm));
2045 else
2047 rtx mem = force_const_mem (mode, imm);
2048 gcc_assert (mem);
2049 emit_insn (gen_rtx_SET (dest, mem));
2052 return;
2055 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2058 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2059 temporary value if necessary. FRAME_RELATED_P should be true if
2060 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2061 to the generated instructions. If SCRATCHREG is known to hold
2062 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2063 immediate again.
2065 Since this function may be used to adjust the stack pointer, we must
2066 ensure that it cannot cause transient stack deallocation (for example
2067 by first incrementing SP and then decrementing when adjusting by a
2068 large immediate). */
2070 static void
2071 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2072 HOST_WIDE_INT delta, bool frame_related_p,
2073 bool emit_move_imm)
2075 HOST_WIDE_INT mdelta = abs_hwi (delta);
2076 rtx this_rtx = gen_rtx_REG (mode, regnum);
2077 rtx_insn *insn;
2079 if (!mdelta)
2080 return;
2082 /* Single instruction adjustment. */
2083 if (aarch64_uimm12_shift (mdelta))
2085 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2086 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2087 return;
2090 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2091 Only do this if MDELTA cannot be handled by a single move immediate,
2092 since adjusting with a move is better in that case. */
2093 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2095 HOST_WIDE_INT low_off = mdelta & 0xfff;
2097 low_off = delta < 0 ? -low_off : low_off;
2098 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2099 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2100 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2101 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2102 return;
2105 /* Emit a move immediate if required and an addition/subtraction. */
2106 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2107 if (emit_move_imm)
2108 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2109 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2110 : gen_add2_insn (this_rtx, scratch_rtx));
2111 if (frame_related_p)
2113 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2114 rtx adj = plus_constant (mode, this_rtx, delta);
2115 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
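/* A minimal standalone sketch (not compiler code) of the two-step split used
   above for adjustments below 24 bits: the low 12 bits are applied first and
   the remainder second, so each step fits a 12-bit (optionally shifted)
   immediate and the adjustment is applied monotonically, avoiding transient
   stack deallocation.  The helper name is hypothetical.  */
#if 0
static void
example_split_delta (long long delta, long long *step1, long long *step2)
{
  long long mdelta = delta < 0 ? -delta : delta;
  long long low_off = mdelta & 0xfff;
  low_off = delta < 0 ? -low_off : low_off;
  *step1 = low_off;             /* e.g. delta == -0x12345 gives step1 == -0x345 */
  *step2 = delta - low_off;     /* ... and step2 == -0x12000.  */
}
#endif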
2119 static inline void
2120 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2121 HOST_WIDE_INT delta)
2123 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2126 static inline void
2127 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2129 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2130 true, emit_move_imm);
2133 static inline void
2134 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2136 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2137 frame_related_p, true);
2140 static bool
2141 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2142 tree exp ATTRIBUTE_UNUSED)
2144 /* Currently, always true. */
2145 return true;
2148 /* Implement TARGET_PASS_BY_REFERENCE. */
2150 static bool
2151 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2152 machine_mode mode,
2153 const_tree type,
2154 bool named ATTRIBUTE_UNUSED)
2156 HOST_WIDE_INT size;
2157 machine_mode dummymode;
2158 int nregs;
2160 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2161 size = (mode == BLKmode && type)
2162 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2164 /* Aggregates are passed by reference based on their size. */
2165 if (type && AGGREGATE_TYPE_P (type))
2167 size = int_size_in_bytes (type);
2170 /* Variable sized arguments are always passed by reference. */
2171 if (size < 0)
2172 return true;
2174 /* Can this be a candidate to be passed in fp/simd register(s)? */
2175 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2176 &dummymode, &nregs,
2177 NULL))
2178 return false;
2180 /* Arguments which are variable sized or larger than 2 registers are
2181 passed by reference unless they are a homogeneous floating-point
2182 aggregate. */
2183 return size > 2 * UNITS_PER_WORD;
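/* A minimal standalone sketch (not compiler code) of the decision above:
   variable-sized arguments go by reference, fp/simd candidates (including
   HFAs and HVAs) never do, and anything else does once it exceeds two
   8-byte registers.  The helper name is hypothetical.  */
#if 0
static int
example_pass_by_reference_p (long long size_in_bytes, int fp_candidate_p)
{
  if (size_in_bytes < 0)                /* variable-sized */
    return 1;
  if (fp_candidate_p)                   /* HFA/HVA or scalar fp/simd */
    return 0;
  return size_in_bytes > 2 * 8;         /* larger than two X registers */
}
#endif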
2186 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2187 static bool
2188 aarch64_return_in_msb (const_tree valtype)
2190 machine_mode dummy_mode;
2191 int dummy_int;
2193 /* Never happens in little-endian mode. */
2194 if (!BYTES_BIG_ENDIAN)
2195 return false;
2197 /* Only composite types smaller than or equal to 16 bytes can
2198 be potentially returned in registers. */
2199 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2200 || int_size_in_bytes (valtype) <= 0
2201 || int_size_in_bytes (valtype) > 16)
2202 return false;
2204 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2205 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2206 is always passed/returned in the least significant bits of fp/simd
2207 register(s). */
2208 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2209 &dummy_mode, &dummy_int, NULL))
2210 return false;
2212 return true;
2215 /* Implement TARGET_FUNCTION_VALUE.
2216 Define how to find the value returned by a function. */
2218 static rtx
2219 aarch64_function_value (const_tree type, const_tree func,
2220 bool outgoing ATTRIBUTE_UNUSED)
2222 machine_mode mode;
2223 int unsignedp;
2224 int count;
2225 machine_mode ag_mode;
2227 mode = TYPE_MODE (type);
2228 if (INTEGRAL_TYPE_P (type))
2229 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2231 if (aarch64_return_in_msb (type))
2233 HOST_WIDE_INT size = int_size_in_bytes (type);
2235 if (size % UNITS_PER_WORD != 0)
2237 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2238 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2242 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2243 &ag_mode, &count, NULL))
2245 if (!aarch64_composite_type_p (type, mode))
2247 gcc_assert (count == 1 && mode == ag_mode);
2248 return gen_rtx_REG (mode, V0_REGNUM);
2250 else
2252 int i;
2253 rtx par;
2255 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2256 for (i = 0; i < count; i++)
2258 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2259 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2260 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2261 XVECEXP (par, 0, i) = tmp;
2263 return par;
2266 else
2267 return gen_rtx_REG (mode, R0_REGNUM);
2270 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2271 Return true if REGNO is the number of a hard register in which the value
2272 of a called function may come back. */
2274 static bool
2275 aarch64_function_value_regno_p (const unsigned int regno)
2277 /* A maximum of 16 bytes can be returned in the general registers. Examples
2278 of 16-byte return values are: 128-bit integers and 16-byte small
2279 structures (excluding homogeneous floating-point aggregates). */
2280 if (regno == R0_REGNUM || regno == R1_REGNUM)
2281 return true;
2283 /* Up to four fp/simd registers can return a function value, e.g. a
2284 homogeneous floating-point aggregate having four members. */
2285 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2286 return TARGET_FLOAT;
2288 return false;
2291 /* Implement TARGET_RETURN_IN_MEMORY.
2293 If the type T of the result of a function is such that
2294 void func (T arg)
2295 would require that arg be passed as a value in a register (or set of
2296 registers) according to the parameter passing rules, then the result
2297 is returned in the same registers as would be used for such an
2298 argument. */
2300 static bool
2301 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2303 HOST_WIDE_INT size;
2304 machine_mode ag_mode;
2305 int count;
2307 if (!AGGREGATE_TYPE_P (type)
2308 && TREE_CODE (type) != COMPLEX_TYPE
2309 && TREE_CODE (type) != VECTOR_TYPE)
2310 /* Simple scalar types are always returned in registers. */
2311 return false;
2313 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2314 type,
2315 &ag_mode,
2316 &count,
2317 NULL))
2318 return false;
2320 /* Types larger than 2 registers are returned in memory. */
2321 size = int_size_in_bytes (type);
2322 return (size < 0 || size > 2 * UNITS_PER_WORD);
2325 static bool
2326 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2327 const_tree type, int *nregs)
2329 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2330 return aarch64_vfp_is_call_or_return_candidate (mode,
2331 type,
2332 &pcum->aapcs_vfp_rmode,
2333 nregs,
2334 NULL);
2337 /* Given MODE and TYPE of a function argument, return the alignment in
2338 bits. The idea is to suppress any stronger alignment requested by
2339 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2340 This is a helper function for local use only. */
2342 static unsigned int
2343 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2345 if (!type)
2346 return GET_MODE_ALIGNMENT (mode);
2348 if (integer_zerop (TYPE_SIZE (type)))
2349 return 0;
2351 gcc_assert (TYPE_MODE (type) == mode);
2353 if (!AGGREGATE_TYPE_P (type))
2354 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2356 if (TREE_CODE (type) == ARRAY_TYPE)
2357 return TYPE_ALIGN (TREE_TYPE (type));
2359 unsigned int alignment = 0;
2360 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2361 if (TREE_CODE (field) == FIELD_DECL)
2362 alignment = std::max (alignment, DECL_ALIGN (field));
2364 return alignment;
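/* A minimal standalone illustration (not compiler code) of the aggregate rule
   above: the alignment used for argument layout is the largest alignment
   among the members.  The struct below is hypothetical.  */
#if 0
#include <stdalign.h>
struct example_arg { char c; long long x; };    /* member alignments 1 and 8 */
_Static_assert (alignof (struct example_arg) == 8,
                "layout alignment follows the most-aligned member");
#endif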
2367 /* Layout a function argument according to the AAPCS64 rules. The rule
2368 numbers refer to the rule numbers in the AAPCS64. */
2370 static void
2371 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2372 const_tree type,
2373 bool named ATTRIBUTE_UNUSED)
2375 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2376 int ncrn, nvrn, nregs;
2377 bool allocate_ncrn, allocate_nvrn;
2378 HOST_WIDE_INT size;
2380 /* We need to do this once per argument. */
2381 if (pcum->aapcs_arg_processed)
2382 return;
2384 pcum->aapcs_arg_processed = true;
2386 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2387 size
2388 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2389 UNITS_PER_WORD);
2391 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2392 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2393 mode,
2394 type,
2395 &nregs);
2397 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2398 The following code thus handles passing by SIMD/FP registers first. */
2400 nvrn = pcum->aapcs_nvrn;
2402 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2403 and homogeneous short-vector aggregates (HVA). */
2404 if (allocate_nvrn)
2406 if (!TARGET_FLOAT)
2407 aarch64_err_no_fpadvsimd (mode, "argument");
2409 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2411 pcum->aapcs_nextnvrn = nvrn + nregs;
2412 if (!aarch64_composite_type_p (type, mode))
2414 gcc_assert (nregs == 1);
2415 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2417 else
2419 rtx par;
2420 int i;
2421 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2422 for (i = 0; i < nregs; i++)
2424 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2425 V0_REGNUM + nvrn + i);
2426 tmp = gen_rtx_EXPR_LIST
2427 (VOIDmode, tmp,
2428 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2429 XVECEXP (par, 0, i) = tmp;
2431 pcum->aapcs_reg = par;
2433 return;
2435 else
2437 /* C.3 NSRN is set to 8. */
2438 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2439 goto on_stack;
2443 ncrn = pcum->aapcs_ncrn;
2444 nregs = size / UNITS_PER_WORD;
2446 /* C.6 - C.9, though the sign and zero extension semantics are
2447 handled elsewhere. This is the case where the argument fits
2448 entirely in general registers. */
2449 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2452 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2454 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2455 rounded up to the next even number. */
2456 if (nregs == 2
2457 && ncrn % 2
2458 /* The == 16 * BITS_PER_UNIT comparison is used instead of
2459 >= 16 * BITS_PER_UNIT because for alignments greater than
2460 16 * BITS_PER_UNIT nregs would be greater than 2, and the argument
2461 would then be passed by reference rather than by value. */
2462 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2464 ++ncrn;
2465 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2468 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2469 A reg is still generated for it, but the caller should be smart
2470 enough not to use it. */
2471 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2472 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2473 else
2475 rtx par;
2476 int i;
2478 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2479 for (i = 0; i < nregs; i++)
2481 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2482 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2483 GEN_INT (i * UNITS_PER_WORD));
2484 XVECEXP (par, 0, i) = tmp;
2486 pcum->aapcs_reg = par;
2489 pcum->aapcs_nextncrn = ncrn + nregs;
2490 return;
2493 /* C.11 */
2494 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2496 /* The argument is passed on the stack; record the number of words needed
2497 for this argument and align the total size if necessary. */
2498 on_stack:
2499 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2501 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2502 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2503 16 / UNITS_PER_WORD);
2504 return;
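/* A minimal standalone sketch (not compiler code) of the core-register
   bookkeeping above: the argument size is rounded up to whole 8-byte words
   and, when a 16-byte aligned argument needs two core registers, the next
   core register number is first rounded up to an even value (rule C.8).
   Names are hypothetical.  */
#if 0
static int
example_next_ncrn (int ncrn, int size_in_bytes, int align_in_bytes)
{
  int nregs = (size_in_bytes + 7) / 8;  /* ROUND_UP to 8-byte words.  */
  if (nregs == 2 && (ncrn % 2) != 0 && align_in_bytes == 16)
    ncrn++;                             /* C.8: start at an even NGRN.  */
  return ncrn + nregs;                  /* next free core register.  */
}
#endif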
2507 /* Implement TARGET_FUNCTION_ARG. */
2509 static rtx
2510 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2511 const_tree type, bool named)
2513 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2514 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2516 if (mode == VOIDmode)
2517 return NULL_RTX;
2519 aarch64_layout_arg (pcum_v, mode, type, named);
2520 return pcum->aapcs_reg;
2523 void
2524 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2525 const_tree fntype ATTRIBUTE_UNUSED,
2526 rtx libname ATTRIBUTE_UNUSED,
2527 const_tree fndecl ATTRIBUTE_UNUSED,
2528 unsigned n_named ATTRIBUTE_UNUSED)
2530 pcum->aapcs_ncrn = 0;
2531 pcum->aapcs_nvrn = 0;
2532 pcum->aapcs_nextncrn = 0;
2533 pcum->aapcs_nextnvrn = 0;
2534 pcum->pcs_variant = ARM_PCS_AAPCS64;
2535 pcum->aapcs_reg = NULL_RTX;
2536 pcum->aapcs_arg_processed = false;
2537 pcum->aapcs_stack_words = 0;
2538 pcum->aapcs_stack_size = 0;
2540 if (!TARGET_FLOAT
2541 && fndecl && TREE_PUBLIC (fndecl)
2542 && fntype && fntype != error_mark_node)
2544 const_tree type = TREE_TYPE (fntype);
2545 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2546 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2547 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2548 &mode, &nregs, NULL))
2549 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2551 return;
2554 static void
2555 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2556 machine_mode mode,
2557 const_tree type,
2558 bool named)
2560 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2561 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2563 aarch64_layout_arg (pcum_v, mode, type, named);
2564 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2565 != (pcum->aapcs_stack_words != 0));
2566 pcum->aapcs_arg_processed = false;
2567 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2568 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2569 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2570 pcum->aapcs_stack_words = 0;
2571 pcum->aapcs_reg = NULL_RTX;
2575 bool
2576 aarch64_function_arg_regno_p (unsigned regno)
2578 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2579 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2582 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2583 PARM_BOUNDARY bits of alignment, but will be given anything up
2584 to STACK_BOUNDARY bits if the type requires it. This makes sure
2585 that both before and after the layout of each argument, the Next
2586 Stacked Argument Address (NSAA) will have a minimum alignment of
2587 8 bytes. */
2589 static unsigned int
2590 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2592 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2593 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
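/* A minimal standalone sketch (not compiler code) of the clamping above,
   assuming the usual aarch64 values PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128 bits.  */
#if 0
static unsigned int
example_arg_boundary (unsigned int type_align_bits)
{
  unsigned int lower = 64, upper = 128; /* assumed boundary values */
  if (type_align_bits < lower)
    type_align_bits = lower;            /* at least 8 bytes */
  if (type_align_bits > upper)
    type_align_bits = upper;            /* at most 16 bytes */
  return type_align_bits;
}
#endif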
2596 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2598 Return true if an argument passed on the stack should be padded upwards,
2599 i.e. if the least-significant byte of the stack slot has useful data.
2601 Small aggregate types are placed in the lowest memory address.
2603 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2605 bool
2606 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2608 /* On little-endian targets, the least significant byte of every stack
2609 argument is passed at the lowest byte address of the stack slot. */
2610 if (!BYTES_BIG_ENDIAN)
2611 return true;
2613 /* Otherwise, integral, floating-point and pointer types are padded downward:
2614 the least significant byte of a stack argument is passed at the highest
2615 byte address of the stack slot. */
2616 if (type
2617 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2618 || POINTER_TYPE_P (type))
2619 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2620 return false;
2622 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2623 return true;
2626 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2628 It specifies padding for the last (and possibly the only)
2629 element of a block move between registers and memory. Assuming
2630 the block is in memory, padding upward means that the last
2631 element is padded after its most significant byte, while with
2632 downward padding the last element is padded on its least
2633 significant byte side.
2635 Small aggregates and small complex types are always padded
2636 upwards.
2638 We don't need to worry about homogeneous floating-point or
2639 short-vector aggregates; their move is not affected by the
2640 padding direction determined here. Regardless of endianness,
2641 each element of such an aggregate is put in the least
2642 significant bits of a fp/simd register.
2644 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2645 register has useful data, and return the opposite if the most
2646 significant byte does. */
2648 bool
2649 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2650 bool first ATTRIBUTE_UNUSED)
2653 /* Small composite types are always padded upward. */
2654 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2656 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2657 : GET_MODE_SIZE (mode));
2658 if (size < 2 * UNITS_PER_WORD)
2659 return true;
2662 /* Otherwise, use the default padding. */
2663 return !BYTES_BIG_ENDIAN;
2666 static scalar_int_mode
2667 aarch64_libgcc_cmp_return_mode (void)
2669 return SImode;
2672 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2674 /* We use the 12-bit shifted immediate arithmetic instructions so values
2675 must be a multiple of (1 << 12), i.e. 4096. */
2676 #define ARITH_FACTOR 4096
2678 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2679 #error Cannot use simple address calculation for stack probing
2680 #endif
2682 /* The pair of scratch registers used for stack probing. */
2683 #define PROBE_STACK_FIRST_REG 9
2684 #define PROBE_STACK_SECOND_REG 10
2686 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2687 inclusive. These are offsets from the current stack pointer. */
2689 static void
2690 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2692 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2694 /* See the same assertion on PROBE_INTERVAL above. */
2695 gcc_assert ((first % ARITH_FACTOR) == 0);
2697 /* See if we have a constant small number of probes to generate. If so,
2698 that's the easy case. */
2699 if (size <= PROBE_INTERVAL)
2701 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2703 emit_set_insn (reg1,
2704 plus_constant (Pmode,
2705 stack_pointer_rtx, -(first + base)));
2706 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2709 /* The run-time loop is made up of 8 insns in the generic case while the
2710 compile-time loop is made up of 4 + 2 * (n - 2) insns for n intervals. */
2711 else if (size <= 4 * PROBE_INTERVAL)
2713 HOST_WIDE_INT i, rem;
2715 emit_set_insn (reg1,
2716 plus_constant (Pmode,
2717 stack_pointer_rtx,
2718 -(first + PROBE_INTERVAL)));
2719 emit_stack_probe (reg1);
2721 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2722 it exceeds SIZE. If only two probes are needed, this will not
2723 generate any code. Then probe at FIRST + SIZE. */
2724 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2726 emit_set_insn (reg1,
2727 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2728 emit_stack_probe (reg1);
2731 rem = size - (i - PROBE_INTERVAL);
2732 if (rem > 256)
2734 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2736 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2737 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2739 else
2740 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2743 /* Otherwise, do the same as above, but in a loop. Note that we must be
2744 extra careful with variables wrapping around because we might be at
2745 the very top (or the very bottom) of the address space and we have
2746 to be able to handle this case properly; in particular, we use an
2747 equality test for the loop condition. */
2748 else
2750 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2752 /* Step 1: round SIZE to the previous multiple of the interval. */
2754 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2757 /* Step 2: compute initial and final value of the loop counter. */
2759 /* TEST_ADDR = SP + FIRST. */
2760 emit_set_insn (reg1,
2761 plus_constant (Pmode, stack_pointer_rtx, -first));
2763 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2764 HOST_WIDE_INT adjustment = - (first + rounded_size);
2765 if (! aarch64_uimm12_shift (adjustment))
2767 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2768 true, Pmode);
2769 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2771 else
2773 emit_set_insn (reg2,
2774 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2777 /* Step 3: the loop
2781 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2782 probe at TEST_ADDR
2784 while (TEST_ADDR != LAST_ADDR)
2786 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2787 until it is equal to ROUNDED_SIZE. */
2789 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2792 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2793 that SIZE is equal to ROUNDED_SIZE. */
2795 if (size != rounded_size)
2797 HOST_WIDE_INT rem = size - rounded_size;
2799 if (rem > 256)
2801 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2803 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2804 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2806 else
2807 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2811 /* Make sure nothing is scheduled before we are done. */
2812 emit_insn (gen_blockage ());
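/* A minimal standalone sketch (not compiler code) of where the probes above
   land: one at FIRST + N * PROBE_INTERVAL for each whole interval, plus a
   final probe at FIRST + SIZE when SIZE is not an exact multiple of the
   interval.  The helper name is hypothetical.  */
#if 0
static int
example_probe_offsets (long long first, long long size, long long interval,
                       long long *offsets, int max_probes)
{
  int n = 0;
  for (long long off = interval; off <= size && n < max_probes; off += interval)
    offsets[n++] = first + off;
  if (size % interval != 0 && n < max_probes)
    offsets[n++] = first + size;        /* residual probe */
  return n;
}
#endif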
2815 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2816 absolute addresses. */
2818 const char *
2819 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2821 static int labelno = 0;
2822 char loop_lab[32];
2823 rtx xops[2];
2825 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2827 /* Loop. */
2828 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2830 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2831 xops[0] = reg1;
2832 xops[1] = GEN_INT (PROBE_INTERVAL);
2833 output_asm_insn ("sub\t%0, %0, %1", xops);
2835 /* Probe at TEST_ADDR. */
2836 output_asm_insn ("str\txzr, [%0]", xops);
2838 /* Test if TEST_ADDR == LAST_ADDR. */
2839 xops[1] = reg2;
2840 output_asm_insn ("cmp\t%0, %1", xops);
2842 /* Branch. */
2843 fputs ("\tb.ne\t", asm_out_file);
2844 assemble_name_raw (asm_out_file, loop_lab);
2845 fputc ('\n', asm_out_file);
2847 return "";
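/* For reference, the loop emitted above has roughly this shape, assuming the
   default PROBE_INTERVAL of 4096 and the scratch registers x9/x10 chosen by
   PROBE_STACK_FIRST_REG/PROBE_STACK_SECOND_REG (the label name is of the
   form .LPSRL<N>):

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */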
2850 static bool
2851 aarch64_frame_pointer_required (void)
2853 /* In aarch64_override_options_after_change
2854 flag_omit_leaf_frame_pointer turns off the frame pointer by
2855 default. Turn it back on now if we do not have a leaf
2856 function. */
2857 if (flag_omit_leaf_frame_pointer
2858 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2859 return true;
2861 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2862 if (crtl->calls_eh_return)
2863 return true;
2865 return false;
2868 /* Mark the registers that need to be saved by the callee and calculate
2869 the size of the callee-saved registers area and frame record (both FP
2870 and LR may be omitted). */
2871 static void
2872 aarch64_layout_frame (void)
2874 HOST_WIDE_INT offset = 0;
2875 int regno, last_fp_reg = INVALID_REGNUM;
2877 if (reload_completed && cfun->machine->frame.laid_out)
2878 return;
2880 #define SLOT_NOT_REQUIRED (-2)
2881 #define SLOT_REQUIRED (-1)
2883 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2884 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2886 /* First mark all the registers that really need to be saved... */
2887 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2888 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2890 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2891 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2893 /* ... that includes the eh data registers (if needed)... */
2894 if (crtl->calls_eh_return)
2895 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2896 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2897 = SLOT_REQUIRED;
2899 /* ... and any callee saved register that dataflow says is live. */
2900 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2901 if (df_regs_ever_live_p (regno)
2902 && (regno == R30_REGNUM
2903 || !call_used_regs[regno]))
2904 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2906 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2907 if (df_regs_ever_live_p (regno)
2908 && !call_used_regs[regno])
2910 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2911 last_fp_reg = regno;
2914 if (frame_pointer_needed)
2916 /* FP and LR are placed in the linkage record. */
2917 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2918 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2919 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2920 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2921 offset += 2 * UNITS_PER_WORD;
2924 /* Now assign stack slots for them. */
2925 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2926 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2928 cfun->machine->frame.reg_offset[regno] = offset;
2929 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2930 cfun->machine->frame.wb_candidate1 = regno;
2931 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2932 cfun->machine->frame.wb_candidate2 = regno;
2933 offset += UNITS_PER_WORD;
2936 HOST_WIDE_INT max_int_offset = offset;
2937 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2938 bool has_align_gap = offset != max_int_offset;
2940 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2941 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2943 /* If there is an alignment gap between integer and fp callee-saves,
2944 allocate the last fp register to it if possible. */
2945 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2947 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2948 break;
2951 cfun->machine->frame.reg_offset[regno] = offset;
2952 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2953 cfun->machine->frame.wb_candidate1 = regno;
2954 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2955 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2956 cfun->machine->frame.wb_candidate2 = regno;
2957 offset += UNITS_PER_WORD;
2960 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2962 cfun->machine->frame.saved_regs_size = offset;
2964 HOST_WIDE_INT varargs_and_saved_regs_size
2965 = offset + cfun->machine->frame.saved_varargs_size;
2967 cfun->machine->frame.hard_fp_offset
2968 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2969 STACK_BOUNDARY / BITS_PER_UNIT);
2971 cfun->machine->frame.frame_size
2972 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2973 + crtl->outgoing_args_size,
2974 STACK_BOUNDARY / BITS_PER_UNIT);
2976 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2978 cfun->machine->frame.initial_adjust = 0;
2979 cfun->machine->frame.final_adjust = 0;
2980 cfun->machine->frame.callee_adjust = 0;
2981 cfun->machine->frame.callee_offset = 0;
2983 HOST_WIDE_INT max_push_offset = 0;
2984 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2985 max_push_offset = 512;
2986 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2987 max_push_offset = 256;
2989 if (cfun->machine->frame.frame_size < max_push_offset
2990 && crtl->outgoing_args_size == 0)
2992 /* Simple, small frame with no outgoing arguments:
2993 stp reg1, reg2, [sp, -frame_size]!
2994 stp reg3, reg4, [sp, 16] */
2995 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2997 else if ((crtl->outgoing_args_size
2998 + cfun->machine->frame.saved_regs_size < 512)
2999 && !(cfun->calls_alloca
3000 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3002 /* Frame with small outgoing arguments:
3003 sub sp, sp, frame_size
3004 stp reg1, reg2, [sp, outgoing_args_size]
3005 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3006 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3007 cfun->machine->frame.callee_offset
3008 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3010 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3012 /* Frame with large outgoing arguments but a small local area:
3013 stp reg1, reg2, [sp, -hard_fp_offset]!
3014 stp reg3, reg4, [sp, 16]
3015 sub sp, sp, outgoing_args_size */
3016 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3017 cfun->machine->frame.final_adjust
3018 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3020 else if (!frame_pointer_needed
3021 && varargs_and_saved_regs_size < max_push_offset)
3023 /* Frame with large local area and outgoing arguments (this pushes the
3024 callee-saves first, followed by the locals and outgoing area):
3025 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3026 stp reg3, reg4, [sp, 16]
3027 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3028 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3029 cfun->machine->frame.final_adjust
3030 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3031 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3032 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3034 else
3036 /* Frame with large local area and outgoing arguments using frame pointer:
3037 sub sp, sp, hard_fp_offset
3038 stp x29, x30, [sp, 0]
3039 add x29, sp, 0
3040 stp reg3, reg4, [sp, 16]
3041 sub sp, sp, outgoing_args_size */
3042 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3043 cfun->machine->frame.final_adjust
3044 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3047 cfun->machine->frame.laid_out = true;
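/* A minimal standalone sketch (not compiler code) of the strategy selection
   above, returning which of the prologue shapes would be used.  The enum and
   helper names are hypothetical and the sketch ignores the alloca and
   no-frame-pointer special cases.  */
#if 0
enum example_frame_shape { PUSH_ALL, SUB_THEN_STORE, PUSH_THEN_SUB, FP_FRAME };

static enum example_frame_shape
example_choose_shape (long long frame_size, long long outgoing_args,
                      long long saved_regs, long long hard_fp_offset,
                      long long max_push)
{
  if (frame_size < max_push && outgoing_args == 0)
    return PUSH_ALL;            /* stp ..., [sp, -frame_size]!  */
  if (outgoing_args + saved_regs < 512)
    return SUB_THEN_STORE;      /* sub sp; stp at outgoing_args  */
  if (hard_fp_offset < max_push)
    return PUSH_THEN_SUB;       /* stp ..., [sp, -hard_fp_offset]!; sub sp  */
  return FP_FRAME;              /* sub sp; set up x29; sub sp again  */
}
#endif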
3050 /* Return true if the register REGNO is saved on entry to
3051 the current function. */
3053 static bool
3054 aarch64_register_saved_on_entry (int regno)
3056 return cfun->machine->frame.reg_offset[regno] >= 0;
3059 /* Return the next register, from REGNO up to LIMIT, that the callee
3060 needs to save. */
3062 static unsigned
3063 aarch64_next_callee_save (unsigned regno, unsigned limit)
3065 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3066 regno++;
3067 return regno;
3070 /* Push the register number REGNO of mode MODE to the stack with write-back
3071 adjusting the stack by ADJUSTMENT. */
3073 static void
3074 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3075 HOST_WIDE_INT adjustment)
3077 rtx base_rtx = stack_pointer_rtx;
3078 rtx insn, reg, mem;
3080 reg = gen_rtx_REG (mode, regno);
3081 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3082 plus_constant (Pmode, base_rtx, -adjustment));
3083 mem = gen_frame_mem (mode, mem);
3085 insn = emit_move_insn (mem, reg);
3086 RTX_FRAME_RELATED_P (insn) = 1;
3089 /* Generate and return an instruction to store the pair of registers
3090 REG and REG2 of mode MODE to location BASE with write-back adjusting
3091 the stack location BASE by ADJUSTMENT. */
3093 static rtx
3094 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3095 HOST_WIDE_INT adjustment)
3097 switch (mode)
3099 case E_DImode:
3100 return gen_storewb_pairdi_di (base, base, reg, reg2,
3101 GEN_INT (-adjustment),
3102 GEN_INT (UNITS_PER_WORD - adjustment));
3103 case E_DFmode:
3104 return gen_storewb_pairdf_di (base, base, reg, reg2,
3105 GEN_INT (-adjustment),
3106 GEN_INT (UNITS_PER_WORD - adjustment));
3107 default:
3108 gcc_unreachable ();
3112 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3113 stack pointer by ADJUSTMENT. */
3115 static void
3116 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3118 rtx_insn *insn;
3119 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3121 if (regno2 == INVALID_REGNUM)
3122 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3124 rtx reg1 = gen_rtx_REG (mode, regno1);
3125 rtx reg2 = gen_rtx_REG (mode, regno2);
3127 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3128 reg2, adjustment));
3129 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3130 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3131 RTX_FRAME_RELATED_P (insn) = 1;
3134 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3135 adjusting it by ADJUSTMENT afterwards. */
3137 static rtx
3138 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3139 HOST_WIDE_INT adjustment)
3141 switch (mode)
3143 case E_DImode:
3144 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3145 GEN_INT (UNITS_PER_WORD));
3146 case E_DFmode:
3147 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3148 GEN_INT (UNITS_PER_WORD));
3149 default:
3150 gcc_unreachable ();
3154 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3155 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3156 into CFI_OPS. */
3158 static void
3159 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3160 rtx *cfi_ops)
3162 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3163 rtx reg1 = gen_rtx_REG (mode, regno1);
3165 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3167 if (regno2 == INVALID_REGNUM)
3169 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3170 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3171 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3173 else
3175 rtx reg2 = gen_rtx_REG (mode, regno2);
3176 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3177 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3178 reg2, adjustment));
3182 /* Generate and return a store pair instruction of mode MODE to store
3183 register REG1 to MEM1 and register REG2 to MEM2. */
3185 static rtx
3186 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3187 rtx reg2)
3189 switch (mode)
3191 case E_DImode:
3192 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3194 case E_DFmode:
3195 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3197 default:
3198 gcc_unreachable ();
3202 /* Generate and return a load pair instruction of mode MODE to load register
3203 REG1 from MEM1 and register REG2 from MEM2. */
3205 static rtx
3206 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3207 rtx mem2)
3209 switch (mode)
3211 case E_DImode:
3212 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3214 case E_DFmode:
3215 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3217 default:
3218 gcc_unreachable ();
3222 /* Return TRUE if return address signing should be enabled for the current
3223 function, otherwise return FALSE. */
3225 bool
3226 aarch64_return_address_signing_enabled (void)
3228 /* This function should only be called after the frame is laid out. */
3229 gcc_assert (cfun->machine->frame.laid_out);
3231 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3232 if its LR is pushed onto the stack. */
3233 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3234 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3235 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3238 /* Emit code to save the callee-saved registers from register number START
3239 to LIMIT to the stack at the location starting at offset START_OFFSET,
3240 skipping any write-back candidates if SKIP_WB is true. */
3242 static void
3243 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3244 unsigned start, unsigned limit, bool skip_wb)
3246 rtx_insn *insn;
3247 unsigned regno;
3248 unsigned regno2;
3250 for (regno = aarch64_next_callee_save (start, limit);
3251 regno <= limit;
3252 regno = aarch64_next_callee_save (regno + 1, limit))
3254 rtx reg, mem;
3255 HOST_WIDE_INT offset;
3257 if (skip_wb
3258 && (regno == cfun->machine->frame.wb_candidate1
3259 || regno == cfun->machine->frame.wb_candidate2))
3260 continue;
3262 if (cfun->machine->reg_is_wrapped_separately[regno])
3263 continue;
3265 reg = gen_rtx_REG (mode, regno);
3266 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3267 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3268 offset));
3270 regno2 = aarch64_next_callee_save (regno + 1, limit);
3272 if (regno2 <= limit
3273 && !cfun->machine->reg_is_wrapped_separately[regno2]
3274 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3275 == cfun->machine->frame.reg_offset[regno2]))
3278 rtx reg2 = gen_rtx_REG (mode, regno2);
3279 rtx mem2;
3281 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3282 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3283 offset));
3284 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3285 reg2));
3287 /* The first part of a frame-related parallel insn is
3288 always assumed to be relevant to the frame
3289 calculations; subsequent parts are only
3290 frame-related if explicitly marked. */
3291 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3292 regno = regno2;
3294 else
3295 insn = emit_move_insn (mem, reg);
3297 RTX_FRAME_RELATED_P (insn) = 1;
3301 /* Emit code to restore the callee registers of mode MODE from register
3302 number START up to and including LIMIT. Restore from the stack offset
3303 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3304 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3306 static void
3307 aarch64_restore_callee_saves (machine_mode mode,
3308 HOST_WIDE_INT start_offset, unsigned start,
3309 unsigned limit, bool skip_wb, rtx *cfi_ops)
3311 rtx base_rtx = stack_pointer_rtx;
3312 unsigned regno;
3313 unsigned regno2;
3314 HOST_WIDE_INT offset;
3316 for (regno = aarch64_next_callee_save (start, limit);
3317 regno <= limit;
3318 regno = aarch64_next_callee_save (regno + 1, limit))
3320 if (cfun->machine->reg_is_wrapped_separately[regno])
3321 continue;
3323 rtx reg, mem;
3325 if (skip_wb
3326 && (regno == cfun->machine->frame.wb_candidate1
3327 || regno == cfun->machine->frame.wb_candidate2))
3328 continue;
3330 reg = gen_rtx_REG (mode, regno);
3331 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3332 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3334 regno2 = aarch64_next_callee_save (regno + 1, limit);
3336 if (regno2 <= limit
3337 && !cfun->machine->reg_is_wrapped_separately[regno2]
3338 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3339 == cfun->machine->frame.reg_offset[regno2]))
3341 rtx reg2 = gen_rtx_REG (mode, regno2);
3342 rtx mem2;
3344 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3345 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3346 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3348 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3349 regno = regno2;
3351 else
3352 emit_move_insn (reg, mem);
3353 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3357 static inline bool
3358 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3359 HOST_WIDE_INT offset)
3361 return offset >= -256 && offset < 256;
3364 static inline bool
3365 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3367 return (offset >= 0
3368 && offset < 4096 * GET_MODE_SIZE (mode)
3369 && offset % GET_MODE_SIZE (mode) == 0);
3372 bool
3373 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3375 return (offset >= -64 * GET_MODE_SIZE (mode)
3376 && offset < 64 * GET_MODE_SIZE (mode)
3377 && offset % GET_MODE_SIZE (mode) == 0);
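/* A minimal standalone check (not compiler code) of the ranges above for an
   8-byte mode such as DImode: the 12-bit unsigned scaled form reaches
   0..32760 in steps of 8, the 9-bit unscaled form covers -256..255, and the
   7-bit signed scaled form covers -512..504.  */
#if 0
_Static_assert (4096 * 8 - 8 == 32760, "12-bit scaled upper bound");
_Static_assert (-64 * 8 == -512 && 64 * 8 - 8 == 504, "7-bit scaled bounds");
#endif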
3380 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3382 static sbitmap
3383 aarch64_get_separate_components (void)
3385 aarch64_layout_frame ();
3387 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3388 bitmap_clear (components);
3390 /* The registers we need saved to the frame. */
3391 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3392 if (aarch64_register_saved_on_entry (regno))
3394 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3395 if (!frame_pointer_needed)
3396 offset += cfun->machine->frame.frame_size
3397 - cfun->machine->frame.hard_fp_offset;
3398 /* Check that we can access the stack slot of the register with one
3399 direct load with no adjustments needed. */
3400 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3401 bitmap_set_bit (components, regno);
3404 /* Don't mess with the hard frame pointer. */
3405 if (frame_pointer_needed)
3406 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3408 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3409 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3410 /* If aarch64_layout_frame has chosen registers to store/restore with
3411 writeback don't interfere with them to avoid having to output explicit
3412 stack adjustment instructions. */
3413 if (reg2 != INVALID_REGNUM)
3414 bitmap_clear_bit (components, reg2);
3415 if (reg1 != INVALID_REGNUM)
3416 bitmap_clear_bit (components, reg1);
3418 bitmap_clear_bit (components, LR_REGNUM);
3419 bitmap_clear_bit (components, SP_REGNUM);
3421 return components;
3424 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3426 static sbitmap
3427 aarch64_components_for_bb (basic_block bb)
3429 bitmap in = DF_LIVE_IN (bb);
3430 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3431 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3433 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3434 bitmap_clear (components);
3436 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3437 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3438 if ((!call_used_regs[regno])
3439 && (bitmap_bit_p (in, regno)
3440 || bitmap_bit_p (gen, regno)
3441 || bitmap_bit_p (kill, regno)))
3442 bitmap_set_bit (components, regno);
3444 return components;
3447 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3448 Nothing to do for aarch64. */
3450 static void
3451 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3455 /* Return the next set bit in BMP from START onwards. Return the total number
3456 of bits in BMP if no set bit is found at or after START. */
3458 static unsigned int
3459 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3461 unsigned int nbits = SBITMAP_SIZE (bmp);
3462 if (start == nbits)
3463 return start;
3465 gcc_assert (start < nbits);
3466 for (unsigned int i = start; i < nbits; i++)
3467 if (bitmap_bit_p (bmp, i))
3468 return i;
3470 return nbits;
3473 /* Do the work for aarch64_emit_prologue_components and
3474 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3475 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
3476 for these components or the epilogue sequence. That is, it determines
3477 whether we should emit stores or loads and what kind of CFA notes to attach
3478 to the insns. Otherwise the logic for the two sequences is very
3479 similar. */
3481 static void
3482 aarch64_process_components (sbitmap components, bool prologue_p)
3484 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3485 ? HARD_FRAME_POINTER_REGNUM
3486 : STACK_POINTER_REGNUM);
3488 unsigned last_regno = SBITMAP_SIZE (components);
3489 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3490 rtx_insn *insn = NULL;
3492 while (regno != last_regno)
3494 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3495 so DFmode for the vector registers is enough. */
3496 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3497 rtx reg = gen_rtx_REG (mode, regno);
3498 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3499 if (!frame_pointer_needed)
3500 offset += cfun->machine->frame.frame_size
3501 - cfun->machine->frame.hard_fp_offset;
3502 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3503 rtx mem = gen_frame_mem (mode, addr);
3505 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3506 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3507 /* No more registers to handle after REGNO.
3508 Emit a single save/restore and exit. */
3509 if (regno2 == last_regno)
3511 insn = emit_insn (set);
3512 RTX_FRAME_RELATED_P (insn) = 1;
3513 if (prologue_p)
3514 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3515 else
3516 add_reg_note (insn, REG_CFA_RESTORE, reg);
3517 break;
3520 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3521 /* The next register is not of the same class or its offset is not
3522 mergeable with the current one into a pair. */
3523 if (!satisfies_constraint_Ump (mem)
3524 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3525 || (offset2 - cfun->machine->frame.reg_offset[regno])
3526 != GET_MODE_SIZE (mode))
3528 insn = emit_insn (set);
3529 RTX_FRAME_RELATED_P (insn) = 1;
3530 if (prologue_p)
3531 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3532 else
3533 add_reg_note (insn, REG_CFA_RESTORE, reg);
3535 regno = regno2;
3536 continue;
3539 /* REGNO2 can be saved/restored in a pair with REGNO. */
3540 rtx reg2 = gen_rtx_REG (mode, regno2);
3541 if (!frame_pointer_needed)
3542 offset2 += cfun->machine->frame.frame_size
3543 - cfun->machine->frame.hard_fp_offset;
3544 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3545 rtx mem2 = gen_frame_mem (mode, addr2);
3546 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3547 : gen_rtx_SET (reg2, mem2);
3549 if (prologue_p)
3550 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3551 else
3552 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3554 RTX_FRAME_RELATED_P (insn) = 1;
3555 if (prologue_p)
3557 add_reg_note (insn, REG_CFA_OFFSET, set);
3558 add_reg_note (insn, REG_CFA_OFFSET, set2);
3560 else
3562 add_reg_note (insn, REG_CFA_RESTORE, reg);
3563 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3566 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3570 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3572 static void
3573 aarch64_emit_prologue_components (sbitmap components)
3575 aarch64_process_components (components, true);
3578 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3580 static void
3581 aarch64_emit_epilogue_components (sbitmap components)
3583 aarch64_process_components (components, false);
3586 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3588 static void
3589 aarch64_set_handled_components (sbitmap components)
3591 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3592 if (bitmap_bit_p (components, regno))
3593 cfun->machine->reg_is_wrapped_separately[regno] = true;
3596 /* AArch64 stack frames generated by this compiler look like:
3598 +-------------------------------+
3600 | incoming stack arguments |
3602 +-------------------------------+
3603 | | <-- incoming stack pointer (aligned)
3604 | callee-allocated save area |
3605 | for register varargs |
3607 +-------------------------------+
3608 | local variables | <-- frame_pointer_rtx
3610 +-------------------------------+
3611 | padding0 | \
3612 +-------------------------------+ |
3613 | callee-saved registers | | frame.saved_regs_size
3614 +-------------------------------+ |
3615 | LR' | |
3616 +-------------------------------+ |
3617 | FP' | / <- hard_frame_pointer_rtx (aligned)
3618 +-------------------------------+
3619 | dynamic allocation |
3620 +-------------------------------+
3621 | padding |
3622 +-------------------------------+
3623 | outgoing stack arguments | <-- arg_pointer
3625 +-------------------------------+
3626 | | <-- stack_pointer_rtx (aligned)
3628 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3629 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3630 unchanged. */
3632 /* Generate the prologue instructions for entry into a function.
3633 Establish the stack frame by decreasing the stack pointer with a
3634 properly calculated size and, if necessary, create a frame record
3635 filled with the values of LR and previous frame pointer. The
3636 current FP is also set up if it is in use. */
3638 void
3639 aarch64_expand_prologue (void)
3641 aarch64_layout_frame ();
3643 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3644 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3645 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3646 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3647 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3648 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3649 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3650 rtx_insn *insn;
3652 /* Sign return address for functions. */
3653 if (aarch64_return_address_signing_enabled ())
3655 insn = emit_insn (gen_pacisp ());
3656 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3657 RTX_FRAME_RELATED_P (insn) = 1;
3660 if (flag_stack_usage_info)
3661 current_function_static_stack_size = frame_size;
3663 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3665 if (crtl->is_leaf && !cfun->calls_alloca)
3667 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3668 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3669 frame_size - STACK_CHECK_PROTECT);
3671 else if (frame_size > 0)
3672 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3675 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3677 if (callee_adjust != 0)
3678 aarch64_push_regs (reg1, reg2, callee_adjust);
3680 if (frame_pointer_needed)
3682 if (callee_adjust == 0)
3683 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3684 R30_REGNUM, false);
3685 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3686 stack_pointer_rtx,
3687 GEN_INT (callee_offset)));
3688 RTX_FRAME_RELATED_P (insn) = 1;
3689 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3692 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3693 callee_adjust != 0 || frame_pointer_needed);
3694 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3695 callee_adjust != 0 || frame_pointer_needed);
3696 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3699 /* Return TRUE if we can use a simple_return insn.
3701 This function checks whether the callee saved stack is empty, which
3702 means no restore actions are needed. The pro_and_epilogue pass uses
3703 this to check whether the shrink-wrapping optimization is feasible. */
3705 bool
3706 aarch64_use_return_insn_p (void)
3708 if (!reload_completed)
3709 return false;
3711 if (crtl->profile)
3712 return false;
3714 aarch64_layout_frame ();
3716 return cfun->machine->frame.frame_size == 0;
3719 /* Generate the epilogue instructions for returning from a function.
3720 This is almost exactly the reverse of the prologue sequence, except
3721 that we need to insert barriers to avoid scheduling loads that read
3722 from a deallocated stack, and we optimize the unwind records by
3723 emitting them all together if possible. */
3724 void
3725 aarch64_expand_epilogue (bool for_sibcall)
3727 aarch64_layout_frame ();
3729 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3730 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3731 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3732 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3733 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3734 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3735 rtx cfi_ops = NULL;
3736 rtx_insn *insn;
3738 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3739 bool need_barrier_p = (get_frame_size ()
3740 + cfun->machine->frame.saved_varargs_size) != 0;
3742 /* Emit a barrier to prevent loads from a deallocated stack. */
3743 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3744 || crtl->calls_eh_return)
3746 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3747 need_barrier_p = false;
3750 /* Restore the stack pointer from the frame pointer if it may not
3751 be the same as the stack pointer. */
3752 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3754 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3755 hard_frame_pointer_rtx,
3756 GEN_INT (-callee_offset)));
3757 /* If writeback is used when restoring callee-saves, the CFA
3758 is restored on the instruction doing the writeback. */
3759 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3761 else
3762 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3764 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3765 callee_adjust != 0, &cfi_ops);
3766 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3767 callee_adjust != 0, &cfi_ops);
3769 if (need_barrier_p)
3770 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3772 if (callee_adjust != 0)
3773 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3775 if (callee_adjust != 0 || initial_adjust > 65536)
3777 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3778 insn = get_last_insn ();
3779 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3780 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3781 RTX_FRAME_RELATED_P (insn) = 1;
3782 cfi_ops = NULL;
3785 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3787 if (cfi_ops)
3789 /* Emit delayed restores and reset the CFA to be SP. */
3790 insn = get_last_insn ();
3791 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3792 REG_NOTES (insn) = cfi_ops;
3793 RTX_FRAME_RELATED_P (insn) = 1;
3796 /* We prefer to emit the combined return/authenticate instruction RETAA,
3797 however there are three cases in which we must instead emit an explicit
3798 authentication instruction.
3800 1) Sibcalls don't return in a normal way, so if we're about to call one
3801 we must authenticate.
3803 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3804 generating code for !TARGET_ARMV8_3 we can't use it and must
3805 explicitly authenticate.
3807 3) On an eh_return path we make extra stack adjustments to update the
3808 canonical frame address to be the exception handler's CFA. We want
3809 to authenticate using the CFA of the function which calls eh_return.
3811 if (aarch64_return_address_signing_enabled ()
3812 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3814 insn = emit_insn (gen_autisp ());
3815 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3816 RTX_FRAME_RELATED_P (insn) = 1;
3819 /* Stack adjustment for exception handler. */
3820 if (crtl->calls_eh_return)
3822 /* We need to unwind the stack by the offset computed by
3823 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3824 to be SP; letting the CFA move during this adjustment
3825 is just as correct as retaining the CFA from the body
3826 of the function. Therefore, do nothing special. */
3827 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3830 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3831 if (!for_sibcall)
3832 emit_jump_insn (ret_rtx);
3835 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3836 normally or return to a previous frame after unwinding.
3838 An EH return uses a single shared return sequence. The epilogue is
3839 exactly like a normal epilogue except that it has an extra input
3840 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3841 that must be applied after the frame has been destroyed. An extra label
3842 is inserted before the epilogue which initializes this register to zero,
3843 and this is the entry point for a normal return.
3845 An actual EH return updates the return address, initializes the stack
3846 adjustment and jumps directly into the epilogue (bypassing the zeroing
3847 of the adjustment). Since the return address is typically saved on the
3848 stack when a function makes a call, the saved LR must be updated outside
3849 the epilogue.
3851 This poses problems as the store is generated well before the epilogue,
3852 so the offset of LR is not known yet. Also optimizations will remove the
3853 store as it appears dead, even after the epilogue is generated (as the
3854 base or offset for loading LR is different in many cases).
3856 To avoid these problems this implementation forces the frame pointer
3857 in eh_return functions so that the location of LR is fixed and known early.
3858 It also marks the store volatile, so no optimization is permitted to
3859 remove the store. */
3861 aarch64_eh_return_handler_rtx (void)
3863 rtx tmp = gen_frame_mem (Pmode,
3864 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3866 /* Mark the store volatile, so no optimization is permitted to remove it. */
3867 MEM_VOLATILE_P (tmp) = true;
3868 return tmp;
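/* As a hedged illustration: with the frame pointer forced for eh_return
   functions, the saved LR sits immediately above the saved frame pointer,
   so under the default LP64 ABI the rtx built above is roughly

     (mem/v:DI (plus:DI (reg/f:DI 29 x29) (const_int 8)))

   i.e. a volatile reference to [x29, #8], which the unwinder can update
   without the store being optimized away.  */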
3871 /* Output code to add DELTA to the first argument, and then jump
3872 to FUNCTION. Used for C++ multiple inheritance. */
3873 static void
3874 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3875 HOST_WIDE_INT delta,
3876 HOST_WIDE_INT vcall_offset,
3877 tree function)
3879 /* The this pointer is always in x0. Note that this differs from
3880 Arm where the this pointer may be bumped to r1 if r0 is required
3881 to return a pointer to an aggregate. On AArch64 a result value
3882 pointer will be in x8. */
3883 int this_regno = R0_REGNUM;
3884 rtx this_rtx, temp0, temp1, addr, funexp;
3885 rtx_insn *insn;
3887 reload_completed = 1;
3888 emit_note (NOTE_INSN_PROLOGUE_END);
3890 if (vcall_offset == 0)
3891 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3892 else
3894 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3896 this_rtx = gen_rtx_REG (Pmode, this_regno);
3897 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3898 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3900 addr = this_rtx;
3901 if (delta != 0)
3903 if (delta >= -256 && delta < 256)
3904 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3905 plus_constant (Pmode, this_rtx, delta));
3906 else
3907 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3910 if (Pmode == ptr_mode)
3911 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3912 else
3913 aarch64_emit_move (temp0,
3914 gen_rtx_ZERO_EXTEND (Pmode,
3915 gen_rtx_MEM (ptr_mode, addr)));
3917 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3918 addr = plus_constant (Pmode, temp0, vcall_offset);
3919 else
3921 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3922 Pmode);
3923 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3926 if (Pmode == ptr_mode)
3927 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3928 else
3929 aarch64_emit_move (temp1,
3930 gen_rtx_SIGN_EXTEND (Pmode,
3931 gen_rtx_MEM (ptr_mode, addr)));
3933 emit_insn (gen_add2_insn (this_rtx, temp1));
3936 /* Generate a tail call to the target function. */
3937 if (!TREE_USED (function))
3939 assemble_external (function);
3940 TREE_USED (function) = 1;
3942 funexp = XEXP (DECL_RTL (function), 0);
3943 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3944 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3945 SIBLING_CALL_P (insn) = 1;
3947 insn = get_insns ();
3948 shorten_branches (insn);
3949 final_start_function (insn, file, 1);
3950 final (insn, file, 1);
3951 final_end_function ();
3953 /* Stop pretending to be a post-reload pass. */
3954 reload_completed = 0;
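/* A hedged example of the code such a thunk is expected to produce,
   assuming DELTA == 16, VCALL_OFFSET == 0 and a hypothetical target
   symbol _ZN1D1fEv (the exact sequence depends on the constants and on
   how the target binds):

     add x0, x0, 16      // adjust the this pointer by DELTA
     b   _ZN1D1fEv       // tail call the target function

   With a non-zero VCALL_OFFSET, the code above first loads the vtable
   pointer from [x0] into x16, loads the adjustment from
   [x16, #VCALL_OFFSET] into x17 and adds it to x0 before branching.  */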
3957 static bool
3958 aarch64_tls_referenced_p (rtx x)
3960 if (!TARGET_HAVE_TLS)
3961 return false;
3962 subrtx_iterator::array_type array;
3963 FOR_EACH_SUBRTX (iter, array, x, ALL)
3965 const_rtx x = *iter;
3966 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3967 return true;
3968 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3969 TLS offsets, not real symbol references. */
3970 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3971 iter.skip_subrtxes ();
3973 return false;
3977 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3978 a left shift of 0 or 12 bits. */
3979 bool
3980 aarch64_uimm12_shift (HOST_WIDE_INT val)
3982 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3983 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
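/* For illustration only (values worked out from the definition above):

     aarch64_uimm12_shift (0xabc)     -> true   (12-bit field, shift 0)
     aarch64_uimm12_shift (0xabc000)  -> true   (12-bit field, shift 12)
     aarch64_uimm12_shift (0xabc00)   -> false  (straddles both fields)  */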
3988 /* Return true if val is an immediate that can be loaded into a
3989 register by a MOVZ instruction. */
3990 static bool
3991 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3993 if (GET_MODE_SIZE (mode) > 4)
3995 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3996 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3997 return 1;
3999 else
4001 /* Ignore sign extension. */
4002 val &= (HOST_WIDE_INT) 0xffffffff;
4004 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4005 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
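/* For illustration only (values derived from the checks above): in DImode,
   0x1234, 0x12340000, 0x123400000000 and 0x1234000000000000 are each a
   single MOVZ (one 16-bit chunk at a halfword boundary), while 0x12345678
   would also need a MOVK and is rejected here.  */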
4008 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4010 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4012 0x0000000100000001ull,
4013 0x0001000100010001ull,
4014 0x0101010101010101ull,
4015 0x1111111111111111ull,
4016 0x5555555555555555ull,
4020 /* Return true if val is a valid bitmask immediate. */
4022 bool
4023 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4025 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4026 int bits;
4028 /* Check for a single sequence of one bits and return quickly if so.
4029 The special cases of all ones and all zeroes return false. */
4030 val = (unsigned HOST_WIDE_INT) val_in;
4031 tmp = val + (val & -val);
4033 if (tmp == (tmp & -tmp))
4034 return (val + 1) > 1;
4036 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4037 if (mode == SImode)
4038 val = (val << 32) | (val & 0xffffffff);
4040 /* Invert if the immediate doesn't start with a zero bit - this means we
4041 only need to search for sequences of one bits. */
4042 if (val & 1)
4043 val = ~val;
4045 /* Find the first set bit and set tmp to val with the first sequence of one
4046 bits removed. Return success if there is a single sequence of ones. */
4047 first_one = val & -val;
4048 tmp = val & (val + first_one);
4050 if (tmp == 0)
4051 return true;
4053 /* Find the next set bit and compute the difference in bit position. */
4054 next_one = tmp & -tmp;
4055 bits = clz_hwi (first_one) - clz_hwi (next_one);
4056 mask = val ^ tmp;
4058 /* Check that the bit position difference is a power of 2 and that the
4059 first sequence of one bits fits within 'bits' bits. */
4060 if ((mask >> bits) != 0 || bits != (bits & -bits))
4061 return false;
4063 /* Check the sequence of one bits is repeated 64/bits times. */
4064 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
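/* A hypothetical checking helper, not part of this file's interface, that
   makes a few accepted and rejected bitmask immediates concrete.  It is
   illustrative only and never called; a real build would need a caller or
   an unused attribute.  */
static void
aarch64_bitmask_imm_examples (void)
{
  /* A run of 8 ones repeated every 16 bits is encodable.  */
  gcc_assert (aarch64_bitmask_imm (0x00ff00ff00ff00ffull, DImode));
  /* Alternating bits form a repeating 2-bit pattern.  */
  gcc_assert (aarch64_bitmask_imm (0x5555555555555555ull, DImode));
  /* All zeros and all ones are explicitly rejected.  */
  gcc_assert (!aarch64_bitmask_imm (0, DImode));
  gcc_assert (!aarch64_bitmask_imm (-1, DImode));
  /* Two separate runs with no valid repeat period are rejected.  */
  gcc_assert (!aarch64_bitmask_imm (0x5, DImode));
}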
4067 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4068 Assumed precondition: VAL_IN is not zero. */
4070 unsigned HOST_WIDE_INT
4071 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4073 int lowest_bit_set = ctz_hwi (val_in);
4074 int highest_bit_set = floor_log2 (val_in);
4075 gcc_assert (val_in != 0);
4077 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4078 (HOST_WIDE_INT_1U << lowest_bit_set));
4081 /* Create a constant in which the bits outside the range from the lowest
4082 set bit to the highest set bit of VAL_IN are set to 1. */
4084 unsigned HOST_WIDE_INT
4085 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4087 return val_in | ~aarch64_and_split_imm1 (val_in);
4090 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4092 bool
4093 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4095 if (aarch64_bitmask_imm (val_in, mode))
4096 return false;
4098 if (aarch64_move_imm (val_in, mode))
4099 return false;
4101 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4103 return aarch64_bitmask_imm (imm2, mode);
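/* A hypothetical worked example: the DImode constant 0x00ff00ff is neither
   a bitmask nor a MOV immediate, but it is the intersection of two bitmask
   immediates, so an AND with it can be split into two ANDs.  Illustrative
   only and never called.  */
static void
aarch64_and_split_example (void)
{
  unsigned HOST_WIDE_INT val = 0x00ff00ff;
  gcc_assert (aarch64_and_bitmask_imm (val, DImode));
  /* Ones covering the lowest to the highest set bit of VAL.  */
  gcc_assert (aarch64_and_split_imm1 (val) == 0xffffff);
  /* Everything outside that range, plus the original bits.  */
  gcc_assert (aarch64_and_split_imm2 (val) == ~HOST_WIDE_INT_UC (0xff00));
  /* AND-ing with both masks in turn reproduces the original constant.  */
  gcc_assert ((aarch64_and_split_imm1 (val)
	       & aarch64_and_split_imm2 (val)) == val);
}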
4106 /* Return true if val is an immediate that can be loaded into a
4107 register in a single instruction. */
4108 bool
4109 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4111 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4112 return 1;
4113 return aarch64_bitmask_imm (val, mode);
4116 static bool
4117 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4119 rtx base, offset;
4121 if (GET_CODE (x) == HIGH)
4122 return true;
4124 split_const (x, &base, &offset);
4125 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4127 if (aarch64_classify_symbol (base, offset)
4128 != SYMBOL_FORCE_TO_MEM)
4129 return true;
4130 else
4131 /* Avoid generating a 64-bit relocation in ILP32; leave
4132 to aarch64_expand_mov_immediate to handle it properly. */
4133 return mode != ptr_mode;
4136 return aarch64_tls_referenced_p (x);
4139 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4140 The expansion for a table switch is quite expensive due to the number
4141 of instructions, the table lookup and the hard-to-predict indirect jump.
4142 When optimizing for speed at -O3 and above, use the per-core tuning if
4143 set; otherwise use tables for more than 16 cases as a tradeoff between
4144 size and performance. When optimizing for size, use the default setting. */
4146 static unsigned int
4147 aarch64_case_values_threshold (void)
4149 /* Use the specified limit for the number of cases before using jump
4150 tables at higher optimization levels. */
4151 if (optimize > 2
4152 && selected_cpu->tune->max_case_values != 0)
4153 return selected_cpu->tune->max_case_values;
4154 else
4155 return optimize_size ? default_case_values_threshold () : 17;
4158 /* Return true if register REGNO is a valid index register.
4159 STRICT_P is true if REG_OK_STRICT is in effect. */
4161 bool
4162 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4164 if (!HARD_REGISTER_NUM_P (regno))
4166 if (!strict_p)
4167 return true;
4169 if (!reg_renumber)
4170 return false;
4172 regno = reg_renumber[regno];
4174 return GP_REGNUM_P (regno);
4177 /* Return true if register REGNO is a valid base register for mode MODE.
4178 STRICT_P is true if REG_OK_STRICT is in effect. */
4180 bool
4181 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4183 if (!HARD_REGISTER_NUM_P (regno))
4185 if (!strict_p)
4186 return true;
4188 if (!reg_renumber)
4189 return false;
4191 regno = reg_renumber[regno];
4194 /* The fake registers will be eliminated to either the stack or
4195 hard frame pointer, both of which are usually valid base registers.
4196 Reload deals with the cases where the eliminated form isn't valid. */
4197 return (GP_REGNUM_P (regno)
4198 || regno == SP_REGNUM
4199 || regno == FRAME_POINTER_REGNUM
4200 || regno == ARG_POINTER_REGNUM);
4203 /* Return true if X is a valid base register for mode MODE.
4204 STRICT_P is true if REG_OK_STRICT is in effect. */
4206 static bool
4207 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4209 if (!strict_p
4210 && GET_CODE (x) == SUBREG
4211 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4212 x = SUBREG_REG (x);
4214 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4217 /* Return true if address offset is a valid index. If it is, fill in INFO
4218 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4220 static bool
4221 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4222 machine_mode mode, bool strict_p)
4224 enum aarch64_address_type type;
4225 rtx index;
4226 int shift;
4228 /* (reg:P) */
4229 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4230 && GET_MODE (x) == Pmode)
4232 type = ADDRESS_REG_REG;
4233 index = x;
4234 shift = 0;
4236 /* (sign_extend:DI (reg:SI)) */
4237 else if ((GET_CODE (x) == SIGN_EXTEND
4238 || GET_CODE (x) == ZERO_EXTEND)
4239 && GET_MODE (x) == DImode
4240 && GET_MODE (XEXP (x, 0)) == SImode)
4242 type = (GET_CODE (x) == SIGN_EXTEND)
4243 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4244 index = XEXP (x, 0);
4245 shift = 0;
4247 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4248 else if (GET_CODE (x) == MULT
4249 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4250 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4251 && GET_MODE (XEXP (x, 0)) == DImode
4252 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4253 && CONST_INT_P (XEXP (x, 1)))
4255 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4256 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4257 index = XEXP (XEXP (x, 0), 0);
4258 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4260 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4261 else if (GET_CODE (x) == ASHIFT
4262 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4263 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4264 && GET_MODE (XEXP (x, 0)) == DImode
4265 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4266 && CONST_INT_P (XEXP (x, 1)))
4268 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4269 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4270 index = XEXP (XEXP (x, 0), 0);
4271 shift = INTVAL (XEXP (x, 1));
4273 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4274 else if ((GET_CODE (x) == SIGN_EXTRACT
4275 || GET_CODE (x) == ZERO_EXTRACT)
4276 && GET_MODE (x) == DImode
4277 && GET_CODE (XEXP (x, 0)) == MULT
4278 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4279 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4281 type = (GET_CODE (x) == SIGN_EXTRACT)
4282 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4283 index = XEXP (XEXP (x, 0), 0);
4284 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4285 if (INTVAL (XEXP (x, 1)) != 32 + shift
4286 || INTVAL (XEXP (x, 2)) != 0)
4287 shift = -1;
4289 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4290 (const_int 0xffffffff<<shift)) */
4291 else if (GET_CODE (x) == AND
4292 && GET_MODE (x) == DImode
4293 && GET_CODE (XEXP (x, 0)) == MULT
4294 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4295 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4296 && CONST_INT_P (XEXP (x, 1)))
4298 type = ADDRESS_REG_UXTW;
4299 index = XEXP (XEXP (x, 0), 0);
4300 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4301 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4302 shift = -1;
4304 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4305 else if ((GET_CODE (x) == SIGN_EXTRACT
4306 || GET_CODE (x) == ZERO_EXTRACT)
4307 && GET_MODE (x) == DImode
4308 && GET_CODE (XEXP (x, 0)) == ASHIFT
4309 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4310 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4312 type = (GET_CODE (x) == SIGN_EXTRACT)
4313 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4314 index = XEXP (XEXP (x, 0), 0);
4315 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4316 if (INTVAL (XEXP (x, 1)) != 32 + shift
4317 || INTVAL (XEXP (x, 2)) != 0)
4318 shift = -1;
4320 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4321 (const_int 0xffffffff<<shift)) */
4322 else if (GET_CODE (x) == AND
4323 && GET_MODE (x) == DImode
4324 && GET_CODE (XEXP (x, 0)) == ASHIFT
4325 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4326 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4327 && CONST_INT_P (XEXP (x, 1)))
4329 type = ADDRESS_REG_UXTW;
4330 index = XEXP (XEXP (x, 0), 0);
4331 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4332 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4333 shift = -1;
4335 /* (mult:P (reg:P) (const_int scale)) */
4336 else if (GET_CODE (x) == MULT
4337 && GET_MODE (x) == Pmode
4338 && GET_MODE (XEXP (x, 0)) == Pmode
4339 && CONST_INT_P (XEXP (x, 1)))
4341 type = ADDRESS_REG_REG;
4342 index = XEXP (x, 0);
4343 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4345 /* (ashift:P (reg:P) (const_int shift)) */
4346 else if (GET_CODE (x) == ASHIFT
4347 && GET_MODE (x) == Pmode
4348 && GET_MODE (XEXP (x, 0)) == Pmode
4349 && CONST_INT_P (XEXP (x, 1)))
4351 type = ADDRESS_REG_REG;
4352 index = XEXP (x, 0);
4353 shift = INTVAL (XEXP (x, 1));
4355 else
4356 return false;
4358 if (!strict_p
4359 && GET_CODE (index) == SUBREG
4360 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4361 index = SUBREG_REG (index);
4363 if ((shift == 0 ||
4364 (shift > 0 && shift <= 3
4365 && (1 << shift) == GET_MODE_SIZE (mode)))
4366 && REG_P (index)
4367 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4369 info->type = type;
4370 info->offset = index;
4371 info->shift = shift;
4372 return true;
4375 return false;
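/* A worked example for illustration: for an SImode access, the index part
   of an address such as

     (plus:DI (reg:DI x0)
	      (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4)))

   is classified as ADDRESS_REG_SXTW with shift 2 and is later printed as
   [x0, w1, sxtw 2]; the same index scaled by 16 would be rejected, since
   the shift must match the access size.  */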
4378 /* Return true if MODE is one of the modes for which we
4379 support LDP/STP operations. */
4381 static bool
4382 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4384 return mode == SImode || mode == DImode
4385 || mode == SFmode || mode == DFmode
4386 || (aarch64_vector_mode_supported_p (mode)
4387 && GET_MODE_SIZE (mode) == 8);
4390 /* Return true if REGNO is a virtual pointer register, or an eliminable
4391 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4392 include stack_pointer or hard_frame_pointer. */
4393 static bool
4394 virt_or_elim_regno_p (unsigned regno)
4396 return ((regno >= FIRST_VIRTUAL_REGISTER
4397 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4398 || regno == FRAME_POINTER_REGNUM
4399 || regno == ARG_POINTER_REGNUM);
4402 /* Return true if X is a valid address for machine mode MODE. If it is,
4403 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4404 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4406 static bool
4407 aarch64_classify_address (struct aarch64_address_info *info,
4408 rtx x, machine_mode mode,
4409 RTX_CODE outer_code, bool strict_p)
4411 enum rtx_code code = GET_CODE (x);
4412 rtx op0, op1;
4414 /* On BE, we use load/store pair for all large int mode load/stores.
4415 TI/TFmode may also use a load/store pair. */
4416 bool load_store_pair_p = (outer_code == PARALLEL
4417 || mode == TImode
4418 || mode == TFmode
4419 || (BYTES_BIG_ENDIAN
4420 && aarch64_vect_struct_mode_p (mode)));
4422 bool allow_reg_index_p =
4423 !load_store_pair_p
4424 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4425 && !aarch64_vect_struct_mode_p (mode);
4427 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4428 REG addressing. */
4429 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4430 && (code != POST_INC && code != REG))
4431 return false;
4433 switch (code)
4435 case REG:
4436 case SUBREG:
4437 info->type = ADDRESS_REG_IMM;
4438 info->base = x;
4439 info->offset = const0_rtx;
4440 return aarch64_base_register_rtx_p (x, strict_p);
4442 case PLUS:
4443 op0 = XEXP (x, 0);
4444 op1 = XEXP (x, 1);
4446 if (! strict_p
4447 && REG_P (op0)
4448 && virt_or_elim_regno_p (REGNO (op0))
4449 && CONST_INT_P (op1))
4451 info->type = ADDRESS_REG_IMM;
4452 info->base = op0;
4453 info->offset = op1;
4455 return true;
4458 if (GET_MODE_SIZE (mode) != 0
4459 && CONST_INT_P (op1)
4460 && aarch64_base_register_rtx_p (op0, strict_p))
4462 HOST_WIDE_INT offset = INTVAL (op1);
4464 info->type = ADDRESS_REG_IMM;
4465 info->base = op0;
4466 info->offset = op1;
4468 /* TImode and TFmode values are allowed in both pairs of X
4469 registers and individual Q registers. The available
4470 address modes are:
4471 X,X: 7-bit signed scaled offset
4472 Q: 9-bit signed offset
4473 We conservatively require an offset representable in either mode.
4474 When performing the check for pairs of X registers i.e. LDP/STP
4475 pass down DImode since that is the natural size of the LDP/STP
4476 instruction memory accesses. */
4477 if (mode == TImode || mode == TFmode)
4478 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4479 && (offset_9bit_signed_unscaled_p (mode, offset)
4480 || offset_12bit_unsigned_scaled_p (mode, offset)));
4482 /* A 7-bit offset check because OImode will emit an ldp/stp
4483 instruction (only big endian will get here).
4484 For ldp/stp instructions, the offset is scaled for the size of a
4485 single element of the pair. */
4486 if (mode == OImode)
4487 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4489 /* Three 9/12-bit offset checks because CImode will emit three
4490 ldr/str instructions (only big endian will get here). */
4491 if (mode == CImode)
4492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4493 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4494 || offset_12bit_unsigned_scaled_p (V16QImode,
4495 offset + 32)));
4497 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4498 instructions (only big endian will get here). */
4499 if (mode == XImode)
4500 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4501 && aarch64_offset_7bit_signed_scaled_p (TImode,
4502 offset + 32));
4504 if (load_store_pair_p)
4505 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4506 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4507 else
4508 return (offset_9bit_signed_unscaled_p (mode, offset)
4509 || offset_12bit_unsigned_scaled_p (mode, offset));
4512 if (allow_reg_index_p)
4514 /* Look for base + (scaled/extended) index register. */
4515 if (aarch64_base_register_rtx_p (op0, strict_p)
4516 && aarch64_classify_index (info, op1, mode, strict_p))
4518 info->base = op0;
4519 return true;
4521 if (aarch64_base_register_rtx_p (op1, strict_p)
4522 && aarch64_classify_index (info, op0, mode, strict_p))
4524 info->base = op1;
4525 return true;
4529 return false;
4531 case POST_INC:
4532 case POST_DEC:
4533 case PRE_INC:
4534 case PRE_DEC:
4535 info->type = ADDRESS_REG_WB;
4536 info->base = XEXP (x, 0);
4537 info->offset = NULL_RTX;
4538 return aarch64_base_register_rtx_p (info->base, strict_p);
4540 case POST_MODIFY:
4541 case PRE_MODIFY:
4542 info->type = ADDRESS_REG_WB;
4543 info->base = XEXP (x, 0);
4544 if (GET_CODE (XEXP (x, 1)) == PLUS
4545 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4546 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4547 && aarch64_base_register_rtx_p (info->base, strict_p))
4549 HOST_WIDE_INT offset;
4550 info->offset = XEXP (XEXP (x, 1), 1);
4551 offset = INTVAL (info->offset);
4553 /* TImode and TFmode values are allowed in both pairs of X
4554 registers and individual Q registers. The available
4555 address modes are:
4556 X,X: 7-bit signed scaled offset
4557 Q: 9-bit signed offset
4558 We conservatively require an offset representable in either mode.
4560 if (mode == TImode || mode == TFmode)
4561 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4562 && offset_9bit_signed_unscaled_p (mode, offset));
4564 if (load_store_pair_p)
4565 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4566 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4567 else
4568 return offset_9bit_signed_unscaled_p (mode, offset);
4570 return false;
4572 case CONST:
4573 case SYMBOL_REF:
4574 case LABEL_REF:
4575 /* load literal: pc-relative constant pool entry. Only supported
4576 for SI mode or larger. */
4577 info->type = ADDRESS_SYMBOLIC;
4579 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4581 rtx sym, addend;
4583 split_const (x, &sym, &addend);
4584 return ((GET_CODE (sym) == LABEL_REF
4585 || (GET_CODE (sym) == SYMBOL_REF
4586 && CONSTANT_POOL_ADDRESS_P (sym)
4587 && aarch64_pcrelative_literal_loads)));
4589 return false;
4591 case LO_SUM:
4592 info->type = ADDRESS_LO_SUM;
4593 info->base = XEXP (x, 0);
4594 info->offset = XEXP (x, 1);
4595 if (allow_reg_index_p
4596 && aarch64_base_register_rtx_p (info->base, strict_p))
4598 rtx sym, offs;
4599 split_const (info->offset, &sym, &offs);
4600 if (GET_CODE (sym) == SYMBOL_REF
4601 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4603 /* The symbol and offset must be aligned to the access size. */
4604 unsigned int align;
4605 unsigned int ref_size;
4607 if (CONSTANT_POOL_ADDRESS_P (sym))
4608 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4609 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4611 tree exp = SYMBOL_REF_DECL (sym);
4612 align = TYPE_ALIGN (TREE_TYPE (exp));
4613 align = CONSTANT_ALIGNMENT (exp, align);
4615 else if (SYMBOL_REF_DECL (sym))
4616 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4617 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4618 && SYMBOL_REF_BLOCK (sym) != NULL)
4619 align = SYMBOL_REF_BLOCK (sym)->alignment;
4620 else
4621 align = BITS_PER_UNIT;
4623 ref_size = GET_MODE_SIZE (mode);
4624 if (ref_size == 0)
4625 ref_size = GET_MODE_SIZE (DImode);
4627 return ((INTVAL (offs) & (ref_size - 1)) == 0
4628 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4631 return false;
4633 default:
4634 return false;
4638 /* Return true if the address X is valid for a PRFM instruction.
4639 STRICT_P is true if we should do strict checking with
4640 aarch64_classify_address. */
4642 bool
4643 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4645 struct aarch64_address_info addr;
4647 /* PRFM accepts the same addresses as DImode... */
4648 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4649 if (!res)
4650 return false;
4652 /* ... except writeback forms. */
4653 return addr.type != ADDRESS_REG_WB;
4656 bool
4657 aarch64_symbolic_address_p (rtx x)
4659 rtx offset;
4661 split_const (x, &x, &offset);
4662 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4665 /* Classify the base of symbolic expression X. */
4667 enum aarch64_symbol_type
4668 aarch64_classify_symbolic_expression (rtx x)
4670 rtx offset;
4672 split_const (x, &x, &offset);
4673 return aarch64_classify_symbol (x, offset);
4677 /* Return TRUE if X is a legitimate address for accessing memory in
4678 mode MODE. */
4679 static bool
4680 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4682 struct aarch64_address_info addr;
4684 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4687 /* Return TRUE if X is a legitimate address for accessing memory in
4688 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4689 pair operation. */
4690 bool
4691 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4692 RTX_CODE outer_code, bool strict_p)
4694 struct aarch64_address_info addr;
4696 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4699 /* Split an out-of-range address displacement into a base and offset.
4700 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4701 to increase opportunities for sharing the base address between accesses of different sizes.
4702 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4703 static bool
4704 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4706 HOST_WIDE_INT offset = INTVAL (*disp);
4707 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4709 if (mode == TImode || mode == TFmode
4710 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4711 base = (offset + 0x100) & ~0x1ff;
4713 *off = GEN_INT (base);
4714 *disp = GEN_INT (offset - base);
4715 return true;
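/* Worked examples derived from the code above: an aligned SImode access at
   offset 0x4010 splits into an anchor of 0x4000 plus a residual 0x10
   (16KB mask 0x3ffc); a QImode access at 0x1234 uses the 4KB mask and
   splits into 0x1000 + 0x234; a TImode access at 0x2345 falls back to the
   signed 9-bit range and splits into 0x2400 - 0xbb.  */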
4718 /* Return the binary representation of floating point constant VALUE in INTVAL.
4719 If the value cannot be converted, return false without setting INTVAL.
4720 The conversion is done in the given MODE. */
4721 bool
4722 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4725 /* We make a general exception for 0. */
4726 if (aarch64_float_const_zero_rtx_p (value))
4728 *intval = 0;
4729 return true;
4732 machine_mode mode = GET_MODE (value);
4733 if (GET_CODE (value) != CONST_DOUBLE
4734 || !SCALAR_FLOAT_MODE_P (mode)
4735 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4736 /* Only support up to DF mode. */
4737 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4738 return false;
4740 unsigned HOST_WIDE_INT ival = 0;
4742 long res[2];
4743 real_to_target (res,
4744 CONST_DOUBLE_REAL_VALUE (value),
4745 REAL_MODE_FORMAT (mode));
4747 if (mode == DFmode)
4749 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4750 ival = zext_hwi (res[order], 32);
4751 ival |= (zext_hwi (res[1 - order], 32) << 32);
4753 else
4754 ival = zext_hwi (res[0], 32);
4756 *intval = ival;
4757 return true;
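/* For example (standard IEEE-754 encodings): 1.0 in DFmode yields
   0x3ff0000000000000, 1.0 in SFmode yields 0x3f800000 and 1.0 in HFmode
   yields 0x3c00, each returned zero-extended in *INTVAL.  */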
4760 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4761 single MOV(+MOVK) followed by an FMOV. */
4762 bool
4763 aarch64_float_const_rtx_p (rtx x)
4765 machine_mode mode = GET_MODE (x);
4766 if (mode == VOIDmode)
4767 return false;
4769 /* Determine whether it's cheaper to write float constants as
4770 mov/movk pairs rather than as ldr/adrp pairs. */
4771 unsigned HOST_WIDE_INT ival;
4773 if (GET_CODE (x) == CONST_DOUBLE
4774 && SCALAR_FLOAT_MODE_P (mode)
4775 && aarch64_reinterpret_float_as_int (x, &ival))
4777 machine_mode imode = (mode == HFmode
4778 ? SImode
4779 : int_mode_for_mode (mode).require ());
4780 int num_instr = aarch64_internal_mov_immediate
4781 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4782 return num_instr < 3;
4785 return false;
4788 /* Return TRUE if rtx X is the immediate constant 0.0. */
4789 bool
4790 aarch64_float_const_zero_rtx_p (rtx x)
4792 if (GET_MODE (x) == VOIDmode)
4793 return false;
4795 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4796 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4797 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4800 /* Return TRUE if rtx X is an immediate constant that fits in a single
4801 MOVI immediate operation. */
4802 bool
4803 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4805 if (!TARGET_SIMD)
4806 return false;
4808 machine_mode vmode, imode;
4809 unsigned HOST_WIDE_INT ival;
4811 if (GET_CODE (x) == CONST_DOUBLE
4812 && SCALAR_FLOAT_MODE_P (mode))
4814 if (!aarch64_reinterpret_float_as_int (x, &ival))
4815 return false;
4817 /* We make a general exception for 0. */
4818 if (aarch64_float_const_zero_rtx_p (x))
4819 return true;
4821 imode = int_mode_for_mode (mode).require ();
4823 else if (GET_CODE (x) == CONST_INT
4824 && SCALAR_INT_MODE_P (mode))
4826 imode = mode;
4827 ival = INTVAL (x);
4829 else
4830 return false;
4832 /* Use a 64-bit mode for everything except DI/DF mode, where we use
4833 a 128-bit vector mode. */
4834 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4836 vmode = aarch64_simd_container_mode (imode, width);
4837 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4839 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4843 /* Return the fixed registers used for condition codes. */
4845 static bool
4846 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4848 *p1 = CC_REGNUM;
4849 *p2 = INVALID_REGNUM;
4850 return true;
4853 /* This function is used by the call expanders of the machine description.
4854 RESULT is the register in which the result is returned. It's NULL for
4855 "call" and "sibcall".
4856 MEM is the location of the function call.
4857 SIBCALL indicates whether this function call is a normal call or a sibling call.
4858 It will generate a different pattern accordingly. */
4860 void
4861 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4863 rtx call, callee, tmp;
4864 rtvec vec;
4865 machine_mode mode;
4867 gcc_assert (MEM_P (mem));
4868 callee = XEXP (mem, 0);
4869 mode = GET_MODE (callee);
4870 gcc_assert (mode == Pmode);
4872 /* Decide if we should generate indirect calls by loading the
4873 address of the callee into a register before performing
4874 the branch-and-link. */
4875 if (SYMBOL_REF_P (callee)
4876 ? (aarch64_is_long_call_p (callee)
4877 || aarch64_is_noplt_call_p (callee))
4878 : !REG_P (callee))
4879 XEXP (mem, 0) = force_reg (mode, callee);
4881 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4883 if (result != NULL_RTX)
4884 call = gen_rtx_SET (result, call);
4886 if (sibcall)
4887 tmp = ret_rtx;
4888 else
4889 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4891 vec = gen_rtvec (2, call, tmp);
4892 call = gen_rtx_PARALLEL (VOIDmode, vec);
4894 aarch64_emit_call_insn (call);
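/* As an illustration derived from the code above, a plain call to a
   hypothetical nearby function foo ends up roughly as

     (parallel [(call (mem:DI (symbol_ref:DI ("foo"))) (const_int 0))
		(clobber (reg:DI LR_REGNUM))])

   while a sibcall replaces the clobber of the link register with
   (return), and a value-returning call wraps the call rtx in a set of
   the result register.  */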
4897 /* Emit call insn with PAT and do aarch64-specific handling. */
4899 void
4900 aarch64_emit_call_insn (rtx pat)
4902 rtx insn = emit_call_insn (pat);
4904 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4905 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4906 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4909 machine_mode
4910 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4912 /* All floating point compares return CCFP if it is an equality
4913 comparison, and CCFPE otherwise. */
4914 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4916 switch (code)
4918 case EQ:
4919 case NE:
4920 case UNORDERED:
4921 case ORDERED:
4922 case UNLT:
4923 case UNLE:
4924 case UNGT:
4925 case UNGE:
4926 case UNEQ:
4927 case LTGT:
4928 return CCFPmode;
4930 case LT:
4931 case LE:
4932 case GT:
4933 case GE:
4934 return CCFPEmode;
4936 default:
4937 gcc_unreachable ();
4941 /* Equality comparisons of short modes against zero can be performed
4942 using the TST instruction with the appropriate bitmask. */
4943 if (y == const0_rtx && REG_P (x)
4944 && (code == EQ || code == NE)
4945 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4946 return CC_NZmode;
4948 /* Similarly, comparisons of zero_extends from shorter modes can
4949 be performed using an ANDS with an immediate mask. */
4950 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4951 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4952 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4953 && (code == EQ || code == NE))
4954 return CC_NZmode;
4956 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4957 && y == const0_rtx
4958 && (code == EQ || code == NE || code == LT || code == GE)
4959 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4960 || GET_CODE (x) == NEG
4961 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4962 && CONST_INT_P (XEXP (x, 2)))))
4963 return CC_NZmode;
4965 /* A compare with a shifted operand. Because of canonicalization,
4966 the comparison will have to be swapped when we emit the assembly
4967 code. */
4968 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4969 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4970 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4971 || GET_CODE (x) == LSHIFTRT
4972 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4973 return CC_SWPmode;
4975 /* Similarly for a negated operand, but we can only do this for
4976 equalities. */
4977 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4978 && (REG_P (y) || GET_CODE (y) == SUBREG)
4979 && (code == EQ || code == NE)
4980 && GET_CODE (x) == NEG)
4981 return CC_Zmode;
4983 /* A test for unsigned overflow. */
4984 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4985 && code == NE
4986 && GET_CODE (x) == PLUS
4987 && GET_CODE (y) == ZERO_EXTEND)
4988 return CC_Cmode;
4990 /* For everything else, return CCmode. */
4991 return CCmode;
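/* A hypothetical illustration of how the classification above behaves for
   two common shapes; it is never called, and a real build would need a
   caller or an unused attribute.  */
static void
aarch64_select_cc_mode_examples (void)
{
  rtx w0 = gen_rtx_REG (SImode, R0_REGNUM);
  rtx w1 = gen_rtx_REG (SImode, R0_REGNUM + 1);
  /* Comparing the result of an addition against zero can reuse the
     flag-setting form of the add, so CC_NZmode is chosen.  */
  gcc_assert (aarch64_select_cc_mode (NE, gen_rtx_PLUS (SImode, w0, w1),
				      const0_rtx) == CC_NZmode);
  /* A shifted first operand means the operands will be swapped when the
     comparison is output, hence CC_SWPmode.  */
  gcc_assert (aarch64_select_cc_mode (GT,
				      gen_rtx_ASHIFT (SImode, w0, GEN_INT (2)),
				      w1) == CC_SWPmode);
}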
4994 static int
4995 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4998 aarch64_get_condition_code (rtx x)
5000 machine_mode mode = GET_MODE (XEXP (x, 0));
5001 enum rtx_code comp_code = GET_CODE (x);
5003 if (GET_MODE_CLASS (mode) != MODE_CC)
5004 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5005 return aarch64_get_condition_code_1 (mode, comp_code);
5008 static int
5009 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5011 switch (mode)
5013 case E_CCFPmode:
5014 case E_CCFPEmode:
5015 switch (comp_code)
5017 case GE: return AARCH64_GE;
5018 case GT: return AARCH64_GT;
5019 case LE: return AARCH64_LS;
5020 case LT: return AARCH64_MI;
5021 case NE: return AARCH64_NE;
5022 case EQ: return AARCH64_EQ;
5023 case ORDERED: return AARCH64_VC;
5024 case UNORDERED: return AARCH64_VS;
5025 case UNLT: return AARCH64_LT;
5026 case UNLE: return AARCH64_LE;
5027 case UNGT: return AARCH64_HI;
5028 case UNGE: return AARCH64_PL;
5029 default: return -1;
5031 break;
5033 case E_CCmode:
5034 switch (comp_code)
5036 case NE: return AARCH64_NE;
5037 case EQ: return AARCH64_EQ;
5038 case GE: return AARCH64_GE;
5039 case GT: return AARCH64_GT;
5040 case LE: return AARCH64_LE;
5041 case LT: return AARCH64_LT;
5042 case GEU: return AARCH64_CS;
5043 case GTU: return AARCH64_HI;
5044 case LEU: return AARCH64_LS;
5045 case LTU: return AARCH64_CC;
5046 default: return -1;
5048 break;
5050 case E_CC_SWPmode:
5051 switch (comp_code)
5053 case NE: return AARCH64_NE;
5054 case EQ: return AARCH64_EQ;
5055 case GE: return AARCH64_LE;
5056 case GT: return AARCH64_LT;
5057 case LE: return AARCH64_GE;
5058 case LT: return AARCH64_GT;
5059 case GEU: return AARCH64_LS;
5060 case GTU: return AARCH64_CC;
5061 case LEU: return AARCH64_CS;
5062 case LTU: return AARCH64_HI;
5063 default: return -1;
5065 break;
5067 case E_CC_NZmode:
5068 switch (comp_code)
5070 case NE: return AARCH64_NE;
5071 case EQ: return AARCH64_EQ;
5072 case GE: return AARCH64_PL;
5073 case LT: return AARCH64_MI;
5074 default: return -1;
5076 break;
5078 case E_CC_Zmode:
5079 switch (comp_code)
5081 case NE: return AARCH64_NE;
5082 case EQ: return AARCH64_EQ;
5083 default: return -1;
5085 break;
5087 case E_CC_Cmode:
5088 switch (comp_code)
5090 case NE: return AARCH64_CS;
5091 case EQ: return AARCH64_CC;
5092 default: return -1;
5094 break;
5096 default:
5097 return -1;
5100 return -1;
5103 bool
5104 aarch64_const_vec_all_same_in_range_p (rtx x,
5105 HOST_WIDE_INT minval,
5106 HOST_WIDE_INT maxval)
5108 HOST_WIDE_INT firstval;
5109 int count, i;
5111 if (GET_CODE (x) != CONST_VECTOR
5112 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5113 return false;
5115 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5116 if (firstval < minval || firstval > maxval)
5117 return false;
5119 count = CONST_VECTOR_NUNITS (x);
5120 for (i = 1; i < count; i++)
5121 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5122 return false;
5124 return true;
5127 bool
5128 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5130 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5134 /* N Z C V. */
5135 #define AARCH64_CC_V 1
5136 #define AARCH64_CC_C (1 << 1)
5137 #define AARCH64_CC_Z (1 << 2)
5138 #define AARCH64_CC_N (1 << 3)
5140 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5141 static const int aarch64_nzcv_codes[] =
5143 0, /* EQ, Z == 1. */
5144 AARCH64_CC_Z, /* NE, Z == 0. */
5145 0, /* CS, C == 1. */
5146 AARCH64_CC_C, /* CC, C == 0. */
5147 0, /* MI, N == 1. */
5148 AARCH64_CC_N, /* PL, N == 0. */
5149 0, /* VS, V == 1. */
5150 AARCH64_CC_V, /* VC, V == 0. */
5151 0, /* HI, C == 1 && Z == 0. */
5152 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5153 AARCH64_CC_V, /* GE, N == V. */
5154 0, /* LT, N != V. */
5155 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5156 0, /* LE, !(Z == 0 && N == V). */
5157 0, /* AL, Any. */
5158 0 /* NV, Any. */
5161 /* Print operand X to file F in a target specific manner according to CODE.
5162 The acceptable formatting commands given by CODE are:
5163 'c': An integer or symbol address without a preceding #
5164 sign.
5165 'e': Print the sign/zero-extend size as a character 8->b,
5166 16->h, 32->w.
5167 'p': Prints N such that 2^N == X (X must be power of 2 and
5168 const int).
5169 'P': Print the number of non-zero bits in X (a const_int).
5170 'H': Print the higher numbered register of a pair (TImode)
5171 of regs.
5172 'm': Print a condition (eq, ne, etc).
5173 'M': Same as 'm', but invert condition.
5174 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5175 'S/T/U/V': Print a FP/SIMD register name for a register list.
5176 The register printed is the FP/SIMD register name
5177 of X + 0/1/2/3 for S/T/U/V.
5178 'R': Print a scalar FP/SIMD register name + 1.
5179 'X': Print bottom 16 bits of integer constant in hex.
5180 'w/x': Print a general register name or the zero register
5181 (32-bit or 64-bit).
5182 '0': Print a normal operand; if it's a general register,
5183 then we assume DImode.
5184 'k': Print NZCV for conditional compare instructions.
5185 'A': Output address constant representing the first
5186 argument of X, specifying a relocation offset
5187 if appropriate.
5188 'L': Output constant address specified by X
5189 with a relocation offset if appropriate.
5190 'G': Prints address of X, specifying a PC relative
5191 relocation mode if appropriate. */
5193 static void
5194 aarch64_print_operand (FILE *f, rtx x, int code)
5196 switch (code)
5198 case 'c':
5199 switch (GET_CODE (x))
5201 case CONST_INT:
5202 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5203 break;
5205 case SYMBOL_REF:
5206 output_addr_const (f, x);
5207 break;
5209 case CONST:
5210 if (GET_CODE (XEXP (x, 0)) == PLUS
5211 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5213 output_addr_const (f, x);
5214 break;
5216 /* Fall through. */
5218 default:
5219 output_operand_lossage ("Unsupported operand for code '%c'", code);
5221 break;
5223 case 'e':
5225 int n;
5227 if (!CONST_INT_P (x)
5228 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5230 output_operand_lossage ("invalid operand for '%%%c'", code);
5231 return;
5234 switch (n)
5236 case 3:
5237 fputc ('b', f);
5238 break;
5239 case 4:
5240 fputc ('h', f);
5241 break;
5242 case 5:
5243 fputc ('w', f);
5244 break;
5245 default:
5246 output_operand_lossage ("invalid operand for '%%%c'", code);
5247 return;
5250 break;
5252 case 'p':
5254 int n;
5256 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5258 output_operand_lossage ("invalid operand for '%%%c'", code);
5259 return;
5262 asm_fprintf (f, "%d", n);
5264 break;
5266 case 'P':
5267 if (!CONST_INT_P (x))
5269 output_operand_lossage ("invalid operand for '%%%c'", code);
5270 return;
5273 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5274 break;
5276 case 'H':
5277 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5279 output_operand_lossage ("invalid operand for '%%%c'", code);
5280 return;
5283 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5284 break;
5286 case 'M':
5287 case 'm':
5289 int cond_code;
5290 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5291 if (x == const_true_rtx)
5293 if (code == 'M')
5294 fputs ("nv", f);
5295 return;
5298 if (!COMPARISON_P (x))
5300 output_operand_lossage ("invalid operand for '%%%c'", code);
5301 return;
5304 cond_code = aarch64_get_condition_code (x);
5305 gcc_assert (cond_code >= 0);
5306 if (code == 'M')
5307 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5308 fputs (aarch64_condition_codes[cond_code], f);
5310 break;
5312 case 'b':
5313 case 'h':
5314 case 's':
5315 case 'd':
5316 case 'q':
5317 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5319 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5320 return;
5322 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5323 break;
5325 case 'S':
5326 case 'T':
5327 case 'U':
5328 case 'V':
5329 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5331 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5332 return;
5334 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5335 break;
5337 case 'R':
5338 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5340 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5341 return;
5343 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5344 break;
5346 case 'X':
5347 if (!CONST_INT_P (x))
5349 output_operand_lossage ("invalid operand for '%%%c'", code);
5350 return;
5352 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5353 break;
5355 case 'w':
5356 case 'x':
5357 if (x == const0_rtx
5358 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5360 asm_fprintf (f, "%czr", code);
5361 break;
5364 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5366 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5367 break;
5370 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5372 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5373 break;
5376 /* Fall through */
5378 case 0:
5379 if (x == NULL)
5381 output_operand_lossage ("missing operand");
5382 return;
5385 switch (GET_CODE (x))
5387 case REG:
5388 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5389 break;
5391 case MEM:
5392 output_address (GET_MODE (x), XEXP (x, 0));
5393 /* Check all memory references are Pmode - even with ILP32. */
5394 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5395 break;
5397 case CONST:
5398 case LABEL_REF:
5399 case SYMBOL_REF:
5400 output_addr_const (asm_out_file, x);
5401 break;
5403 case CONST_INT:
5404 asm_fprintf (f, "%wd", INTVAL (x));
5405 break;
5407 case CONST_VECTOR:
5408 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5410 gcc_assert (
5411 aarch64_const_vec_all_same_in_range_p (x,
5412 HOST_WIDE_INT_MIN,
5413 HOST_WIDE_INT_MAX));
5414 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5416 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5418 fputc ('0', f);
5420 else
5421 gcc_unreachable ();
5422 break;
5424 case CONST_DOUBLE:
5425 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5426 be getting CONST_DOUBLEs holding integers. */
5427 gcc_assert (GET_MODE (x) != VOIDmode);
5428 if (aarch64_float_const_zero_rtx_p (x))
5430 fputc ('0', f);
5431 break;
5433 else if (aarch64_float_const_representable_p (x))
5435 #define buf_size 20
5436 char float_buf[buf_size] = {'\0'};
5437 real_to_decimal_for_mode (float_buf,
5438 CONST_DOUBLE_REAL_VALUE (x),
5439 buf_size, buf_size,
5440 1, GET_MODE (x));
5441 asm_fprintf (asm_out_file, "%s", float_buf);
5442 break;
5443 #undef buf_size
5445 output_operand_lossage ("invalid constant");
5446 return;
5447 default:
5448 output_operand_lossage ("invalid operand");
5449 return;
5451 break;
5453 case 'A':
5454 if (GET_CODE (x) == HIGH)
5455 x = XEXP (x, 0);
5457 switch (aarch64_classify_symbolic_expression (x))
5459 case SYMBOL_SMALL_GOT_4G:
5460 asm_fprintf (asm_out_file, ":got:");
5461 break;
5463 case SYMBOL_SMALL_TLSGD:
5464 asm_fprintf (asm_out_file, ":tlsgd:");
5465 break;
5467 case SYMBOL_SMALL_TLSDESC:
5468 asm_fprintf (asm_out_file, ":tlsdesc:");
5469 break;
5471 case SYMBOL_SMALL_TLSIE:
5472 asm_fprintf (asm_out_file, ":gottprel:");
5473 break;
5475 case SYMBOL_TLSLE24:
5476 asm_fprintf (asm_out_file, ":tprel:");
5477 break;
5479 case SYMBOL_TINY_GOT:
5480 gcc_unreachable ();
5481 break;
5483 default:
5484 break;
5486 output_addr_const (asm_out_file, x);
5487 break;
5489 case 'L':
5490 switch (aarch64_classify_symbolic_expression (x))
5492 case SYMBOL_SMALL_GOT_4G:
5493 asm_fprintf (asm_out_file, ":lo12:");
5494 break;
5496 case SYMBOL_SMALL_TLSGD:
5497 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5498 break;
5500 case SYMBOL_SMALL_TLSDESC:
5501 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5502 break;
5504 case SYMBOL_SMALL_TLSIE:
5505 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5506 break;
5508 case SYMBOL_TLSLE12:
5509 asm_fprintf (asm_out_file, ":tprel_lo12:");
5510 break;
5512 case SYMBOL_TLSLE24:
5513 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5514 break;
5516 case SYMBOL_TINY_GOT:
5517 asm_fprintf (asm_out_file, ":got:");
5518 break;
5520 case SYMBOL_TINY_TLSIE:
5521 asm_fprintf (asm_out_file, ":gottprel:");
5522 break;
5524 default:
5525 break;
5527 output_addr_const (asm_out_file, x);
5528 break;
5530 case 'G':
5531 switch (aarch64_classify_symbolic_expression (x))
5533 case SYMBOL_TLSLE24:
5534 asm_fprintf (asm_out_file, ":tprel_hi12:");
5535 break;
5536 default:
5537 break;
5539 output_addr_const (asm_out_file, x);
5540 break;
5542 case 'k':
5544 HOST_WIDE_INT cond_code;
5546 if (!CONST_INT_P (x))
5548 output_operand_lossage ("invalid operand for '%%%c'", code);
5549 return;
5552 cond_code = INTVAL (x);
5553 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5554 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5556 break;
5558 default:
5559 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5560 return;
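/* A hypothetical example of the modifiers above as used in output
   templates: for an operand (reg:SI 3), "%w0" prints "w3" and "%x0"
   prints "x3"; const0_rtx prints as "wzr" or "xzr"; the stack pointer
   prints as "wsp" or "sp"; and "%e" applied to the constant 8 prints the
   extension width suffix "b".  */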
5564 static void
5565 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5567 struct aarch64_address_info addr;
5569 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5570 switch (addr.type)
5572 case ADDRESS_REG_IMM:
5573 if (addr.offset == const0_rtx)
5574 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5575 else
5576 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5577 INTVAL (addr.offset));
5578 return;
5580 case ADDRESS_REG_REG:
5581 if (addr.shift == 0)
5582 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5583 reg_names [REGNO (addr.offset)]);
5584 else
5585 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5586 reg_names [REGNO (addr.offset)], addr.shift);
5587 return;
5589 case ADDRESS_REG_UXTW:
5590 if (addr.shift == 0)
5591 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5592 REGNO (addr.offset) - R0_REGNUM);
5593 else
5594 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5595 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5596 return;
5598 case ADDRESS_REG_SXTW:
5599 if (addr.shift == 0)
5600 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5601 REGNO (addr.offset) - R0_REGNUM);
5602 else
5603 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5604 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5605 return;
5607 case ADDRESS_REG_WB:
5608 switch (GET_CODE (x))
5610 case PRE_INC:
5611 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5612 GET_MODE_SIZE (mode));
5613 return;
5614 case POST_INC:
5615 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5616 GET_MODE_SIZE (mode));
5617 return;
5618 case PRE_DEC:
5619 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5620 GET_MODE_SIZE (mode));
5621 return;
5622 case POST_DEC:
5623 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5624 GET_MODE_SIZE (mode));
5625 return;
5626 case PRE_MODIFY:
5627 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5628 INTVAL (addr.offset));
5629 return;
5630 case POST_MODIFY:
5631 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5632 INTVAL (addr.offset));
5633 return;
5634 default:
5635 break;
5637 break;
5639 case ADDRESS_LO_SUM:
5640 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5641 output_addr_const (f, addr.offset);
5642 asm_fprintf (f, "]");
5643 return;
5645 case ADDRESS_SYMBOLIC:
5646 break;
5649 output_addr_const (f, x);
5652 bool
5653 aarch64_label_mentioned_p (rtx x)
5655 const char *fmt;
5656 int i;
5658 if (GET_CODE (x) == LABEL_REF)
5659 return true;
5661 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5662 referencing instruction, but they are constant offsets, not
5663 symbols. */
5664 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5665 return false;
5667 fmt = GET_RTX_FORMAT (GET_CODE (x));
5668 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5670 if (fmt[i] == 'E')
5672 int j;
5674 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5675 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5676 return 1;
5678 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5679 return 1;
5682 return 0;
5685 /* Implement REGNO_REG_CLASS. */
5687 enum reg_class
5688 aarch64_regno_regclass (unsigned regno)
5690 if (GP_REGNUM_P (regno))
5691 return GENERAL_REGS;
5693 if (regno == SP_REGNUM)
5694 return STACK_REG;
5696 if (regno == FRAME_POINTER_REGNUM
5697 || regno == ARG_POINTER_REGNUM)
5698 return POINTER_REGS;
5700 if (FP_REGNUM_P (regno))
5701 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5703 return NO_REGS;
5706 static rtx
5707 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5709 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5710 where mask is selected by alignment and size of the offset.
5711 We try to pick as large a range for the offset as possible to
5712 maximize the chance of a CSE. However, for aligned addresses
5713 we limit the range to 4k so that structures with different sized
5714 elements are likely to use the same base. We need to be careful
5715 not to split a CONST for some forms of address expression, otherwise
5716 it will generate sub-optimal code. */
5718 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5720 rtx base = XEXP (x, 0);
5721 rtx offset_rtx = XEXP (x, 1);
5722 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5724 if (GET_CODE (base) == PLUS)
5726 rtx op0 = XEXP (base, 0);
5727 rtx op1 = XEXP (base, 1);
5729 /* Force any scaling into a temp for CSE. */
5730 op0 = force_reg (Pmode, op0);
5731 op1 = force_reg (Pmode, op1);
5733 /* Let the pointer register be in op0. */
5734 if (REG_POINTER (op1))
5735 std::swap (op0, op1);
5737 /* If the pointer is virtual or frame related, then we know that
5738 virtual register instantiation or register elimination is going
5739 to apply a second constant. We want the two constants folded
5740 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5741 if (virt_or_elim_regno_p (REGNO (op0)))
5743 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5744 NULL_RTX, true, OPTAB_DIRECT);
5745 return gen_rtx_PLUS (Pmode, base, op1);
5748 /* Otherwise, in order to encourage CSE (and thence loop strength
5749 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5750 base = expand_binop (Pmode, add_optab, op0, op1,
5751 NULL_RTX, true, OPTAB_DIRECT);
5752 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5755 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5756 HOST_WIDE_INT base_offset;
5757 if (GET_MODE_SIZE (mode) > 16)
5758 base_offset = (offset + 0x400) & ~0x7f0;
5759 /* For offsets that aren't a multiple of the access size, the limit is
5760 -256...255. */
5761 else if (offset & (GET_MODE_SIZE (mode) - 1))
5763 base_offset = (offset + 0x100) & ~0x1ff;
5765 /* BLKmode typically uses LDP of X-registers. */
5766 if (mode == BLKmode)
5767 base_offset = (offset + 512) & ~0x3ff;
5769 /* Small negative offsets are supported. */
5770 else if (IN_RANGE (offset, -256, 0))
5771 base_offset = 0;
5772 else if (mode == TImode || mode == TFmode)
5773 base_offset = (offset + 0x100) & ~0x1ff;
5774 /* Use 12-bit offset by access size. */
5775 else
5776 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5778 if (base_offset != 0)
5780 base = plus_constant (Pmode, base, base_offset);
5781 base = force_operand (base, NULL_RTX);
5782 return plus_constant (Pmode, base, offset - base_offset);
5786 return x;
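/* A worked example (numbers derived from the code above): legitimizing
   BASE + 0x10008 for a DImode access picks base_offset = 0x10000, so the
   address is rewritten as (BASE + 0x10000) + 0x8; the anchor can then be
   CSEd across nearby accesses while the residual 0x8 fits the scaled
   12-bit LDR/STR form.  */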
5789 /* Return the reload icode required for a constant pool in mode. */
5790 static enum insn_code
5791 aarch64_constant_pool_reload_icode (machine_mode mode)
5793 switch (mode)
5795 case E_SFmode:
5796 return CODE_FOR_aarch64_reload_movcpsfdi;
5798 case E_DFmode:
5799 return CODE_FOR_aarch64_reload_movcpdfdi;
5801 case E_TFmode:
5802 return CODE_FOR_aarch64_reload_movcptfdi;
5804 case E_V8QImode:
5805 return CODE_FOR_aarch64_reload_movcpv8qidi;
5807 case E_V16QImode:
5808 return CODE_FOR_aarch64_reload_movcpv16qidi;
5810 case E_V4HImode:
5811 return CODE_FOR_aarch64_reload_movcpv4hidi;
5813 case E_V8HImode:
5814 return CODE_FOR_aarch64_reload_movcpv8hidi;
5816 case E_V2SImode:
5817 return CODE_FOR_aarch64_reload_movcpv2sidi;
5819 case E_V4SImode:
5820 return CODE_FOR_aarch64_reload_movcpv4sidi;
5822 case E_V2DImode:
5823 return CODE_FOR_aarch64_reload_movcpv2didi;
5825 case E_V2DFmode:
5826 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5828 default:
5829 gcc_unreachable ();
5832 gcc_unreachable ();
5834 static reg_class_t
5835 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5836 reg_class_t rclass,
5837 machine_mode mode,
5838 secondary_reload_info *sri)
5841 /* If we have to disable direct literal pool loads and stores because the
5842 function is too big, then we need a scratch register. */
5843 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5844 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5845 || targetm.vector_mode_supported_p (GET_MODE (x)))
5846 && !aarch64_pcrelative_literal_loads)
5848 sri->icode = aarch64_constant_pool_reload_icode (mode);
5849 return NO_REGS;
5852 /* Without the TARGET_SIMD instructions we cannot move a Q register
5853 to a Q register directly. We need a scratch. */
5854 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5855 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5856 && reg_class_subset_p (rclass, FP_REGS))
5858 if (mode == TFmode)
5859 sri->icode = CODE_FOR_aarch64_reload_movtf;
5860 else if (mode == TImode)
5861 sri->icode = CODE_FOR_aarch64_reload_movti;
5862 return NO_REGS;
5865 /* A TFmode or TImode memory access should be handled via an FP register
5866 because AArch64 has richer addressing modes for LDR/STR instructions
5867 than for LDP/STP instructions. */
5868 if (TARGET_FLOAT && rclass == GENERAL_REGS
5869 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5870 return FP_REGS;
5872 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5873 return GENERAL_REGS;
5875 return NO_REGS;
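/* Illustrative note on the !TARGET_SIMD case above: a TImode or TFmode copy
   between two Q registers cannot be done directly without AdvSIMD, so the
   aarch64_reload_movti/movtf patterns selected above are used with a
   general-register scratch (see also the matching comment in
   aarch64_register_move_cost below).  */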
5878 static bool
5879 aarch64_can_eliminate (const int from, const int to)
5881 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5882 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5884 if (frame_pointer_needed)
5886 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5887 return true;
5888 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5889 return false;
5890 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5891 && !cfun->calls_alloca)
5892 return true;
5893 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5894 return true;
5896 return false;
5898 else
5900 /* If we decided that we didn't need a leaf frame pointer but then used
5901 LR in the function, then we'll want a frame pointer after all, so
5902 prevent this elimination to ensure a frame pointer is used. */
5903 if (to == STACK_POINTER_REGNUM
5904 && flag_omit_leaf_frame_pointer
5905 && df_regs_ever_live_p (LR_REGNUM))
5906 return false;
5909 return true;
5912 HOST_WIDE_INT
5913 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5915 aarch64_layout_frame ();
5917 if (to == HARD_FRAME_POINTER_REGNUM)
5919 if (from == ARG_POINTER_REGNUM)
5920 return cfun->machine->frame.hard_fp_offset;
5922 if (from == FRAME_POINTER_REGNUM)
5923 return cfun->machine->frame.hard_fp_offset
5924 - cfun->machine->frame.locals_offset;
5927 if (to == STACK_POINTER_REGNUM)
5929 if (from == FRAME_POINTER_REGNUM)
5930 return cfun->machine->frame.frame_size
5931 - cfun->machine->frame.locals_offset;
5934 return cfun->machine->frame.frame_size;
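/* Summary of the relations encoded by the returns above, purely for
   illustration (FINAL_SP is the stack pointer after the frame is allocated):

     ARG_POINTER   = FINAL_SP + frame_size
     FRAME_POINTER = FINAL_SP + frame_size - locals_offset
     HARD_FP       = FINAL_SP + frame_size - hard_fp_offset

   so, e.g., eliminating ARG_POINTER into HARD_FRAME_POINTER uses an offset
   of hard_fp_offset.  */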
5937 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5938 previous frame. */
5941 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5943 if (count != 0)
5944 return const0_rtx;
5945 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5949 static void
5950 aarch64_asm_trampoline_template (FILE *f)
5952 if (TARGET_ILP32)
5954 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5955 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5957 else
5959 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5960 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5962 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5963 assemble_aligned_integer (4, const0_rtx);
5964 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5965 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
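/* For LP64 the template above amounts to, schematically:

	ldr	<ip1>, .+16		// load the target address
	ldr	<chain>, .+20		// load the static chain value
	br	<ip1>
	.word	0			// pad the code out to 16 bytes
	.xword	0			// patched with the function address
	.xword	0			// patched with the static chain

   where the last two words are filled in by aarch64_trampoline_init below;
   the register names come from reg_names[] and are elided here.  */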
5968 static void
5969 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5971 rtx fnaddr, mem, a_tramp;
5972 const int tramp_code_sz = 16;
5974 /* We don't need to copy the trailing D-words; we fill those in below. */
5975 emit_block_move (m_tramp, assemble_trampoline_template (),
5976 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5977 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5978 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5979 if (GET_MODE (fnaddr) != ptr_mode)
5980 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5981 emit_move_insn (mem, fnaddr);
5983 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5984 emit_move_insn (mem, chain_value);
5986 /* XXX We should really define a "clear_cache" pattern and use
5987 gen_clear_cache(). */
5988 a_tramp = XEXP (m_tramp, 0);
5989 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5990 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
5991 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5992 ptr_mode);
5995 static unsigned char
5996 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5998 switch (regclass)
6000 case CALLER_SAVE_REGS:
6001 case POINTER_REGS:
6002 case GENERAL_REGS:
6003 case ALL_REGS:
6004 case FP_REGS:
6005 case FP_LO_REGS:
6006 return
6007 aarch64_vector_mode_p (mode)
6008 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6009 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6010 case STACK_REG:
6011 return 1;
6013 case NO_REGS:
6014 return 0;
6016 default:
6017 break;
6019 gcc_unreachable ();
6022 static reg_class_t
6023 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6025 if (regclass == POINTER_REGS)
6026 return GENERAL_REGS;
6028 if (regclass == STACK_REG)
6030 if (REG_P(x)
6031 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6032 return regclass;
6034 return NO_REGS;
6037 /* Register elimination can result in a request for
6038 SP+constant->FP_REGS. We cannot support such operations, which
6039 use SP as the source and an FP_REG as the destination, so reject
6040 them outright here. */
6041 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6043 rtx lhs = XEXP (x, 0);
6045 /* Look through a possible SUBREG introduced by ILP32. */
6046 if (GET_CODE (lhs) == SUBREG)
6047 lhs = SUBREG_REG (lhs);
6049 gcc_assert (REG_P (lhs));
6050 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6051 POINTER_REGS));
6052 return NO_REGS;
6055 return regclass;
6058 void
6059 aarch64_asm_output_labelref (FILE* f, const char *name)
6061 asm_fprintf (f, "%U%s", name);
6064 static void
6065 aarch64_elf_asm_constructor (rtx symbol, int priority)
6067 if (priority == DEFAULT_INIT_PRIORITY)
6068 default_ctor_section_asm_out_constructor (symbol, priority);
6069 else
6071 section *s;
6072 /* Although priority is known to be in the range [0, 65535], so 18 bytes
6073 would be enough, the compiler might not know that. To avoid a
6074 -Wformat-truncation false positive, use a larger size. */
6075 char buf[23];
6076 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6077 s = get_section (buf, SECTION_WRITE, NULL);
6078 switch_to_section (s);
6079 assemble_align (POINTER_SIZE);
6080 assemble_aligned_integer (POINTER_BYTES, symbol);
6084 static void
6085 aarch64_elf_asm_destructor (rtx symbol, int priority)
6087 if (priority == DEFAULT_INIT_PRIORITY)
6088 default_dtor_section_asm_out_destructor (symbol, priority);
6089 else
6091 section *s;
6092 /* Although priority is known to be in the range [0, 65535], so 18 bytes
6093 would be enough, the compiler might not know that. To avoid a
6094 -Wformat-truncation false positive, use a larger size. */
6095 char buf[23];
6096 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6097 s = get_section (buf, SECTION_WRITE, NULL);
6098 switch_to_section (s);
6099 assemble_align (POINTER_SIZE);
6100 assemble_aligned_integer (POINTER_BYTES, symbol);
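/* For example, a constructor or destructor registered with priority 101
   ends up in a section named ".init_array.00101" or ".fini_array.00101"
   respectively, per the "%.5u" formats above.  */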
6104 const char*
6105 aarch64_output_casesi (rtx *operands)
6107 char buf[100];
6108 char label[100];
6109 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6110 int index;
6111 static const char *const patterns[4][2] =
6114 "ldrb\t%w3, [%0,%w1,uxtw]",
6115 "add\t%3, %4, %w3, sxtb #2"
6118 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6119 "add\t%3, %4, %w3, sxth #2"
6122 "ldr\t%w3, [%0,%w1,uxtw #2]",
6123 "add\t%3, %4, %w3, sxtw #2"
6125 /* We assume that DImode is only generated when not optimizing and
6126 that we don't really need 64-bit address offsets. That would
6127 imply an object file with 8GB of code in a single function! */
6129 "ldr\t%w3, [%0,%w1,uxtw #2]",
6130 "add\t%3, %4, %w3, sxtw #2"
6134 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6136 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6138 gcc_assert (index >= 0 && index <= 3);
6140 /* Need to implement table size reduction, by changing the code below. */
6141 output_asm_insn (patterns[index][0], operands);
6142 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6143 snprintf (buf, sizeof (buf),
6144 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6145 output_asm_insn (buf, operands);
6146 output_asm_insn (patterns[index][1], operands);
6147 output_asm_insn ("br\t%3", operands);
6148 assemble_label (asm_out_file, label);
6149 return "";
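/* For a byte-wide dispatch table (index 0 above) the emitted sequence is
   roughly:

	ldrb	w3, [x0, w1, uxtw]	// load the table entry
	adr	x4, .Lrtx<N>		// address of the label emitted below
	add	x3, x4, w3, sxtb #2	// scale the entry to a byte offset
	br	x3
   .Lrtx<N>:
*/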
6153 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6154 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6155 operator. */
6158 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6160 if (shift >= 0 && shift <= 3)
6162 int size;
6163 for (size = 8; size <= 32; size *= 2)
6165 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6166 if (mask == bits << shift)
6167 return size;
6170 return 0;
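/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2, i.e. the operand behaves like a UXTB shifted left
   by 2.  */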
6173 /* Constant pools are per-function only when PC-relative
6174 literal loads are enabled or we are using the large memory
6175 model. */
6177 static inline bool
6178 aarch64_can_use_per_function_literal_pools_p (void)
6180 return (aarch64_pcrelative_literal_loads
6181 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6184 static bool
6185 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6187 /* FIXME: In an ideal world this would work similarly
6188 to the logic in aarch64_select_rtx_section, but that
6189 breaks bootstrap in gccgo. For now we work around
6190 this by returning false here. */
6191 return false;
6194 /* Select appropriate section for constants depending
6195 on where we place literal pools. */
6197 static section *
6198 aarch64_select_rtx_section (machine_mode mode,
6199 rtx x,
6200 unsigned HOST_WIDE_INT align)
6202 if (aarch64_can_use_per_function_literal_pools_p ())
6203 return function_section (current_function_decl);
6205 return default_elf_select_rtx_section (mode, x, align);
6208 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6209 void
6210 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6211 HOST_WIDE_INT offset)
6213 /* When using per-function literal pools, we must ensure that any code
6214 section is aligned to the minimal instruction length, lest we get
6215 errors from the assembler about "unaligned instructions". */
6216 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6217 ASM_OUTPUT_ALIGN (f, 2);
6220 /* Costs. */
6222 /* Helper function for rtx cost calculation. Strip a shift expression
6223 from X. Returns the inner operand if successful, or the original
6224 expression on failure. */
6225 static rtx
6226 aarch64_strip_shift (rtx x)
6228 rtx op = x;
6230 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6231 we can convert both to ROR during final output. */
6232 if ((GET_CODE (op) == ASHIFT
6233 || GET_CODE (op) == ASHIFTRT
6234 || GET_CODE (op) == LSHIFTRT
6235 || GET_CODE (op) == ROTATERT
6236 || GET_CODE (op) == ROTATE)
6237 && CONST_INT_P (XEXP (op, 1)))
6238 return XEXP (op, 0);
6240 if (GET_CODE (op) == MULT
6241 && CONST_INT_P (XEXP (op, 1))
6242 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6243 return XEXP (op, 0);
6245 return x;
6248 /* Helper function for rtx cost calculation. Strip an extend
6249 expression from X. Returns the inner operand if successful, or the
6250 original expression on failure. We deal with a number of possible
6251 canonicalization variations here. If STRIP_SHIFT is true, then
6252 we can strip off a shift also. */
6253 static rtx
6254 aarch64_strip_extend (rtx x, bool strip_shift)
6256 rtx op = x;
6258 /* Zero and sign extraction of a widened value. */
6259 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6260 && XEXP (op, 2) == const0_rtx
6261 && GET_CODE (XEXP (op, 0)) == MULT
6262 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6263 XEXP (op, 1)))
6264 return XEXP (XEXP (op, 0), 0);
6266 /* It can also be represented (for zero-extend) as an AND with an
6267 immediate. */
6268 if (GET_CODE (op) == AND
6269 && GET_CODE (XEXP (op, 0)) == MULT
6270 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6271 && CONST_INT_P (XEXP (op, 1))
6272 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6273 INTVAL (XEXP (op, 1))) != 0)
6274 return XEXP (XEXP (op, 0), 0);
6276 /* Now handle extended register, as this may also have an optional
6277 left shift by 1..4. */
6278 if (strip_shift
6279 && GET_CODE (op) == ASHIFT
6280 && CONST_INT_P (XEXP (op, 1))
6281 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6282 op = XEXP (op, 0);
6284 if (GET_CODE (op) == ZERO_EXTEND
6285 || GET_CODE (op) == SIGN_EXTEND)
6286 op = XEXP (op, 0);
6288 if (op != x)
6289 return op;
6291 return x;
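/* For instance, given (ashift (sign_extend (reg x)) (const_int 2)),
   aarch64_strip_extend with STRIP_SHIFT true returns the inner (reg x),
   since both the shift-by-1..4 and the extend can be folded into an
   extended-register operand.  Likewise aarch64_strip_shift above turns
   (mult (reg x) (const_int 8)) back into (reg x).  */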
6294 /* Return true iff CODE is a shift supported in combination
6295 with arithmetic instructions. */
6297 static bool
6298 aarch64_shift_p (enum rtx_code code)
6300 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6304 /* Return true iff X is a cheap shift without a sign extend. */
6306 static bool
6307 aarch64_cheap_mult_shift_p (rtx x)
6309 rtx op0, op1;
6311 op0 = XEXP (x, 0);
6312 op1 = XEXP (x, 1);
6314 if (!(aarch64_tune_params.extra_tuning_flags
6315 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6316 return false;
6318 if (GET_CODE (op0) == SIGN_EXTEND)
6319 return false;
6321 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6322 && UINTVAL (op1) <= 4)
6323 return true;
6325 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6326 return false;
6328 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6330 if (l2 > 0 && l2 <= 4)
6331 return true;
6333 return false;
6336 /* Helper function for rtx cost calculation. Calculate the cost of
6337 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6338 Return the calculated cost of the expression, recursing manually in to
6339 operands where needed. */
6341 static int
6342 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6344 rtx op0, op1;
6345 const struct cpu_cost_table *extra_cost
6346 = aarch64_tune_params.insn_extra_cost;
6347 int cost = 0;
6348 bool compound_p = (outer == PLUS || outer == MINUS);
6349 machine_mode mode = GET_MODE (x);
6351 gcc_checking_assert (code == MULT);
6353 op0 = XEXP (x, 0);
6354 op1 = XEXP (x, 1);
6356 if (VECTOR_MODE_P (mode))
6357 mode = GET_MODE_INNER (mode);
6359 /* Integer multiply/fma. */
6360 if (GET_MODE_CLASS (mode) == MODE_INT)
6362 /* The multiply will be canonicalized as a shift, so cost it as such. */
6363 if (aarch64_shift_p (GET_CODE (x))
6364 || (CONST_INT_P (op1)
6365 && exact_log2 (INTVAL (op1)) > 0))
6367 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6368 || GET_CODE (op0) == SIGN_EXTEND;
6369 if (speed)
6371 if (compound_p)
6373 /* If the shift is considered cheap,
6374 then don't add any cost. */
6375 if (aarch64_cheap_mult_shift_p (x))
6377 else if (REG_P (op1))
6378 /* ARITH + shift-by-register. */
6379 cost += extra_cost->alu.arith_shift_reg;
6380 else if (is_extend)
6381 /* ARITH + extended register. We don't have a cost field
6382 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6383 cost += extra_cost->alu.extend_arith;
6384 else
6385 /* ARITH + shift-by-immediate. */
6386 cost += extra_cost->alu.arith_shift;
6388 else
6389 /* LSL (immediate). */
6390 cost += extra_cost->alu.shift;
6393 /* Strip extends as we will have costed them in the case above. */
6394 if (is_extend)
6395 op0 = aarch64_strip_extend (op0, true);
6397 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6399 return cost;
6402 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6403 compound operation, and let the cases below handle it. After all, MNEG
6404 is a special-case alias of MSUB. */
6405 if (GET_CODE (op0) == NEG)
6407 op0 = XEXP (op0, 0);
6408 compound_p = true;
6411 /* Integer multiplies or FMAs have zero/sign extending variants. */
6412 if ((GET_CODE (op0) == ZERO_EXTEND
6413 && GET_CODE (op1) == ZERO_EXTEND)
6414 || (GET_CODE (op0) == SIGN_EXTEND
6415 && GET_CODE (op1) == SIGN_EXTEND))
6417 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6418 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6420 if (speed)
6422 if (compound_p)
6423 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6424 cost += extra_cost->mult[0].extend_add;
6425 else
6426 /* MUL/SMULL/UMULL. */
6427 cost += extra_cost->mult[0].extend;
6430 return cost;
6433 /* This is either an integer multiply or a MADD. In both cases
6434 we want to recurse and cost the operands. */
6435 cost += rtx_cost (op0, mode, MULT, 0, speed);
6436 cost += rtx_cost (op1, mode, MULT, 1, speed);
6438 if (speed)
6440 if (compound_p)
6441 /* MADD/MSUB. */
6442 cost += extra_cost->mult[mode == DImode].add;
6443 else
6444 /* MUL. */
6445 cost += extra_cost->mult[mode == DImode].simple;
6448 return cost;
6450 else
6452 if (speed)
6454 /* Floating-point FMA/FMUL can also support negations of the
6455 operands, unless the rounding mode is upward or downward, in
6456 which case FNMUL is different from FMUL with operand negation. */
6457 bool neg0 = GET_CODE (op0) == NEG;
6458 bool neg1 = GET_CODE (op1) == NEG;
6459 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6461 if (neg0)
6462 op0 = XEXP (op0, 0);
6463 if (neg1)
6464 op1 = XEXP (op1, 0);
6467 if (compound_p)
6468 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6469 cost += extra_cost->fp[mode == DFmode].fma;
6470 else
6471 /* FMUL/FNMUL. */
6472 cost += extra_cost->fp[mode == DFmode].mult;
6475 cost += rtx_cost (op0, mode, MULT, 0, speed);
6476 cost += rtx_cost (op1, mode, MULT, 1, speed);
6477 return cost;
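/* As a concrete example of the shift handling above (illustrative only),
   costing (plus (mult (reg x2) (const_int 4)) (reg x1)) with SPEED set
   charges alu.arith_shift (or nothing on CHEAP_SHIFT_EXTEND tunings),
   roughly matching a single "add x0, x1, x2, lsl #2".  */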
6481 static int
6482 aarch64_address_cost (rtx x,
6483 machine_mode mode,
6484 addr_space_t as ATTRIBUTE_UNUSED,
6485 bool speed)
6487 enum rtx_code c = GET_CODE (x);
6488 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6489 struct aarch64_address_info info;
6490 int cost = 0;
6491 info.shift = 0;
6493 if (!aarch64_classify_address (&info, x, mode, c, false))
6495 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6497 /* This is a CONST or SYMBOL ref which will be split
6498 in a different way depending on the code model in use.
6499 Cost it through the generic infrastructure. */
6500 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6501 /* Divide through by the cost of one instruction to
6502 bring it to the same units as the address costs. */
6503 cost_symbol_ref /= COSTS_N_INSNS (1);
6504 /* The cost is then the cost of preparing the address,
6505 followed by an immediate (possibly 0) offset. */
6506 return cost_symbol_ref + addr_cost->imm_offset;
6508 else
6510 /* This is most likely a jump table from a case
6511 statement. */
6512 return addr_cost->register_offset;
6516 switch (info.type)
6518 case ADDRESS_LO_SUM:
6519 case ADDRESS_SYMBOLIC:
6520 case ADDRESS_REG_IMM:
6521 cost += addr_cost->imm_offset;
6522 break;
6524 case ADDRESS_REG_WB:
6525 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6526 cost += addr_cost->pre_modify;
6527 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6528 cost += addr_cost->post_modify;
6529 else
6530 gcc_unreachable ();
6532 break;
6534 case ADDRESS_REG_REG:
6535 cost += addr_cost->register_offset;
6536 break;
6538 case ADDRESS_REG_SXTW:
6539 cost += addr_cost->register_sextend;
6540 break;
6542 case ADDRESS_REG_UXTW:
6543 cost += addr_cost->register_zextend;
6544 break;
6546 default:
6547 gcc_unreachable ();
6551 if (info.shift > 0)
6553 /* For the sake of calculating the cost of the shifted register
6554 component, we can treat same sized modes in the same way. */
6555 switch (GET_MODE_BITSIZE (mode))
6557 case 16:
6558 cost += addr_cost->addr_scale_costs.hi;
6559 break;
6561 case 32:
6562 cost += addr_cost->addr_scale_costs.si;
6563 break;
6565 case 64:
6566 cost += addr_cost->addr_scale_costs.di;
6567 break;
6569 /* We can't tell, or this is a 128-bit vector. */
6570 default:
6571 cost += addr_cost->addr_scale_costs.ti;
6572 break;
6576 return cost;
6579 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6580 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6581 to be taken. */
6584 aarch64_branch_cost (bool speed_p, bool predictable_p)
6586 /* When optimizing for speed, use the cost of unpredictable branches. */
6587 const struct cpu_branch_cost *branch_costs =
6588 aarch64_tune_params.branch_costs;
6590 if (!speed_p || predictable_p)
6591 return branch_costs->predictable;
6592 else
6593 return branch_costs->unpredictable;
6596 /* Return true if the RTX X in mode MODE is a zero or sign extract
6597 usable in an ADD or SUB (extended register) instruction. */
6598 static bool
6599 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6601 /* Catch add with a sign extract.
6602 This is add_<optab><mode>_multp2. */
6603 if (GET_CODE (x) == SIGN_EXTRACT
6604 || GET_CODE (x) == ZERO_EXTRACT)
6606 rtx op0 = XEXP (x, 0);
6607 rtx op1 = XEXP (x, 1);
6608 rtx op2 = XEXP (x, 2);
6610 if (GET_CODE (op0) == MULT
6611 && CONST_INT_P (op1)
6612 && op2 == const0_rtx
6613 && CONST_INT_P (XEXP (op0, 1))
6614 && aarch64_is_extend_from_extract (mode,
6615 XEXP (op0, 1),
6616 op1))
6618 return true;
6621 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6622 No shift. */
6623 else if (GET_CODE (x) == SIGN_EXTEND
6624 || GET_CODE (x) == ZERO_EXTEND)
6625 return REG_P (XEXP (x, 0));
6627 return false;
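/* The "simple case" above covers operands such as (sign_extend (reg w1))
   inside a PLUS or MINUS, i.e. roughly "add x0, x0, w1, sxtw"; the callers
   then charge extra_cost->alu.extend_arith for it.  */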
6630 static bool
6631 aarch64_frint_unspec_p (unsigned int u)
6633 switch (u)
6635 case UNSPEC_FRINTZ:
6636 case UNSPEC_FRINTP:
6637 case UNSPEC_FRINTM:
6638 case UNSPEC_FRINTA:
6639 case UNSPEC_FRINTN:
6640 case UNSPEC_FRINTX:
6641 case UNSPEC_FRINTI:
6642 return true;
6644 default:
6645 return false;
6649 /* Return true iff X is an rtx that will match an extr instruction
6650 i.e. as described in the *extr<mode>5_insn family of patterns.
6651 OP0 and OP1 will be set to the operands of the shifts involved
6652 on success and will be NULL_RTX otherwise. */
6654 static bool
6655 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6657 rtx op0, op1;
6658 machine_mode mode = GET_MODE (x);
6660 *res_op0 = NULL_RTX;
6661 *res_op1 = NULL_RTX;
6663 if (GET_CODE (x) != IOR)
6664 return false;
6666 op0 = XEXP (x, 0);
6667 op1 = XEXP (x, 1);
6669 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6670 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6672 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6673 if (GET_CODE (op1) == ASHIFT)
6674 std::swap (op0, op1);
6676 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6677 return false;
6679 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6680 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6682 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6683 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6685 *res_op0 = XEXP (op0, 0);
6686 *res_op1 = XEXP (op1, 0);
6687 return true;
6691 return false;
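/* Illustration: in DImode, (ior (ashift a (const_int 16))
   (lshiftrt b (const_int 48))) satisfies the 16 + 48 == 64 check above and
   so matches, corresponding roughly to "extr xd, xa, xb, #48".  */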
6694 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6695 storing it in *COST. Result is true if the total cost of the operation
6696 has now been calculated. */
6697 static bool
6698 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6700 rtx inner;
6701 rtx comparator;
6702 enum rtx_code cmpcode;
6704 if (COMPARISON_P (op0))
6706 inner = XEXP (op0, 0);
6707 comparator = XEXP (op0, 1);
6708 cmpcode = GET_CODE (op0);
6710 else
6712 inner = op0;
6713 comparator = const0_rtx;
6714 cmpcode = NE;
6717 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6719 /* Conditional branch. */
6720 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6721 return true;
6722 else
6724 if (cmpcode == NE || cmpcode == EQ)
6726 if (comparator == const0_rtx)
6728 /* TBZ/TBNZ/CBZ/CBNZ. */
6729 if (GET_CODE (inner) == ZERO_EXTRACT)
6730 /* TBZ/TBNZ. */
6731 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6732 ZERO_EXTRACT, 0, speed);
6733 else
6734 /* CBZ/CBNZ. */
6735 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6737 return true;
6740 else if (cmpcode == LT || cmpcode == GE)
6742 /* TBZ/TBNZ. */
6743 if (comparator == const0_rtx)
6744 return true;
6748 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6750 /* CCMP. */
6751 if (GET_CODE (op1) == COMPARE)
6753 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6754 if (XEXP (op1, 1) == const0_rtx)
6755 *cost += 1;
6756 if (speed)
6758 machine_mode mode = GET_MODE (XEXP (op1, 0));
6759 const struct cpu_cost_table *extra_cost
6760 = aarch64_tune_params.insn_extra_cost;
6762 if (GET_MODE_CLASS (mode) == MODE_INT)
6763 *cost += extra_cost->alu.arith;
6764 else
6765 *cost += extra_cost->fp[mode == DFmode].compare;
6767 return true;
6770 /* It's a conditional operation based on the status flags,
6771 so it must be some flavor of CSEL. */
6773 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6774 if (GET_CODE (op1) == NEG
6775 || GET_CODE (op1) == NOT
6776 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6777 op1 = XEXP (op1, 0);
6778 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6780 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6781 op1 = XEXP (op1, 0);
6782 op2 = XEXP (op2, 0);
6785 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6786 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6787 return true;
6790 /* We don't know what this is, so cost all operands. */
6791 return false;
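/* By way of example, the branch cases above typically cover code such as
   "if (x & (1 << 5)) ..." (a ZERO_EXTRACT compared against zero, i.e.
   TBZ/TBNZ) and "if (x == 0) ..." (CBZ/CBNZ), both of which only cost the
   operand being tested.  */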
6794 /* Check whether X is a bitfield operation of the form shift + extend that
6795 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6796 operand to which the bitfield operation is applied. Otherwise return
6797 NULL_RTX. */
6799 static rtx
6800 aarch64_extend_bitfield_pattern_p (rtx x)
6802 rtx_code outer_code = GET_CODE (x);
6803 machine_mode outer_mode = GET_MODE (x);
6805 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6806 && outer_mode != SImode && outer_mode != DImode)
6807 return NULL_RTX;
6809 rtx inner = XEXP (x, 0);
6810 rtx_code inner_code = GET_CODE (inner);
6811 machine_mode inner_mode = GET_MODE (inner);
6812 rtx op = NULL_RTX;
6814 switch (inner_code)
6816 case ASHIFT:
6817 if (CONST_INT_P (XEXP (inner, 1))
6818 && (inner_mode == QImode || inner_mode == HImode))
6819 op = XEXP (inner, 0);
6820 break;
6821 case LSHIFTRT:
6822 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6823 && (inner_mode == QImode || inner_mode == HImode))
6824 op = XEXP (inner, 0);
6825 break;
6826 case ASHIFTRT:
6827 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6828 && (inner_mode == QImode || inner_mode == HImode))
6829 op = XEXP (inner, 0);
6830 break;
6831 default:
6832 break;
6835 return op;
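/* For example, (zero_extend:SI (lshiftrt:HI (reg) (const_int 3))) is the
   kind of pattern recognised here: it can be implemented with a single
   UBFX, so only the inner operand is returned for the callers to cost
   (plus alu.bfx).  */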
6838 /* Return true if the mask and a shift amount from an RTX of the form
6839 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6840 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6842 bool
6843 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6845 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6846 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6847 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6848 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
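/* For example, with MODE == SImode, MASK == 0xff0 and SHFT_AMNT == 4 the
   predicate holds: (x << 4) & 0xff0 is equivalent to
   "ubfiz w0, w1, #4, #8".  */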
6851 /* Calculate the cost of calculating X, storing it in *COST. Result
6852 is true if the total cost of the operation has now been calculated. */
6853 static bool
6854 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6855 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6857 rtx op0, op1, op2;
6858 const struct cpu_cost_table *extra_cost
6859 = aarch64_tune_params.insn_extra_cost;
6860 int code = GET_CODE (x);
6861 scalar_int_mode int_mode;
6863 /* By default, assume that everything has equivalent cost to the
6864 cheapest instruction. Any additional costs are applied as a delta
6865 above this default. */
6866 *cost = COSTS_N_INSNS (1);
6868 switch (code)
6870 case SET:
6871 /* The cost depends entirely on the operands to SET. */
6872 *cost = 0;
6873 op0 = SET_DEST (x);
6874 op1 = SET_SRC (x);
6876 switch (GET_CODE (op0))
6878 case MEM:
6879 if (speed)
6881 rtx address = XEXP (op0, 0);
6882 if (VECTOR_MODE_P (mode))
6883 *cost += extra_cost->ldst.storev;
6884 else if (GET_MODE_CLASS (mode) == MODE_INT)
6885 *cost += extra_cost->ldst.store;
6886 else if (mode == SFmode)
6887 *cost += extra_cost->ldst.storef;
6888 else if (mode == DFmode)
6889 *cost += extra_cost->ldst.stored;
6891 *cost +=
6892 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6893 0, speed));
6896 *cost += rtx_cost (op1, mode, SET, 1, speed);
6897 return true;
6899 case SUBREG:
6900 if (! REG_P (SUBREG_REG (op0)))
6901 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6903 /* Fall through. */
6904 case REG:
6905 /* The cost is one per vector-register copied. */
6906 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6908 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6909 / GET_MODE_SIZE (V4SImode);
6910 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6912 /* const0_rtx is in general free, but we will use an
6913 instruction to set a register to 0. */
6914 else if (REG_P (op1) || op1 == const0_rtx)
6916 /* The cost is 1 per register copied. */
6917 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6918 / UNITS_PER_WORD;
6919 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6921 else
6922 /* Cost is just the cost of the RHS of the set. */
6923 *cost += rtx_cost (op1, mode, SET, 1, speed);
6924 return true;
6926 case ZERO_EXTRACT:
6927 case SIGN_EXTRACT:
6928 /* Bit-field insertion. Strip any redundant widening of
6929 the RHS to meet the width of the target. */
6930 if (GET_CODE (op1) == SUBREG)
6931 op1 = SUBREG_REG (op1);
6932 if ((GET_CODE (op1) == ZERO_EXTEND
6933 || GET_CODE (op1) == SIGN_EXTEND)
6934 && CONST_INT_P (XEXP (op0, 1))
6935 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6936 >= INTVAL (XEXP (op0, 1))))
6937 op1 = XEXP (op1, 0);
6939 if (CONST_INT_P (op1))
6941 /* MOV immediate is assumed to always be cheap. */
6942 *cost = COSTS_N_INSNS (1);
6944 else
6946 /* BFM. */
6947 if (speed)
6948 *cost += extra_cost->alu.bfi;
6949 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6952 return true;
6954 default:
6955 /* We can't make sense of this, assume default cost. */
6956 *cost = COSTS_N_INSNS (1);
6957 return false;
6959 return false;
6961 case CONST_INT:
6962 /* If an instruction can incorporate a constant within the
6963 instruction, the instruction's expression avoids calling
6964 rtx_cost() on the constant. If rtx_cost() is called on a
6965 constant, then it is usually because the constant must be
6966 moved into a register by one or more instructions.
6968 The exception is constant 0, which can be expressed
6969 as XZR/WZR and is therefore free. The exception to this is
6970 if we have (set (reg) (const0_rtx)) in which case we must cost
6971 the move. However, we can catch that when we cost the SET, so
6972 we don't need to consider that here. */
6973 if (x == const0_rtx)
6974 *cost = 0;
6975 else
6977 /* To an approximation, building any other constant is
6978 proportionally expensive to the number of instructions
6979 required to build that constant. This is true whether we
6980 are compiling for SPEED or otherwise. */
6981 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6982 (NULL_RTX, x, false, mode));
6984 return true;
6986 case CONST_DOUBLE:
6988 /* First determine number of instructions to do the move
6989 as an integer constant. */
6990 if (!aarch64_float_const_representable_p (x)
6991 && !aarch64_can_const_movi_rtx_p (x, mode)
6992 && aarch64_float_const_rtx_p (x))
6994 unsigned HOST_WIDE_INT ival;
6995 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6996 gcc_assert (succeed);
6998 machine_mode imode = (mode == HFmode
6999 ? SImode
7000 : int_mode_for_mode (mode).require ());
7001 int ncost = aarch64_internal_mov_immediate
7002 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7003 *cost += COSTS_N_INSNS (ncost);
7004 return true;
7007 if (speed)
7009 /* mov[df,sf]_aarch64. */
7010 if (aarch64_float_const_representable_p (x))
7011 /* FMOV (scalar immediate). */
7012 *cost += extra_cost->fp[mode == DFmode].fpconst;
7013 else if (!aarch64_float_const_zero_rtx_p (x))
7015 /* This will be a load from memory. */
7016 if (mode == DFmode)
7017 *cost += extra_cost->ldst.loadd;
7018 else
7019 *cost += extra_cost->ldst.loadf;
7021 else
7022 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7023 or MOV v0.s[0], wzr - neither of which is modeled by the
7024 cost tables. Just use the default cost. */
7029 return true;
7031 case MEM:
7032 if (speed)
7034 /* For loads we want the base cost of a load, plus an
7035 approximation for the additional cost of the addressing
7036 mode. */
7037 rtx address = XEXP (x, 0);
7038 if (VECTOR_MODE_P (mode))
7039 *cost += extra_cost->ldst.loadv;
7040 else if (GET_MODE_CLASS (mode) == MODE_INT)
7041 *cost += extra_cost->ldst.load;
7042 else if (mode == SFmode)
7043 *cost += extra_cost->ldst.loadf;
7044 else if (mode == DFmode)
7045 *cost += extra_cost->ldst.loadd;
7047 *cost +=
7048 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7049 0, speed));
7052 return true;
7054 case NEG:
7055 op0 = XEXP (x, 0);
7057 if (VECTOR_MODE_P (mode))
7059 if (speed)
7061 /* FNEG. */
7062 *cost += extra_cost->vect.alu;
7064 return false;
7067 if (GET_MODE_CLASS (mode) == MODE_INT)
7069 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7070 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7072 /* CSETM. */
7073 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7074 return true;
7077 /* Cost this as SUB wzr, X. */
7078 op0 = CONST0_RTX (mode);
7079 op1 = XEXP (x, 0);
7080 goto cost_minus;
7083 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7085 /* Support (neg(fma...)) as a single instruction only if
7086 sign of zeros is unimportant. This matches the decision
7087 making in aarch64.md. */
7088 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7090 /* FNMADD. */
7091 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7092 return true;
7094 if (GET_CODE (op0) == MULT)
7096 /* FNMUL. */
7097 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7098 return true;
7100 if (speed)
7101 /* FNEG. */
7102 *cost += extra_cost->fp[mode == DFmode].neg;
7103 return false;
7106 return false;
7108 case CLRSB:
7109 case CLZ:
7110 if (speed)
7112 if (VECTOR_MODE_P (mode))
7113 *cost += extra_cost->vect.alu;
7114 else
7115 *cost += extra_cost->alu.clz;
7118 return false;
7120 case COMPARE:
7121 op0 = XEXP (x, 0);
7122 op1 = XEXP (x, 1);
7124 if (op1 == const0_rtx
7125 && GET_CODE (op0) == AND)
7127 x = op0;
7128 mode = GET_MODE (op0);
7129 goto cost_logic;
7132 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7134 /* TODO: A write to the CC flags possibly costs extra; this
7135 needs encoding in the cost tables. */
7137 mode = GET_MODE (op0);
7138 /* ANDS. */
7139 if (GET_CODE (op0) == AND)
7141 x = op0;
7142 goto cost_logic;
7145 if (GET_CODE (op0) == PLUS)
7147 /* ADDS (and CMN alias). */
7148 x = op0;
7149 goto cost_plus;
7152 if (GET_CODE (op0) == MINUS)
7154 /* SUBS. */
7155 x = op0;
7156 goto cost_minus;
7159 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7160 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7161 && CONST_INT_P (XEXP (op0, 2)))
7163 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7164 Handle it here directly rather than going to cost_logic
7165 since we know the immediate generated for the TST is valid
7166 so we can avoid creating an intermediate rtx for it only
7167 for costing purposes. */
7168 if (speed)
7169 *cost += extra_cost->alu.logical;
7171 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7172 ZERO_EXTRACT, 0, speed);
7173 return true;
7176 if (GET_CODE (op1) == NEG)
7178 /* CMN. */
7179 if (speed)
7180 *cost += extra_cost->alu.arith;
7182 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7183 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7184 return true;
7187 /* CMP.
7189 Compare can freely swap the order of operands, and
7190 canonicalization puts the more complex operation first.
7191 But the integer MINUS logic expects the shift/extend
7192 operation in op1. */
7193 if (! (REG_P (op0)
7194 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7196 op0 = XEXP (x, 1);
7197 op1 = XEXP (x, 0);
7199 goto cost_minus;
7202 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7204 /* FCMP. */
7205 if (speed)
7206 *cost += extra_cost->fp[mode == DFmode].compare;
7208 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7210 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7211 /* FCMP supports constant 0.0 for no extra cost. */
7212 return true;
7214 return false;
7217 if (VECTOR_MODE_P (mode))
7219 /* Vector compare. */
7220 if (speed)
7221 *cost += extra_cost->vect.alu;
7223 if (aarch64_float_const_zero_rtx_p (op1))
7225 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7226 cost. */
7227 return true;
7229 return false;
7231 return false;
7233 case MINUS:
7235 op0 = XEXP (x, 0);
7236 op1 = XEXP (x, 1);
7238 cost_minus:
7239 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7241 /* Detect valid immediates. */
7242 if ((GET_MODE_CLASS (mode) == MODE_INT
7243 || (GET_MODE_CLASS (mode) == MODE_CC
7244 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7245 && CONST_INT_P (op1)
7246 && aarch64_uimm12_shift (INTVAL (op1)))
7248 if (speed)
7249 /* SUB(S) (immediate). */
7250 *cost += extra_cost->alu.arith;
7251 return true;
7254 /* Look for SUB (extended register). */
7255 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7257 if (speed)
7258 *cost += extra_cost->alu.extend_arith;
7260 op1 = aarch64_strip_extend (op1, true);
7261 *cost += rtx_cost (op1, VOIDmode,
7262 (enum rtx_code) GET_CODE (op1), 0, speed);
7263 return true;
7266 rtx new_op1 = aarch64_strip_extend (op1, false);
7268 /* Cost this as an FMA-alike operation. */
7269 if ((GET_CODE (new_op1) == MULT
7270 || aarch64_shift_p (GET_CODE (new_op1)))
7271 && code != COMPARE)
7273 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7274 (enum rtx_code) code,
7275 speed);
7276 return true;
7279 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7281 if (speed)
7283 if (VECTOR_MODE_P (mode))
7285 /* Vector SUB. */
7286 *cost += extra_cost->vect.alu;
7288 else if (GET_MODE_CLASS (mode) == MODE_INT)
7290 /* SUB(S). */
7291 *cost += extra_cost->alu.arith;
7293 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7295 /* FSUB. */
7296 *cost += extra_cost->fp[mode == DFmode].addsub;
7299 return true;
7302 case PLUS:
7304 rtx new_op0;
7306 op0 = XEXP (x, 0);
7307 op1 = XEXP (x, 1);
7309 cost_plus:
7310 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7311 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7313 /* CSINC. */
7314 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7315 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7316 return true;
7319 if (GET_MODE_CLASS (mode) == MODE_INT
7320 && CONST_INT_P (op1)
7321 && aarch64_uimm12_shift (INTVAL (op1)))
7323 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7325 if (speed)
7326 /* ADD (immediate). */
7327 *cost += extra_cost->alu.arith;
7328 return true;
7331 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7333 /* Look for ADD (extended register). */
7334 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7336 if (speed)
7337 *cost += extra_cost->alu.extend_arith;
7339 op0 = aarch64_strip_extend (op0, true);
7340 *cost += rtx_cost (op0, VOIDmode,
7341 (enum rtx_code) GET_CODE (op0), 0, speed);
7342 return true;
7345 /* Strip any extend, leave shifts behind as we will
7346 cost them through mult_cost. */
7347 new_op0 = aarch64_strip_extend (op0, false);
7349 if (GET_CODE (new_op0) == MULT
7350 || aarch64_shift_p (GET_CODE (new_op0)))
7352 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7353 speed);
7354 return true;
7357 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7359 if (speed)
7361 if (VECTOR_MODE_P (mode))
7363 /* Vector ADD. */
7364 *cost += extra_cost->vect.alu;
7366 else if (GET_MODE_CLASS (mode) == MODE_INT)
7368 /* ADD. */
7369 *cost += extra_cost->alu.arith;
7371 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7373 /* FADD. */
7374 *cost += extra_cost->fp[mode == DFmode].addsub;
7377 return true;
7380 case BSWAP:
7381 *cost = COSTS_N_INSNS (1);
7383 if (speed)
7385 if (VECTOR_MODE_P (mode))
7386 *cost += extra_cost->vect.alu;
7387 else
7388 *cost += extra_cost->alu.rev;
7390 return false;
7392 case IOR:
7393 if (aarch_rev16_p (x))
7395 *cost = COSTS_N_INSNS (1);
7397 if (speed)
7399 if (VECTOR_MODE_P (mode))
7400 *cost += extra_cost->vect.alu;
7401 else
7402 *cost += extra_cost->alu.rev;
7404 return true;
7407 if (aarch64_extr_rtx_p (x, &op0, &op1))
7409 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7410 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7411 if (speed)
7412 *cost += extra_cost->alu.shift;
7414 return true;
7416 /* Fall through. */
7417 case XOR:
7418 case AND:
7419 cost_logic:
7420 op0 = XEXP (x, 0);
7421 op1 = XEXP (x, 1);
7423 if (VECTOR_MODE_P (mode))
7425 if (speed)
7426 *cost += extra_cost->vect.alu;
7427 return true;
7430 if (code == AND
7431 && GET_CODE (op0) == MULT
7432 && CONST_INT_P (XEXP (op0, 1))
7433 && CONST_INT_P (op1)
7434 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7435 INTVAL (op1)) != 0)
7437 /* This is a UBFM/SBFM. */
7438 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7439 if (speed)
7440 *cost += extra_cost->alu.bfx;
7441 return true;
7444 if (is_int_mode (mode, &int_mode))
7446 if (CONST_INT_P (op1))
7448 /* We have a mask + shift version of a UBFIZ
7449 i.e. the *andim_ashift<mode>_bfiz pattern. */
7450 if (GET_CODE (op0) == ASHIFT
7451 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7452 XEXP (op0, 1)))
7454 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7455 (enum rtx_code) code, 0, speed);
7456 if (speed)
7457 *cost += extra_cost->alu.bfx;
7459 return true;
7461 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7463 /* We possibly get the immediate for free; this is not
7464 modelled. */
7465 *cost += rtx_cost (op0, int_mode,
7466 (enum rtx_code) code, 0, speed);
7467 if (speed)
7468 *cost += extra_cost->alu.logical;
7470 return true;
7473 else
7475 rtx new_op0 = op0;
7477 /* Handle ORN, EON, or BIC. */
7478 if (GET_CODE (op0) == NOT)
7479 op0 = XEXP (op0, 0);
7481 new_op0 = aarch64_strip_shift (op0);
7483 /* If we had a shift on op0 then this is a logical-shift-
7484 by-register/immediate operation. Otherwise, this is just
7485 a logical operation. */
7486 if (speed)
7488 if (new_op0 != op0)
7490 /* Shift by immediate. */
7491 if (CONST_INT_P (XEXP (op0, 1)))
7492 *cost += extra_cost->alu.log_shift;
7493 else
7494 *cost += extra_cost->alu.log_shift_reg;
7496 else
7497 *cost += extra_cost->alu.logical;
7500 /* In both cases we want to cost both operands. */
7501 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7502 0, speed);
7503 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7504 1, speed);
7506 return true;
7509 return false;
7511 case NOT:
7512 x = XEXP (x, 0);
7513 op0 = aarch64_strip_shift (x);
7515 if (VECTOR_MODE_P (mode))
7517 /* Vector NOT. */
7518 *cost += extra_cost->vect.alu;
7519 return false;
7522 /* MVN-shifted-reg. */
7523 if (op0 != x)
7525 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7527 if (speed)
7528 *cost += extra_cost->alu.log_shift;
7530 return true;
7532 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7533 Handle the second form here, taking care that 'a' in the above can
7534 be a shift. */
7535 else if (GET_CODE (op0) == XOR)
7537 rtx newop0 = XEXP (op0, 0);
7538 rtx newop1 = XEXP (op0, 1);
7539 rtx op0_stripped = aarch64_strip_shift (newop0);
7541 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7542 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7544 if (speed)
7546 if (op0_stripped != newop0)
7547 *cost += extra_cost->alu.log_shift;
7548 else
7549 *cost += extra_cost->alu.logical;
7552 return true;
7554 /* MVN. */
7555 if (speed)
7556 *cost += extra_cost->alu.logical;
7558 return false;
7560 case ZERO_EXTEND:
7562 op0 = XEXP (x, 0);
7563 /* If a value is written in SI mode, then zero extended to DI
7564 mode, the operation will in general be free as a write to
7565 a 'w' register implicitly zeroes the upper bits of an 'x'
7566 register. However, if this is
7568 (set (reg) (zero_extend (reg)))
7570 we must cost the explicit register move. */
7571 if (mode == DImode
7572 && GET_MODE (op0) == SImode
7573 && outer == SET)
7575 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7577 /* If OP_COST is non-zero, then the cost of the zero extend
7578 is effectively the cost of the inner operation. Otherwise
7579 we have a MOV instruction and we take the cost from the MOV
7580 itself. This is true independently of whether we are
7581 optimizing for space or time. */
7582 if (op_cost)
7583 *cost = op_cost;
7585 return true;
7587 else if (MEM_P (op0))
7589 /* All loads can zero extend to any size for free. */
7590 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7591 return true;
7594 op0 = aarch64_extend_bitfield_pattern_p (x);
7595 if (op0)
7597 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7598 if (speed)
7599 *cost += extra_cost->alu.bfx;
7600 return true;
7603 if (speed)
7605 if (VECTOR_MODE_P (mode))
7607 /* UMOV. */
7608 *cost += extra_cost->vect.alu;
7610 else
7612 /* We generate an AND instead of UXTB/UXTH. */
7613 *cost += extra_cost->alu.logical;
7616 return false;
7618 case SIGN_EXTEND:
7619 if (MEM_P (XEXP (x, 0)))
7621 /* LDRSH. */
7622 if (speed)
7624 rtx address = XEXP (XEXP (x, 0), 0);
7625 *cost += extra_cost->ldst.load_sign_extend;
7627 *cost +=
7628 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7629 0, speed));
7631 return true;
7634 op0 = aarch64_extend_bitfield_pattern_p (x);
7635 if (op0)
7637 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7638 if (speed)
7639 *cost += extra_cost->alu.bfx;
7640 return true;
7643 if (speed)
7645 if (VECTOR_MODE_P (mode))
7646 *cost += extra_cost->vect.alu;
7647 else
7648 *cost += extra_cost->alu.extend;
7650 return false;
7652 case ASHIFT:
7653 op0 = XEXP (x, 0);
7654 op1 = XEXP (x, 1);
7656 if (CONST_INT_P (op1))
7658 if (speed)
7660 if (VECTOR_MODE_P (mode))
7662 /* Vector shift (immediate). */
7663 *cost += extra_cost->vect.alu;
7665 else
7667 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7668 aliases. */
7669 *cost += extra_cost->alu.shift;
7673 /* We can incorporate zero/sign extend for free. */
7674 if (GET_CODE (op0) == ZERO_EXTEND
7675 || GET_CODE (op0) == SIGN_EXTEND)
7676 op0 = XEXP (op0, 0);
7678 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7679 return true;
7681 else
7683 if (VECTOR_MODE_P (mode))
7685 if (speed)
7686 /* Vector shift (register). */
7687 *cost += extra_cost->vect.alu;
7689 else
7691 if (speed)
7692 /* LSLV. */
7693 *cost += extra_cost->alu.shift_reg;
7695 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7696 && CONST_INT_P (XEXP (op1, 1))
7697 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7699 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7700 /* We already demanded XEXP (op1, 0) to be REG_P, so
7701 don't recurse into it. */
7702 return true;
7705 return false; /* All arguments need to be in registers. */
7708 case ROTATE:
7709 case ROTATERT:
7710 case LSHIFTRT:
7711 case ASHIFTRT:
7712 op0 = XEXP (x, 0);
7713 op1 = XEXP (x, 1);
7715 if (CONST_INT_P (op1))
7717 /* ASR (immediate) and friends. */
7718 if (speed)
7720 if (VECTOR_MODE_P (mode))
7721 *cost += extra_cost->vect.alu;
7722 else
7723 *cost += extra_cost->alu.shift;
7726 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7727 return true;
7729 else
7731 if (VECTOR_MODE_P (mode))
7733 if (speed)
7734 /* Vector shift (register). */
7735 *cost += extra_cost->vect.alu;
7737 else
7739 if (speed)
7740 /* ASR (register) and friends. */
7741 *cost += extra_cost->alu.shift_reg;
7743 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7744 && CONST_INT_P (XEXP (op1, 1))
7745 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7747 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7748 /* We already demanded XEXP (op1, 0) to be REG_P, so
7749 don't recurse into it. */
7750 return true;
7753 return false; /* All arguments need to be in registers. */
7756 case SYMBOL_REF:
7758 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7759 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7761 /* LDR. */
7762 if (speed)
7763 *cost += extra_cost->ldst.load;
7765 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7766 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7768 /* ADRP, followed by ADD. */
7769 *cost += COSTS_N_INSNS (1);
7770 if (speed)
7771 *cost += 2 * extra_cost->alu.arith;
7773 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7774 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7776 /* ADR. */
7777 if (speed)
7778 *cost += extra_cost->alu.arith;
7781 if (flag_pic)
7783 /* One extra load instruction, after accessing the GOT. */
7784 *cost += COSTS_N_INSNS (1);
7785 if (speed)
7786 *cost += extra_cost->ldst.load;
7788 return true;
7790 case HIGH:
7791 case LO_SUM:
7792 /* ADRP/ADD (immediate). */
7793 if (speed)
7794 *cost += extra_cost->alu.arith;
7795 return true;
7797 case ZERO_EXTRACT:
7798 case SIGN_EXTRACT:
7799 /* UBFX/SBFX. */
7800 if (speed)
7802 if (VECTOR_MODE_P (mode))
7803 *cost += extra_cost->vect.alu;
7804 else
7805 *cost += extra_cost->alu.bfx;
7808 /* We can trust that the immediates used will be correct (there
7809 are no by-register forms), so we need only cost op0. */
7810 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7811 return true;
7813 case MULT:
7814 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7815 /* aarch64_rtx_mult_cost always handles recursion to its
7816 operands. */
7817 return true;
7819 case MOD:
7820 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7821 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as that of
7822 an unconditional negate. This case should only ever be reached through
7823 the set_smod_pow2_cheap check in expmed.c. */
7824 if (CONST_INT_P (XEXP (x, 1))
7825 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7826 && (mode == SImode || mode == DImode))
7828 /* We expand to 4 instructions. Reset the baseline. */
7829 *cost = COSTS_N_INSNS (4);
7831 if (speed)
7832 *cost += 2 * extra_cost->alu.logical
7833 + 2 * extra_cost->alu.arith;
7835 return true;
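	  /* Illustrative expansion (a sketch, not taken from the md file):
	     x % 8 in SImode becomes something along the lines of
		 negs	w1, w0
		 and	w0, w0, #7
		 and	w1, w1, #7
		 csneg	w0, w0, w1, mi
	     hence the four-instruction baseline set above.  */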
7838 /* Fall-through. */
7839 case UMOD:
7840 if (speed)
7842 /* Slightly prefer UMOD over SMOD. */
7843 if (VECTOR_MODE_P (mode))
7844 *cost += extra_cost->vect.alu;
7845 else if (GET_MODE_CLASS (mode) == MODE_INT)
7846 *cost += (extra_cost->mult[mode == DImode].add
7847 + extra_cost->mult[mode == DImode].idiv
7848 + (code == MOD ? 1 : 0));
7850 return false; /* All arguments need to be in registers. */
7852 case DIV:
7853 case UDIV:
7854 case SQRT:
7855 if (speed)
7857 if (VECTOR_MODE_P (mode))
7858 *cost += extra_cost->vect.alu;
7859 else if (GET_MODE_CLASS (mode) == MODE_INT)
7860 /* There is no integer SQRT, so only DIV and UDIV can get
7861 here. */
7862 *cost += (extra_cost->mult[mode == DImode].idiv
7863 /* Slightly prefer UDIV over SDIV. */
7864 + (code == DIV ? 1 : 0));
7865 else
7866 *cost += extra_cost->fp[mode == DFmode].div;
7868 return false; /* All arguments need to be in registers. */
7870 case IF_THEN_ELSE:
7871 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7872 XEXP (x, 2), cost, speed);
7874 case EQ:
7875 case NE:
7876 case GT:
7877 case GTU:
7878 case LT:
7879 case LTU:
7880 case GE:
7881 case GEU:
7882 case LE:
7883 case LEU:
7885 return false; /* All arguments must be in registers. */
7887 case FMA:
7888 op0 = XEXP (x, 0);
7889 op1 = XEXP (x, 1);
7890 op2 = XEXP (x, 2);
7892 if (speed)
7894 if (VECTOR_MODE_P (mode))
7895 *cost += extra_cost->vect.alu;
7896 else
7897 *cost += extra_cost->fp[mode == DFmode].fma;
7900 /* FMSUB, FNMADD, and FNMSUB are free. */
7901 if (GET_CODE (op0) == NEG)
7902 op0 = XEXP (op0, 0);
7904 if (GET_CODE (op2) == NEG)
7905 op2 = XEXP (op2, 0);
7907 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7908 and the by-element operand as operand 0. */
7909 if (GET_CODE (op1) == NEG)
7910 op1 = XEXP (op1, 0);
7912 /* Catch vector-by-element operations. The by-element operand can
7913 either be (vec_duplicate (vec_select (x))) or just
7914 (vec_select (x)), depending on whether we are multiplying by
7915 a vector or a scalar.
7917 Canonicalization is not very good in these cases: FMA4 will put the
7918 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7919 if (GET_CODE (op0) == VEC_DUPLICATE)
7920 op0 = XEXP (op0, 0);
7921 else if (GET_CODE (op1) == VEC_DUPLICATE)
7922 op1 = XEXP (op1, 0);
7924 if (GET_CODE (op0) == VEC_SELECT)
7925 op0 = XEXP (op0, 0);
7926 else if (GET_CODE (op1) == VEC_SELECT)
7927 op1 = XEXP (op1, 0);
7929 /* If the remaining parameters are not registers,
7930 get the cost to put them into registers. */
7931 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7932 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7933 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7934 return true;
7936 case FLOAT:
7937 case UNSIGNED_FLOAT:
7938 if (speed)
7939 *cost += extra_cost->fp[mode == DFmode].fromint;
7940 return false;
7942 case FLOAT_EXTEND:
7943 if (speed)
7945 if (VECTOR_MODE_P (mode))
7947 /* Vector widening conversion. */
7948 *cost += extra_cost->vect.alu;
7950 else
7951 *cost += extra_cost->fp[mode == DFmode].widen;
7953 return false;
7955 case FLOAT_TRUNCATE:
7956 if (speed)
7958 if (VECTOR_MODE_P (mode))
7960 /* Vector conversion. */
7961 *cost += extra_cost->vect.alu;
7963 else
7964 *cost += extra_cost->fp[mode == DFmode].narrow;
7966 return false;
7968 case FIX:
7969 case UNSIGNED_FIX:
7970 x = XEXP (x, 0);
7971 /* Strip the rounding part. They will all be implemented
7972 by the fcvt* family of instructions anyway. */
7973 if (GET_CODE (x) == UNSPEC)
7975 unsigned int uns_code = XINT (x, 1);
7977 if (uns_code == UNSPEC_FRINTA
7978 || uns_code == UNSPEC_FRINTM
7979 || uns_code == UNSPEC_FRINTN
7980 || uns_code == UNSPEC_FRINTP
7981 || uns_code == UNSPEC_FRINTZ)
7982 x = XVECEXP (x, 0, 0);
7985 if (speed)
7987 if (VECTOR_MODE_P (mode))
7988 *cost += extra_cost->vect.alu;
7989 else
7990 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7993 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7994 fixed-point fcvt. */
7995 if (GET_CODE (x) == MULT
7996 && ((VECTOR_MODE_P (mode)
7997 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7998 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8000 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8001 0, speed);
8002 return true;
8005 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8006 return true;
8008 case ABS:
8009 if (VECTOR_MODE_P (mode))
8011 /* ABS (vector). */
8012 if (speed)
8013 *cost += extra_cost->vect.alu;
8015 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8017 op0 = XEXP (x, 0);
8019 /* FABD, which is analogous to FADD. */
8020 if (GET_CODE (op0) == MINUS)
8022 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8023 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8024 if (speed)
8025 *cost += extra_cost->fp[mode == DFmode].addsub;
8027 return true;
8029 /* Simple FABS is analogous to FNEG. */
8030 if (speed)
8031 *cost += extra_cost->fp[mode == DFmode].neg;
8033 else
8035 /* Integer ABS will either be split into
8036 two arithmetic instructions, or will be an ABS
8037 (scalar), which we don't model. */
8038 *cost = COSTS_N_INSNS (2);
8039 if (speed)
8040 *cost += 2 * extra_cost->alu.arith;
8042 return false;
8044 case SMAX:
8045 case SMIN:
8046 if (speed)
8048 if (VECTOR_MODE_P (mode))
8049 *cost += extra_cost->vect.alu;
8050 else
8052 /* FMAXNM/FMINNM/FMAX/FMIN.
8053 TODO: This may not be accurate for all implementations, but
8054 we do not model this in the cost tables. */
8055 *cost += extra_cost->fp[mode == DFmode].addsub;
8058 return false;
8060 case UNSPEC:
8061 /* The floating point round to integer frint* instructions. */
8062 if (aarch64_frint_unspec_p (XINT (x, 1)))
8064 if (speed)
8065 *cost += extra_cost->fp[mode == DFmode].roundint;
8067 return false;
8070 if (XINT (x, 1) == UNSPEC_RBIT)
8072 if (speed)
8073 *cost += extra_cost->alu.rev;
8075 return false;
8077 break;
8079 case TRUNCATE:
8081 /* Decompose <su>muldi3_highpart. */
8082 if (/* (truncate:DI */
8083 mode == DImode
8084 /* (lshiftrt:TI */
8085 && GET_MODE (XEXP (x, 0)) == TImode
8086 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8087 /* (mult:TI */
8088 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8089 /* (ANY_EXTEND:TI (reg:DI))
8090 (ANY_EXTEND:TI (reg:DI))) */
8091 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8092 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8093 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8094 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8095 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8096 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8097 /* (const_int 64) */
8098 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8099 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8101 /* UMULH/SMULH. */
8102 if (speed)
8103 *cost += extra_cost->mult[mode == DImode].extend;
8104 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8105 mode, MULT, 0, speed);
8106 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8107 mode, MULT, 1, speed);
8108 return true;
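/* For illustration only: the RTL shape matched above is what a high-part
   multiply expands to, e.g. (assuming the GNU C __int128 idiom)

     int64_t hi = (int64_t) (((__int128) a * b) >> 64);

   which maps onto a single SMULH (UMULH for the unsigned variant), so
   only the two DImode multiplicands are costed on top of the extending
   multiply cost.  */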
8111 /* Fall through. */
8112 default:
8113 break;
8116 if (dump_file
8117 && flag_aarch64_verbose_cost)
8118 fprintf (dump_file,
8119 "\nFailed to cost RTX. Assuming default cost.\n");
8121 return true;
8124 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8125 calculated for X. This cost is stored in *COST. Returns true
8126 if the total cost of X was calculated. */
8127 static bool
8128 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8129 int param, int *cost, bool speed)
8131 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8133 if (dump_file
8134 && flag_aarch64_verbose_cost)
8136 print_rtl_single (dump_file, x);
8137 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8138 speed ? "Hot" : "Cold",
8139 *cost, result ? "final" : "partial");
8142 return result;
8145 static int
8146 aarch64_register_move_cost (machine_mode mode,
8147 reg_class_t from_i, reg_class_t to_i)
8149 enum reg_class from = (enum reg_class) from_i;
8150 enum reg_class to = (enum reg_class) to_i;
8151 const struct cpu_regmove_cost *regmove_cost
8152 = aarch64_tune_params.regmove_cost;
8154 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8155 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8156 to = GENERAL_REGS;
8158 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8159 from = GENERAL_REGS;
8161 /* Moving between GPR and stack cost is the same as GP2GP. */
8162 if ((from == GENERAL_REGS && to == STACK_REG)
8163 || (to == GENERAL_REGS && from == STACK_REG))
8164 return regmove_cost->GP2GP;
8166 /* To/From the stack register, we move via the gprs. */
8167 if (to == STACK_REG || from == STACK_REG)
8168 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8169 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8171 if (GET_MODE_SIZE (mode) == 16)
8173 /* 128-bit operations on general registers require 2 instructions. */
8174 if (from == GENERAL_REGS && to == GENERAL_REGS)
8175 return regmove_cost->GP2GP * 2;
8176 else if (from == GENERAL_REGS)
8177 return regmove_cost->GP2FP * 2;
8178 else if (to == GENERAL_REGS)
8179 return regmove_cost->FP2GP * 2;
8181 /* When AdvSIMD instructions are disabled it is not possible to move
8182 a 128-bit value directly between Q registers. This is handled in
8183 secondary reload. A general register is used as a scratch to move
8184 the upper DI value and the lower DI value is moved directly,
8185 hence the cost is the sum of three moves. */
8186 if (! TARGET_SIMD)
8187 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8189 return regmove_cost->FP2FP;
8192 if (from == GENERAL_REGS && to == GENERAL_REGS)
8193 return regmove_cost->GP2GP;
8194 else if (from == GENERAL_REGS)
8195 return regmove_cost->GP2FP;
8196 else if (to == GENERAL_REGS)
8197 return regmove_cost->FP2GP;
8199 return regmove_cost->FP2FP;
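/* Worked example of the recursion above (illustrative, using the fields
   of the active regmove_cost table):

     aarch64_register_move_cost (SImode, FP_REGS, STACK_REG)
       == FP2GP + GP2GP   // FP -> GPR, then GPR -> stack register

   and without TARGET_SIMD a 128-bit FP-to-FP move costs
   GP2FP + FP2GP + FP2FP, matching the three-move secondary reload
   described above.  */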
8202 static int
8203 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8204 reg_class_t rclass ATTRIBUTE_UNUSED,
8205 bool in ATTRIBUTE_UNUSED)
8207 return aarch64_tune_params.memmov_cost;
8210 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8211 to optimize 1.0/sqrt. */
8213 static bool
8214 use_rsqrt_p (machine_mode mode)
8216 return (!flag_trapping_math
8217 && flag_unsafe_math_optimizations
8218 && ((aarch64_tune_params.approx_modes->recip_sqrt
8219 & AARCH64_APPROX_MODE (mode))
8220 || flag_mrecip_low_precision_sqrt));
8223 /* Function to decide when to use the approximate reciprocal square root
8224 builtin. */
8226 static tree
8227 aarch64_builtin_reciprocal (tree fndecl)
8229 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8231 if (!use_rsqrt_p (mode))
8232 return NULL_TREE;
8233 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8236 typedef rtx (*rsqrte_type) (rtx, rtx);
8238 /* Select reciprocal square root initial estimate insn depending on machine
8239 mode. */
8241 static rsqrte_type
8242 get_rsqrte_type (machine_mode mode)
8244 switch (mode)
8246 case E_DFmode: return gen_aarch64_rsqrtedf;
8247 case E_SFmode: return gen_aarch64_rsqrtesf;
8248 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8249 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8250 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8251 default: gcc_unreachable ();
8255 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8257 /* Select reciprocal square root series step insn depending on machine mode. */
8259 static rsqrts_type
8260 get_rsqrts_type (machine_mode mode)
8262 switch (mode)
8264 case E_DFmode: return gen_aarch64_rsqrtsdf;
8265 case E_SFmode: return gen_aarch64_rsqrtssf;
8266 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8267 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8268 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8269 default: gcc_unreachable ();
8273 /* Emit instruction sequence to compute either the approximate square root
8274 or its approximate reciprocal, depending on the flag RECP, and return
8275 whether the sequence was emitted or not. */
8277 bool
8278 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8280 machine_mode mode = GET_MODE (dst);
8282 if (GET_MODE_INNER (mode) == HFmode)
8284 gcc_assert (!recp);
8285 return false;
8288 machine_mode mmsk
8289 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)).require (),
8290 GET_MODE_NUNITS (mode));
8291 if (!recp)
8293 if (!(flag_mlow_precision_sqrt
8294 || (aarch64_tune_params.approx_modes->sqrt
8295 & AARCH64_APPROX_MODE (mode))))
8296 return false;
8298 if (flag_finite_math_only
8299 || flag_trapping_math
8300 || !flag_unsafe_math_optimizations
8301 || optimize_function_for_size_p (cfun))
8302 return false;
8304 else
8305 /* Caller assumes we cannot fail. */
8306 gcc_assert (use_rsqrt_p (mode));
8309 rtx xmsk = gen_reg_rtx (mmsk);
8310 if (!recp)
8311 /* When calculating the approximate square root, compare the
8312 argument with 0.0 and create a mask. */
8313 emit_insn (gen_rtx_SET (xmsk,
8314 gen_rtx_NEG (mmsk,
8315 gen_rtx_EQ (mmsk, src,
8316 CONST0_RTX (mode)))));
8318 /* Estimate the approximate reciprocal square root. */
8319 rtx xdst = gen_reg_rtx (mode);
8320 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8322 /* Iterate over the series twice for SF and thrice for DF. */
8323 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8325 /* Optionally do one fewer iteration of the series for faster
8326 performance, at the cost of some accuracy. */
8327 if ((recp && flag_mrecip_low_precision_sqrt)
8328 || (!recp && flag_mlow_precision_sqrt))
8329 iterations--;
8331 /* Iterate over the series to calculate the approximate reciprocal square
8332 root. */
8333 rtx x1 = gen_reg_rtx (mode);
8334 while (iterations--)
8336 rtx x2 = gen_reg_rtx (mode);
8337 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8339 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8341 if (iterations > 0)
8342 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8345 if (!recp)
8347 /* Qualify the approximate reciprocal square root when the argument is
8348 0.0 by squashing the intermediary result to 0.0. */
8349 rtx xtmp = gen_reg_rtx (mmsk);
8350 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8351 gen_rtx_SUBREG (mmsk, xdst, 0)));
8352 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8354 /* Calculate the approximate square root. */
8355 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8358 /* Finalize the approximation. */
8359 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8361 return true;
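/* For reference (a scalar sketch, not used by the compiler): each
   FRSQRTS + FMUL pair emitted above performs one Newton-Raphson step for
   1/sqrt(x), roughly

     double nr_rsqrt_step (double x, double est)
     {
       return est * (3.0 - x * est * est) / 2.0;
     }

   with two steps for SFmode and three for DFmode as chosen above; the
   final multiply by SRC turns the reciprocal square root into the square
   root itself when !RECP.  */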
8364 typedef rtx (*recpe_type) (rtx, rtx);
8366 /* Select reciprocal initial estimate insn depending on machine mode. */
8368 static recpe_type
8369 get_recpe_type (machine_mode mode)
8371 switch (mode)
8373 case E_SFmode: return (gen_aarch64_frecpesf);
8374 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8375 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8376 case E_DFmode: return (gen_aarch64_frecpedf);
8377 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8378 default: gcc_unreachable ();
8382 typedef rtx (*recps_type) (rtx, rtx, rtx);
8384 /* Select reciprocal series step insn depending on machine mode. */
8386 static recps_type
8387 get_recps_type (machine_mode mode)
8389 switch (mode)
8391 case E_SFmode: return (gen_aarch64_frecpssf);
8392 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8393 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8394 case E_DFmode: return (gen_aarch64_frecpsdf);
8395 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8396 default: gcc_unreachable ();
8400 /* Emit the instruction sequence to compute the approximation for the division
8401 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8403 bool
8404 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8406 machine_mode mode = GET_MODE (quo);
8408 if (GET_MODE_INNER (mode) == HFmode)
8409 return false;
8411 bool use_approx_division_p = (flag_mlow_precision_div
8412 || (aarch64_tune_params.approx_modes->division
8413 & AARCH64_APPROX_MODE (mode)));
8415 if (!flag_finite_math_only
8416 || flag_trapping_math
8417 || !flag_unsafe_math_optimizations
8418 || optimize_function_for_size_p (cfun)
8419 || !use_approx_division_p)
8420 return false;
8422 /* Estimate the approximate reciprocal. */
8423 rtx xrcp = gen_reg_rtx (mode);
8424 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8426 /* Iterate over the series twice for SF and thrice for DF. */
8427 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8429 /* Optionally do one fewer iteration of the series for faster
8430 performance, at the cost of some accuracy. */
8431 if (flag_mlow_precision_div)
8432 iterations--;
8434 /* Iterate over the series to calculate the approximate reciprocal. */
8435 rtx xtmp = gen_reg_rtx (mode);
8436 while (iterations--)
8438 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8440 if (iterations > 0)
8441 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8444 if (num != CONST1_RTX (mode))
8446 /* As the approximate reciprocal of DEN is already calculated, only
8447 calculate the approximate division when NUM is not 1.0. */
8448 rtx xnum = force_reg (mode, num);
8449 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8452 /* Finalize the approximation. */
8453 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8454 return true;
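/* For reference (a scalar sketch, not used by the compiler): each
   FRECPS + FMUL pair emitted above performs one Newton-Raphson step for
   1/d, roughly

     double nr_recip_step (double d, double est)
     {
       return est * (2.0 - d * est);
     }

   and the final multiply by NUM (when NUM is not 1.0) turns the
   approximate reciprocal into the approximate quotient NUM / DEN.  */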
8457 /* Return the number of instructions that can be issued per cycle. */
8458 static int
8459 aarch64_sched_issue_rate (void)
8461 return aarch64_tune_params.issue_rate;
8464 static int
8465 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8467 int issue_rate = aarch64_sched_issue_rate ();
8469 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8473 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8474 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8475 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8477 static int
8478 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8479 int ready_index)
8481 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8485 /* Vectorizer cost model target hooks. */
8487 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8488 static int
8489 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8490 tree vectype,
8491 int misalign ATTRIBUTE_UNUSED)
8493 unsigned elements;
8494 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8495 bool fp = false;
8497 if (vectype != NULL)
8498 fp = FLOAT_TYPE_P (vectype);
8500 switch (type_of_cost)
8502 case scalar_stmt:
8503 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8505 case scalar_load:
8506 return costs->scalar_load_cost;
8508 case scalar_store:
8509 return costs->scalar_store_cost;
8511 case vector_stmt:
8512 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8514 case vector_load:
8515 return costs->vec_align_load_cost;
8517 case vector_store:
8518 return costs->vec_store_cost;
8520 case vec_to_scalar:
8521 return costs->vec_to_scalar_cost;
8523 case scalar_to_vec:
8524 return costs->scalar_to_vec_cost;
8526 case unaligned_load:
8527 return costs->vec_unalign_load_cost;
8529 case unaligned_store:
8530 return costs->vec_unalign_store_cost;
8532 case cond_branch_taken:
8533 return costs->cond_taken_branch_cost;
8535 case cond_branch_not_taken:
8536 return costs->cond_not_taken_branch_cost;
8538 case vec_perm:
8539 return costs->vec_permute_cost;
8541 case vec_promote_demote:
8542 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8544 case vec_construct:
8545 elements = TYPE_VECTOR_SUBPARTS (vectype);
8546 return elements / 2 + 1;
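/* Example of the vec_construct formula above: building a V4SF vector
   from four scalars is costed at 4 / 2 + 1 == 3, and a V2DI vector at
   2 / 2 + 1 == 2.  */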
8548 default:
8549 gcc_unreachable ();
8553 /* Implement targetm.vectorize.add_stmt_cost. */
8554 static unsigned
8555 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8556 struct _stmt_vec_info *stmt_info, int misalign,
8557 enum vect_cost_model_location where)
8559 unsigned *cost = (unsigned *) data;
8560 unsigned retval = 0;
8562 if (flag_vect_cost_model)
8564 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8565 int stmt_cost =
8566 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8568 /* Statements in an inner loop relative to the loop being
8569 vectorized are weighted more heavily. The value here is
8570 arbitrary and could potentially be improved with analysis. */
8571 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8572 count *= 50; /* FIXME */
8574 retval = (unsigned) (count * stmt_cost);
8575 cost[where] += retval;
8578 return retval;
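/* Example of the weighting above: a vector statement with a base cost of
   2 that lies in the inner loop of the nest being vectorized is
   accumulated into the vect_body bucket as 2 * 50 == 100.  */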
8581 static void initialize_aarch64_code_model (struct gcc_options *);
8583 /* Parse the TO_PARSE string and put the architecture struct that it
8584 selects into RES and the architectural features into ISA_FLAGS.
8585 Return an aarch64_parse_opt_result describing the parse result.
8586 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8588 static enum aarch64_parse_opt_result
8589 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8590 unsigned long *isa_flags)
8592 char *ext;
8593 const struct processor *arch;
8594 char *str = (char *) alloca (strlen (to_parse) + 1);
8595 size_t len;
8597 strcpy (str, to_parse);
8599 ext = strchr (str, '+');
8601 if (ext != NULL)
8602 len = ext - str;
8603 else
8604 len = strlen (str);
8606 if (len == 0)
8607 return AARCH64_PARSE_MISSING_ARG;
8610 /* Loop through the list of supported ARCHes to find a match. */
8611 for (arch = all_architectures; arch->name != NULL; arch++)
8613 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8615 unsigned long isa_temp = arch->flags;
8617 if (ext != NULL)
8619 /* TO_PARSE string contains at least one extension. */
8620 enum aarch64_parse_opt_result ext_res
8621 = aarch64_parse_extension (ext, &isa_temp);
8623 if (ext_res != AARCH64_PARSE_OK)
8624 return ext_res;
8626 /* Extension parsing was successful. Confirm the result
8627 arch and ISA flags. */
8628 *res = arch;
8629 *isa_flags = isa_temp;
8630 return AARCH64_PARSE_OK;
8634 /* ARCH name not found in list. */
8635 return AARCH64_PARSE_INVALID_ARG;
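/* Example of the split above: for -march=armv8-a+crc the string is
   divided at the first '+', so the architecture table is searched for
   "armv8-a" and "+crc" is handed to aarch64_parse_extension.  */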
8638 /* Parse the TO_PARSE string and put the result tuning in RES and the
8639 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8640 describing the parse result. If there is an error parsing, RES and
8641 ISA_FLAGS are left unchanged. */
8643 static enum aarch64_parse_opt_result
8644 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8645 unsigned long *isa_flags)
8647 char *ext;
8648 const struct processor *cpu;
8649 char *str = (char *) alloca (strlen (to_parse) + 1);
8650 size_t len;
8652 strcpy (str, to_parse);
8654 ext = strchr (str, '+');
8656 if (ext != NULL)
8657 len = ext - str;
8658 else
8659 len = strlen (str);
8661 if (len == 0)
8662 return AARCH64_PARSE_MISSING_ARG;
8665 /* Loop through the list of supported CPUs to find a match. */
8666 for (cpu = all_cores; cpu->name != NULL; cpu++)
8668 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8670 unsigned long isa_temp = cpu->flags;
8673 if (ext != NULL)
8675 /* TO_PARSE string contains at least one extension. */
8676 enum aarch64_parse_opt_result ext_res
8677 = aarch64_parse_extension (ext, &isa_temp);
8679 if (ext_res != AARCH64_PARSE_OK)
8680 return ext_res;
8682 /* Extension parsing was successful. Confirm the result
8683 cpu and ISA flags. */
8684 *res = cpu;
8685 *isa_flags = isa_temp;
8686 return AARCH64_PARSE_OK;
8690 /* CPU name not found in list. */
8691 return AARCH64_PARSE_INVALID_ARG;
8694 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8695 Return an aarch64_parse_opt_result describing the parse result.
8696 If the parsing fails the RES does not change. */
8698 static enum aarch64_parse_opt_result
8699 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8701 const struct processor *cpu;
8702 char *str = (char *) alloca (strlen (to_parse) + 1);
8704 strcpy (str, to_parse);
8706 /* Loop through the list of supported CPUs to find a match. */
8707 for (cpu = all_cores; cpu->name != NULL; cpu++)
8709 if (strcmp (cpu->name, str) == 0)
8711 *res = cpu;
8712 return AARCH64_PARSE_OK;
8716 /* CPU name not found in list. */
8717 return AARCH64_PARSE_INVALID_ARG;
8720 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8721 described in FLAG. If it is, return the index bit for that fusion type.
8722 If not, report an error (mentioning OPTION_NAME) and return zero. */
8724 static unsigned int
8725 aarch64_parse_one_option_token (const char *token,
8726 size_t length,
8727 const struct aarch64_flag_desc *flag,
8728 const char *option_name)
8730 for (; flag->name != NULL; flag++)
8732 if (length == strlen (flag->name)
8733 && !strncmp (flag->name, token, length))
8734 return flag->flag;
8737 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8738 return 0;
8741 /* Parse OPTION which is a comma-separated list of flags to enable.
8742 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8743 default state we inherit from the CPU tuning structures. OPTION_NAME
8744 gives the top-level option we are parsing in the -moverride string,
8745 for use in error messages. */
8747 static unsigned int
8748 aarch64_parse_boolean_options (const char *option,
8749 const struct aarch64_flag_desc *flags,
8750 unsigned int initial_state,
8751 const char *option_name)
8753 const char separator = '.';
8754 const char* specs = option;
8755 const char* ntoken = option;
8756 unsigned int found_flags = initial_state;
8758 while ((ntoken = strchr (specs, separator)))
8760 size_t token_length = ntoken - specs;
8761 unsigned token_ops = aarch64_parse_one_option_token (specs,
8762 token_length,
8763 flags,
8764 option_name);
8765 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8766 in the token stream, reset the supported operations. So:
8768 adrp+add.cmp+branch.none.adrp+add
8770 would have the result of turning on only adrp+add fusion. */
8771 if (!token_ops)
8772 found_flags = 0;
8774 found_flags |= token_ops;
8775 specs = ++ntoken;
8778 /* The string ended with a trailing separator; report it as ill-formed. */
8779 if (!(*specs))
8781 error ("%s string ill-formed\n", option_name);
8782 return 0;
8785 /* We still have one more token to parse. */
8786 size_t token_length = strlen (specs);
8787 unsigned token_ops = aarch64_parse_one_option_token (specs,
8788 token_length,
8789 flags,
8790 option_name);
8791 if (!token_ops)
8792 found_flags = 0;
8794 found_flags |= token_ops;
8795 return found_flags;
8798 /* Support for overriding instruction fusion. */
8800 static void
8801 aarch64_parse_fuse_string (const char *fuse_string,
8802 struct tune_params *tune)
8804 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8805 aarch64_fusible_pairs,
8806 tune->fusible_ops,
8807 "fuse=");
8810 /* Support for overriding other tuning flags. */
8812 static void
8813 aarch64_parse_tune_string (const char *tune_string,
8814 struct tune_params *tune)
8816 tune->extra_tuning_flags
8817 = aarch64_parse_boolean_options (tune_string,
8818 aarch64_tuning_flags,
8819 tune->extra_tuning_flags,
8820 "tune=");
8823 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8824 we understand. If it is, extract the option string and hand it off to
8825 the appropriate function. */
8827 void
8828 aarch64_parse_one_override_token (const char* token,
8829 size_t length,
8830 struct tune_params *tune)
8832 const struct aarch64_tuning_override_function *fn
8833 = aarch64_tuning_override_functions;
8835 const char *option_part = strchr (token, '=');
8836 if (!option_part)
8838 error ("tuning string missing in option (%s)", token);
8839 return;
8842 /* Get the length of the option name. */
8843 length = option_part - token;
8844 /* Skip the '=' to get to the option string. */
8845 option_part++;
8847 for (; fn->name != NULL; fn++)
8849 if (!strncmp (fn->name, token, length))
8851 fn->parse_override (option_part, tune);
8852 return;
8856 error ("unknown tuning option (%s)", token);
8857 return;
8860 /* A checking mechanism for the implementation of the tls size. */
8862 static void
8863 initialize_aarch64_tls_size (struct gcc_options *opts)
8865 if (aarch64_tls_size == 0)
8866 aarch64_tls_size = 24;
8868 switch (opts->x_aarch64_cmodel_var)
8870 case AARCH64_CMODEL_TINY:
8871 /* Both the default and the maximum TLS size allowed under tiny are 1M,
8872 which needs two instructions to address, so we clamp the size to 24 bits. */
8873 if (aarch64_tls_size > 24)
8874 aarch64_tls_size = 24;
8875 break;
8876 case AARCH64_CMODEL_SMALL:
8877 /* The maximum TLS size allowed under small is 4G. */
8878 if (aarch64_tls_size > 32)
8879 aarch64_tls_size = 32;
8880 break;
8881 case AARCH64_CMODEL_LARGE:
8882 /* The maximum TLS size allowed under large is 16E.
8883 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
8884 if (aarch64_tls_size > 48)
8885 aarch64_tls_size = 48;
8886 break;
8887 default:
8888 gcc_unreachable ();
8891 return;
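/* Example of the clamping above (illustrative): with -mcmodel=tiny, a
   request such as -mtls-size=32 is reduced to 24 bits, since the tiny
   model can only address a 1M TLS area with its two-instruction
   sequence.  */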
8894 /* Parse STRING looking for options in the format:
8895 string :: option:string
8896 option :: name=substring
8897 name :: {a-z}
8898 substring :: defined by option. */
8900 static void
8901 aarch64_parse_override_string (const char* input_string,
8902 struct tune_params* tune)
8904 const char separator = ':';
8905 size_t string_length = strlen (input_string) + 1;
8906 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8907 char *string = string_root;
8908 strncpy (string, input_string, string_length);
8909 string[string_length - 1] = '\0';
8911 char* ntoken = string;
8913 while ((ntoken = strchr (string, separator)))
8915 size_t token_length = ntoken - string;
8916 /* Make this substring look like a string. */
8917 *ntoken = '\0';
8918 aarch64_parse_one_override_token (string, token_length, tune);
8919 string = ++ntoken;
8922 /* One last option to parse. */
8923 aarch64_parse_one_override_token (string, strlen (string), tune);
8924 free (string_root);
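/* Example of the grammar above: an option such as

     -moverride=fuse=adrp+add.cmp+branch:tune=<dot-separated flags>

   is split on ':' into a "fuse=..." token and a "tune=..." token here,
   and each token is dispatched through aarch64_parse_one_override_token
   to aarch64_parse_fuse_string or aarch64_parse_tune_string above.  */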
8928 static void
8929 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8931 /* The logic here is that if we are disabling all frame pointer generation
8932 then we do not need to disable leaf frame pointer generation as a
8933 separate operation. But if we are *only* disabling leaf frame pointer
8934 generation then we set flag_omit_frame_pointer to true, but in
8935 aarch64_frame_pointer_required we return false only for leaf functions.
8937 PR 70044: We have to be careful about being called multiple times for the
8938 same function. Once we have decided to set flag_omit_frame_pointer just
8939 so that we can omit leaf frame pointers, we must then not interpret a
8940 second call as meaning that all frame pointer generation should be
8941 omitted. We do this by setting flag_omit_frame_pointer to a special,
8942 non-zero value. */
8943 if (opts->x_flag_omit_frame_pointer == 2)
8944 opts->x_flag_omit_frame_pointer = 0;
8946 if (opts->x_flag_omit_frame_pointer)
8947 opts->x_flag_omit_leaf_frame_pointer = false;
8948 else if (opts->x_flag_omit_leaf_frame_pointer)
8949 opts->x_flag_omit_frame_pointer = 2;
8951 /* If not optimizing for size, set the default
8952 alignment to what the target wants. */
8953 if (!opts->x_optimize_size)
8955 if (opts->x_align_loops <= 0)
8956 opts->x_align_loops = aarch64_tune_params.loop_align;
8957 if (opts->x_align_jumps <= 0)
8958 opts->x_align_jumps = aarch64_tune_params.jump_align;
8959 if (opts->x_align_functions <= 0)
8960 opts->x_align_functions = aarch64_tune_params.function_align;
8963 /* We default to no pc-relative literal loads. */
8965 aarch64_pcrelative_literal_loads = false;
8967 /* If -mpc-relative-literal-loads is set on the command line, this
8968 implies that the user asked for PC relative literal loads. */
8969 if (opts->x_pcrelative_literal_loads == 1)
8970 aarch64_pcrelative_literal_loads = true;
8972 /* This is PR70113. When building the Linux kernel with
8973 CONFIG_ARM64_ERRATUM_843419, support for relocations
8974 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8975 removed from the kernel to avoid loading objects with possibly
8976 offending sequences. Without -mpc-relative-literal-loads we would
8977 generate such relocations, preventing the kernel build from
8978 succeeding. */
8979 if (opts->x_pcrelative_literal_loads == 2
8980 && TARGET_FIX_ERR_A53_843419)
8981 aarch64_pcrelative_literal_loads = true;
8983 /* In the tiny memory model it makes no sense to disallow PC relative
8984 literal pool loads. */
8985 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8986 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8987 aarch64_pcrelative_literal_loads = true;
8989 /* When enabling the lower precision Newton series for the square root, also
8990 enable it for the reciprocal square root, since the latter is an
8991 intermediary step for the former. */
8992 if (flag_mlow_precision_sqrt)
8993 flag_mrecip_low_precision_sqrt = true;
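/* Worked example of the PR 70044 handling above: with
   -momit-leaf-frame-pointer but without -fomit-frame-pointer, the first
   call through here sets flag_omit_frame_pointer to the special value 2;
   a later call recognises the 2, resets it to 0 and re-derives the
   leaf-only behaviour, instead of misreading it as an explicit request
   to omit all frame pointers.  */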
8996 /* 'Unpack' up the internal tuning structs and update the options
8997 in OPTS. The caller must have set up selected_tune and selected_arch
8998 as all the other target-specific codegen decisions are
8999 derived from them. */
9001 void
9002 aarch64_override_options_internal (struct gcc_options *opts)
9004 aarch64_tune_flags = selected_tune->flags;
9005 aarch64_tune = selected_tune->sched_core;
9006 /* Make a copy of the tuning parameters attached to the core, which
9007 we may later overwrite. */
9008 aarch64_tune_params = *(selected_tune->tune);
9009 aarch64_architecture_version = selected_arch->architecture_version;
9011 if (opts->x_aarch64_override_tune_string)
9012 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9013 &aarch64_tune_params);
9015 /* This target defaults to strict volatile bitfields. */
9016 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9017 opts->x_flag_strict_volatile_bitfields = 1;
9019 initialize_aarch64_code_model (opts);
9020 initialize_aarch64_tls_size (opts);
9022 int queue_depth = 0;
9023 switch (aarch64_tune_params.autoprefetcher_model)
9025 case tune_params::AUTOPREFETCHER_OFF:
9026 queue_depth = -1;
9027 break;
9028 case tune_params::AUTOPREFETCHER_WEAK:
9029 queue_depth = 0;
9030 break;
9031 case tune_params::AUTOPREFETCHER_STRONG:
9032 queue_depth = max_insn_queue_index + 1;
9033 break;
9034 default:
9035 gcc_unreachable ();
9038 /* We don't mind passing in global_options_set here as we don't use
9039 the *options_set structs anyway. */
9040 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9041 queue_depth,
9042 opts->x_param_values,
9043 global_options_set.x_param_values);
9045 /* Set up parameters to be used in prefetching algorithm. Do not
9046 override the defaults unless we are tuning for a core we have
9047 researched values for. */
9048 if (aarch64_tune_params.prefetch->num_slots > 0)
9049 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9050 aarch64_tune_params.prefetch->num_slots,
9051 opts->x_param_values,
9052 global_options_set.x_param_values);
9053 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9054 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9055 aarch64_tune_params.prefetch->l1_cache_size,
9056 opts->x_param_values,
9057 global_options_set.x_param_values);
9058 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9059 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9060 aarch64_tune_params.prefetch->l1_cache_line_size,
9061 opts->x_param_values,
9062 global_options_set.x_param_values);
9063 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9064 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9065 aarch64_tune_params.prefetch->l2_cache_size,
9066 opts->x_param_values,
9067 global_options_set.x_param_values);
9069 /* Enable software prefetching at the specified optimization level for
9070 CPUs that have prefetch tuning data. Lower the optimization level
9071 threshold by 1 when profiling is enabled. */
9072 if (opts->x_flag_prefetch_loop_arrays < 0
9073 && !opts->x_optimize_size
9074 && aarch64_tune_params.prefetch->default_opt_level >= 0
9075 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9076 opts->x_flag_prefetch_loop_arrays = 1;
9078 aarch64_override_options_after_change_1 (opts);
9081 /* Print a hint with a suggestion for a core or architecture name that
9082 most closely resembles what the user passed in STR. ARCH is true if
9083 the user is asking for an architecture name. ARCH is false if the user
9084 is asking for a core name. */
9086 static void
9087 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9089 auto_vec<const char *> candidates;
9090 const struct processor *entry = arch ? all_architectures : all_cores;
9091 for (; entry->name != NULL; entry++)
9092 candidates.safe_push (entry->name);
9093 char *s;
9094 const char *hint = candidates_list_and_hint (str, s, candidates);
9095 if (hint)
9096 inform (input_location, "valid arguments are: %s;"
9097 " did you mean %qs?", s, hint);
9098 XDELETEVEC (s);
9101 /* Print a hint with a suggestion for a core name that most closely resembles
9102 what the user passed in STR. */
9104 inline static void
9105 aarch64_print_hint_for_core (const char *str)
9107 aarch64_print_hint_for_core_or_arch (str, false);
9110 /* Print a hint with a suggestion for an architecture name that most closely
9111 resembles what the user passed in STR. */
9113 inline static void
9114 aarch64_print_hint_for_arch (const char *str)
9116 aarch64_print_hint_for_core_or_arch (str, true);
9119 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9120 specified in STR and throw errors if appropriate. Put the results if
9121 they are valid in RES and ISA_FLAGS. Return whether the option is
9122 valid. */
9124 static bool
9125 aarch64_validate_mcpu (const char *str, const struct processor **res,
9126 unsigned long *isa_flags)
9128 enum aarch64_parse_opt_result parse_res
9129 = aarch64_parse_cpu (str, res, isa_flags);
9131 if (parse_res == AARCH64_PARSE_OK)
9132 return true;
9134 switch (parse_res)
9136 case AARCH64_PARSE_MISSING_ARG:
9137 error ("missing cpu name in %<-mcpu=%s%>", str);
9138 break;
9139 case AARCH64_PARSE_INVALID_ARG:
9140 error ("unknown value %qs for -mcpu", str);
9141 aarch64_print_hint_for_core (str);
9142 break;
9143 case AARCH64_PARSE_INVALID_FEATURE:
9144 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9145 break;
9146 default:
9147 gcc_unreachable ();
9150 return false;
9153 /* Validate a command-line -march option. Parse the arch and extensions
9154 (if any) specified in STR and throw errors if appropriate. Put the
9155 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9156 option is valid. */
9158 static bool
9159 aarch64_validate_march (const char *str, const struct processor **res,
9160 unsigned long *isa_flags)
9162 enum aarch64_parse_opt_result parse_res
9163 = aarch64_parse_arch (str, res, isa_flags);
9165 if (parse_res == AARCH64_PARSE_OK)
9166 return true;
9168 switch (parse_res)
9170 case AARCH64_PARSE_MISSING_ARG:
9171 error ("missing arch name in %<-march=%s%>", str);
9172 break;
9173 case AARCH64_PARSE_INVALID_ARG:
9174 error ("unknown value %qs for -march", str);
9175 aarch64_print_hint_for_arch (str);
9176 break;
9177 case AARCH64_PARSE_INVALID_FEATURE:
9178 error ("invalid feature modifier in %<-march=%s%>", str);
9179 break;
9180 default:
9181 gcc_unreachable ();
9184 return false;
9187 /* Validate a command-line -mtune option. Parse the cpu
9188 specified in STR and throw errors if appropriate. Put the
9189 result, if it is valid, in RES. Return whether the option is
9190 valid. */
9192 static bool
9193 aarch64_validate_mtune (const char *str, const struct processor **res)
9195 enum aarch64_parse_opt_result parse_res
9196 = aarch64_parse_tune (str, res);
9198 if (parse_res == AARCH64_PARSE_OK)
9199 return true;
9201 switch (parse_res)
9203 case AARCH64_PARSE_MISSING_ARG:
9204 error ("missing cpu name in %<-mtune=%s%>", str);
9205 break;
9206 case AARCH64_PARSE_INVALID_ARG:
9207 error ("unknown value %qs for -mtune", str);
9208 aarch64_print_hint_for_core (str);
9209 break;
9210 default:
9211 gcc_unreachable ();
9213 return false;
9216 /* Return the CPU corresponding to the enum CPU.
9217 If it doesn't specify a cpu, return the default. */
9219 static const struct processor *
9220 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9222 if (cpu != aarch64_none)
9223 return &all_cores[cpu];
9225 /* The & 0x3f is to extract the bottom 6 bits that encode the
9226 default cpu as selected by the --with-cpu GCC configure option
9227 in config.gcc.
9228 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9229 flags mechanism should be reworked to make it more sane. */
9230 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
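/* Note on the encoding above: TARGET_CPU_DEFAULT packs the configure-time
   cpu in its low 6 bits and the corresponding default ISA flags in the
   remaining bits, schematically

     default cpu   == (TARGET_CPU_DEFAULT & 0x3f)
     default flags == (TARGET_CPU_DEFAULT >> 6)

   the second half of which is consumed in aarch64_override_options
   below.  */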
9233 /* Return the architecture corresponding to the enum ARCH.
9234 If it doesn't specify a valid architecture, return the default. */
9236 static const struct processor *
9237 aarch64_get_arch (enum aarch64_arch arch)
9239 if (arch != aarch64_no_arch)
9240 return &all_architectures[arch];
9242 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9244 return &all_architectures[cpu->arch];
9247 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9248 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9249 tuning structs. In particular it must set selected_tune and
9250 aarch64_isa_flags that define the available ISA features and tuning
9251 decisions. It must also set selected_arch as this will be used to
9252 output the .arch asm tags for each function. */
9254 static void
9255 aarch64_override_options (void)
9257 unsigned long cpu_isa = 0;
9258 unsigned long arch_isa = 0;
9259 aarch64_isa_flags = 0;
9261 bool valid_cpu = true;
9262 bool valid_tune = true;
9263 bool valid_arch = true;
9265 selected_cpu = NULL;
9266 selected_arch = NULL;
9267 selected_tune = NULL;
9269 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9270 If either of -march or -mtune is given, they override their
9271 respective component of -mcpu. */
9272 if (aarch64_cpu_string)
9273 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9274 &cpu_isa);
9276 if (aarch64_arch_string)
9277 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9278 &arch_isa);
9280 if (aarch64_tune_string)
9281 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9283 /* If the user did not specify a processor, choose the default
9284 one for them. This will be the CPU set during configuration using
9285 --with-cpu, otherwise it is "generic". */
9286 if (!selected_cpu)
9288 if (selected_arch)
9290 selected_cpu = &all_cores[selected_arch->ident];
9291 aarch64_isa_flags = arch_isa;
9292 explicit_arch = selected_arch->arch;
9294 else
9296 /* Get default configure-time CPU. */
9297 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9298 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9301 if (selected_tune)
9302 explicit_tune_core = selected_tune->ident;
9304 /* If both -mcpu and -march are specified check that they are architecturally
9305 compatible, warn if they're not and prefer the -march ISA flags. */
9306 else if (selected_arch)
9308 if (selected_arch->arch != selected_cpu->arch)
9310 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9311 all_architectures[selected_cpu->arch].name,
9312 selected_arch->name);
9314 aarch64_isa_flags = arch_isa;
9315 explicit_arch = selected_arch->arch;
9316 explicit_tune_core = selected_tune ? selected_tune->ident
9317 : selected_cpu->ident;
9319 else
9321 /* -mcpu but no -march. */
9322 aarch64_isa_flags = cpu_isa;
9323 explicit_tune_core = selected_tune ? selected_tune->ident
9324 : selected_cpu->ident;
9325 gcc_assert (selected_cpu);
9326 selected_arch = &all_architectures[selected_cpu->arch];
9327 explicit_arch = selected_arch->arch;
9330 /* Set the arch as well, as we will need it when outputting
9331 the .arch directive in assembly. */
9332 if (!selected_arch)
9334 gcc_assert (selected_cpu);
9335 selected_arch = &all_architectures[selected_cpu->arch];
9338 if (!selected_tune)
9339 selected_tune = selected_cpu;
9341 #ifndef HAVE_AS_MABI_OPTION
9342 /* The compiler may have been configured with 2.23.* binutils, which does
9343 not have support for ILP32. */
9344 if (TARGET_ILP32)
9345 error ("Assembler does not support -mabi=ilp32");
9346 #endif
9348 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9349 sorry ("Return address signing is only supported for -mabi=lp64");
9351 /* Make sure we properly set up the explicit options. */
9352 if ((aarch64_cpu_string && valid_cpu)
9353 || (aarch64_tune_string && valid_tune))
9354 gcc_assert (explicit_tune_core != aarch64_none);
9356 if ((aarch64_cpu_string && valid_cpu)
9357 || (aarch64_arch_string && valid_arch))
9358 gcc_assert (explicit_arch != aarch64_no_arch);
9360 aarch64_override_options_internal (&global_options);
9362 /* Save these options as the default ones in case we push and pop them later
9363 while processing functions with potential target attributes. */
9364 target_option_default_node = target_option_current_node
9365 = build_target_option_node (&global_options);
9368 /* Implement targetm.override_options_after_change. */
9370 static void
9371 aarch64_override_options_after_change (void)
9373 aarch64_override_options_after_change_1 (&global_options);
9376 static struct machine_function *
9377 aarch64_init_machine_status (void)
9379 struct machine_function *machine;
9380 machine = ggc_cleared_alloc<machine_function> ();
9381 return machine;
9384 void
9385 aarch64_init_expanders (void)
9387 init_machine_status = aarch64_init_machine_status;
9390 /* A checking mechanism for the implementation of the various code models. */
9391 static void
9392 initialize_aarch64_code_model (struct gcc_options *opts)
9394 if (opts->x_flag_pic)
9396 switch (opts->x_aarch64_cmodel_var)
9398 case AARCH64_CMODEL_TINY:
9399 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9400 break;
9401 case AARCH64_CMODEL_SMALL:
9402 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9403 aarch64_cmodel = (flag_pic == 2
9404 ? AARCH64_CMODEL_SMALL_PIC
9405 : AARCH64_CMODEL_SMALL_SPIC);
9406 #else
9407 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9408 #endif
9409 break;
9410 case AARCH64_CMODEL_LARGE:
9411 sorry ("code model %qs with -f%s", "large",
9412 opts->x_flag_pic > 1 ? "PIC" : "pic");
9413 break;
9414 default:
9415 gcc_unreachable ();
9418 else
9419 aarch64_cmodel = opts->x_aarch64_cmodel_var;
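/* Summary of the mapping above, assuming an assembler with small PIC
   relocation support (HAVE_AS_SMALL_PIC_RELOCS):

     -mcmodel=tiny  with -fpic or -fPIC  ->  AARCH64_CMODEL_TINY_PIC
     -mcmodel=small with -fpic           ->  AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small with -fPIC           ->  AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large with any PIC flag    ->  rejected with sorry ()

   without that assembler support, both PIC levels of the small model
   fall back to AARCH64_CMODEL_SMALL_PIC.  */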
9422 /* Implement TARGET_OPTION_SAVE. */
9424 static void
9425 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9427 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9430 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9431 using the information saved in PTR. */
9433 static void
9434 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9436 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9437 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9438 opts->x_explicit_arch = ptr->x_explicit_arch;
9439 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9440 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9442 aarch64_override_options_internal (opts);
9445 /* Implement TARGET_OPTION_PRINT. */
9447 static void
9448 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9450 const struct processor *cpu
9451 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9452 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9453 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9454 std::string extension
9455 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9457 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9458 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9459 arch->name, extension.c_str ());
9462 static GTY(()) tree aarch64_previous_fndecl;
9464 void
9465 aarch64_reset_previous_fndecl (void)
9467 aarch64_previous_fndecl = NULL;
9470 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9471 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9472 make sure optab availability predicates are recomputed when necessary. */
9474 void
9475 aarch64_save_restore_target_globals (tree new_tree)
9477 if (TREE_TARGET_GLOBALS (new_tree))
9478 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9479 else if (new_tree == target_option_default_node)
9480 restore_target_globals (&default_target_globals);
9481 else
9482 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9485 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9486 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9487 of the function, if such exists. This function may be called multiple
9488 times on a single function so use aarch64_previous_fndecl to avoid
9489 setting up identical state. */
9491 static void
9492 aarch64_set_current_function (tree fndecl)
9494 if (!fndecl || fndecl == aarch64_previous_fndecl)
9495 return;
9497 tree old_tree = (aarch64_previous_fndecl
9498 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9499 : NULL_TREE);
9501 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9503 /* If current function has no attributes but the previous one did,
9504 use the default node. */
9505 if (!new_tree && old_tree)
9506 new_tree = target_option_default_node;
9508 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9509 the default have been handled by aarch64_save_restore_target_globals from
9510 aarch64_pragma_target_parse. */
9511 if (old_tree == new_tree)
9512 return;
9514 aarch64_previous_fndecl = fndecl;
9516 /* First set the target options. */
9517 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9519 aarch64_save_restore_target_globals (new_tree);
9522 /* Enum describing the various ways we can handle attributes.
9523 In many cases we can reuse the generic option handling machinery. */
9525 enum aarch64_attr_opt_type
9527 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9528 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9529 aarch64_attr_enum, /* Attribute sets an enum variable. */
9530 aarch64_attr_custom /* Attribute requires a custom handling function. */
9533 /* All the information needed to handle a target attribute.
9534 NAME is the name of the attribute.
9535 ATTR_TYPE specifies the type of behavior of the attribute as described
9536 in the definition of enum aarch64_attr_opt_type.
9537 ALLOW_NEG is true if the attribute supports a "no-" form.
9538 HANDLER is the function that takes the attribute string and whether
9539 it is a pragma or attribute and handles the option. It is needed only
9540 when the ATTR_TYPE is aarch64_attr_custom.
9541 OPT_NUM is the enum specifying the option that the attribute modifies.
9542 This is needed for attributes that mirror the behavior of a command-line
9543 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9544 aarch64_attr_enum. */
9546 struct aarch64_attribute_info
9548 const char *name;
9549 enum aarch64_attr_opt_type attr_type;
9550 bool allow_neg;
9551 bool (*handler) (const char *, const char *);
9552 enum opt_code opt_num;
9555 /* Handle the ARCH_STR argument to the arch= target attribute.
9556 PRAGMA_OR_ATTR is used in potential error messages. */
9558 static bool
9559 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9561 const struct processor *tmp_arch = NULL;
9562 enum aarch64_parse_opt_result parse_res
9563 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9565 if (parse_res == AARCH64_PARSE_OK)
9567 gcc_assert (tmp_arch);
9568 selected_arch = tmp_arch;
9569 explicit_arch = selected_arch->arch;
9570 return true;
9573 switch (parse_res)
9575 case AARCH64_PARSE_MISSING_ARG:
9576 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9577 break;
9578 case AARCH64_PARSE_INVALID_ARG:
9579 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9580 aarch64_print_hint_for_arch (str);
9581 break;
9582 case AARCH64_PARSE_INVALID_FEATURE:
9583 error ("invalid feature modifier %qs for 'arch' target %s",
9584 str, pragma_or_attr);
9585 break;
9586 default:
9587 gcc_unreachable ();
9590 return false;
9593 /* Handle the argument CPU_STR to the cpu= target attribute.
9594 PRAGMA_OR_ATTR is used in potential error messages. */
9596 static bool
9597 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9599 const struct processor *tmp_cpu = NULL;
9600 enum aarch64_parse_opt_result parse_res
9601 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9603 if (parse_res == AARCH64_PARSE_OK)
9605 gcc_assert (tmp_cpu);
9606 selected_tune = tmp_cpu;
9607 explicit_tune_core = selected_tune->ident;
9609 selected_arch = &all_architectures[tmp_cpu->arch];
9610 explicit_arch = selected_arch->arch;
9611 return true;
9614 switch (parse_res)
9616 case AARCH64_PARSE_MISSING_ARG:
9617 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9618 break;
9619 case AARCH64_PARSE_INVALID_ARG:
9620 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9621 aarch64_print_hint_for_core (str);
9622 break;
9623 case AARCH64_PARSE_INVALID_FEATURE:
9624 error ("invalid feature modifier %qs for 'cpu' target %s",
9625 str, pragma_or_attr);
9626 break;
9627 default:
9628 gcc_unreachable ();
9631 return false;
9634 /* Handle the argument STR to the tune= target attribute.
9635 PRAGMA_OR_ATTR is used in potential error messages. */
9637 static bool
9638 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9640 const struct processor *tmp_tune = NULL;
9641 enum aarch64_parse_opt_result parse_res
9642 = aarch64_parse_tune (str, &tmp_tune);
9644 if (parse_res == AARCH64_PARSE_OK)
9646 gcc_assert (tmp_tune);
9647 selected_tune = tmp_tune;
9648 explicit_tune_core = selected_tune->ident;
9649 return true;
9652 switch (parse_res)
9654 case AARCH64_PARSE_INVALID_ARG:
9655 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9656 aarch64_print_hint_for_core (str);
9657 break;
9658 default:
9659 gcc_unreachable ();
9662 return false;
9665 /* Parse an architecture extensions target attribute string specified in STR.
9666 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9667 if successful. Update aarch64_isa_flags to reflect the ISA features
9668 modified.
9669 PRAGMA_OR_ATTR is used in potential error messages. */
9671 static bool
9672 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9674 enum aarch64_parse_opt_result parse_res;
9675 unsigned long isa_flags = aarch64_isa_flags;
9677 /* We allow "+nothing" in the beginning to clear out all architectural
9678 features if the user wants to handpick specific features. */
9679 if (strncmp ("+nothing", str, 8) == 0)
9681 isa_flags = 0;
9682 str += 8;
9685 parse_res = aarch64_parse_extension (str, &isa_flags);
9687 if (parse_res == AARCH64_PARSE_OK)
9689 aarch64_isa_flags = isa_flags;
9690 return true;
9693 switch (parse_res)
9695 case AARCH64_PARSE_MISSING_ARG:
9696 error ("missing feature modifier in target %s %qs",
9697 pragma_or_attr, str);
9698 break;
9700 case AARCH64_PARSE_INVALID_FEATURE:
9701 error ("invalid feature modifier in target %s %qs",
9702 pragma_or_attr, str);
9703 break;
9705 default:
9706 gcc_unreachable ();
9709 return false;
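/* Example of the handling above: a declaration such as

     __attribute__ ((target ("+nothing+simd")))

   first clears the ISA flags because of the leading "+nothing" and then
   re-enables only the modifiers that follow it ("+simd" here is just an
   illustrative choice), via aarch64_parse_extension.  */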
9712 /* The target attributes that we support. On top of these we also support just
9713 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9714 handled explicitly in aarch64_process_one_target_attr. */
9716 static const struct aarch64_attribute_info aarch64_attributes[] =
9718 { "general-regs-only", aarch64_attr_mask, false, NULL,
9719 OPT_mgeneral_regs_only },
9720 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9721 OPT_mfix_cortex_a53_835769 },
9722 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9723 OPT_mfix_cortex_a53_843419 },
9724 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9725 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9726 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9727 OPT_momit_leaf_frame_pointer },
9728 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9729 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9730 OPT_march_ },
9731 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9732 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9733 OPT_mtune_ },
9734 { "sign-return-address", aarch64_attr_enum, false, NULL,
9735 OPT_msign_return_address_ },
9736 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9739 /* Parse ARG_STR which contains the definition of one target attribute.
9740 Show appropriate errors if any or return true if the attribute is valid.
9741 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9742 we're processing a target attribute or pragma. */
9744 static bool
9745 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9747 bool invert = false;
9749 size_t len = strlen (arg_str);
9751 if (len == 0)
9753 error ("malformed target %s", pragma_or_attr);
9754 return false;
9757 char *str_to_check = (char *) alloca (len + 1);
9758 strcpy (str_to_check, arg_str);
9760 /* Skip leading whitespace. */
9761 while (*str_to_check == ' ' || *str_to_check == '\t')
9762 str_to_check++;
9764 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9765 It is easier to detect and handle it explicitly here rather than going
9766 through the machinery for the rest of the target attributes in this
9767 function. */
9768 if (*str_to_check == '+')
9769 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9771 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9773 invert = true;
9774 str_to_check += 3;
9776 char *arg = strchr (str_to_check, '=');
9778 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9779 and point ARG to "foo". */
9780 if (arg)
9782 *arg = '\0';
9783 arg++;
9785 const struct aarch64_attribute_info *p_attr;
9786 bool found = false;
9787 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9789 /* If the names don't match up, or the user has given an argument
9790 to an attribute that doesn't accept one, or didn't give an argument
9791 to an attribute that expects one, fail to match. */
9792 if (strcmp (str_to_check, p_attr->name) != 0)
9793 continue;
9795 found = true;
9796 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9797 || p_attr->attr_type == aarch64_attr_enum;
9799 if (attr_need_arg_p ^ (arg != NULL))
9801 error ("target %s %qs does not accept an argument",
9802 pragma_or_attr, str_to_check);
9803 return false;
9806 /* If the name matches but the attribute does not allow "no-" versions
9807 then we can't match. */
9808 if (invert && !p_attr->allow_neg)
9810 error ("target %s %qs does not allow a negated form",
9811 pragma_or_attr, str_to_check);
9812 return false;
9815 switch (p_attr->attr_type)
9817 /* Has a custom handler registered.
9818 For example, cpu=, arch=, tune=. */
9819 case aarch64_attr_custom:
9820 gcc_assert (p_attr->handler);
9821 if (!p_attr->handler (arg, pragma_or_attr))
9822 return false;
9823 break;
9825 /* Either set or unset a boolean option. */
9826 case aarch64_attr_bool:
9828 struct cl_decoded_option decoded;
9830 generate_option (p_attr->opt_num, NULL, !invert,
9831 CL_TARGET, &decoded);
9832 aarch64_handle_option (&global_options, &global_options_set,
9833 &decoded, input_location);
9834 break;
9836 /* Set or unset a bit in the target_flags. aarch64_handle_option
9837 should know what mask to apply given the option number. */
9838 case aarch64_attr_mask:
9840 struct cl_decoded_option decoded;
9841 /* We only need to specify the option number.
9842 aarch64_handle_option will know which mask to apply. */
9843 decoded.opt_index = p_attr->opt_num;
9844 decoded.value = !invert;
9845 aarch64_handle_option (&global_options, &global_options_set,
9846 &decoded, input_location);
9847 break;
9849 /* Use the option setting machinery to set an option to an enum. */
9850 case aarch64_attr_enum:
9852 gcc_assert (arg);
9853 bool valid;
9854 int value;
9855 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9856 &value, CL_TARGET);
9857 if (valid)
9859 set_option (&global_options, NULL, p_attr->opt_num, value,
9860 NULL, DK_UNSPECIFIED, input_location,
9861 global_dc);
9863 else
9865 error ("target %s %s=%s is not valid",
9866 pragma_or_attr, str_to_check, arg);
9868 break;
9870 default:
9871 gcc_unreachable ();
9875 /* If we reached here we either have found an attribute and validated
9876 it or didn't match any. If we matched an attribute but its arguments
9877 were malformed we will have returned false already. */
9878 return found;
9881 /* Count how many times the character C appears in
9882 the NUL-terminated string STR. */
9884 static unsigned int
9885 num_occurences_in_str (char c, char *str)
9887 unsigned int res = 0;
9888 while (*str != '\0')
9890 if (*str == c)
9891 res++;
9893 str++;
9896 return res;
9899 /* Parse the tree in ARGS that contains the target attribute information
9900 and update the global target options space. PRAGMA_OR_ATTR is a string
9901 to be used in error messages, specifying whether this is processing
9902 a target attribute or a target pragma. */
9904 bool
9905 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9907 if (TREE_CODE (args) == TREE_LIST)
9911 tree head = TREE_VALUE (args);
9912 if (head)
9914 if (!aarch64_process_target_attr (head, pragma_or_attr))
9915 return false;
9917 args = TREE_CHAIN (args);
9918 } while (args);
9920 return true;
9923 if (TREE_CODE (args) != STRING_CST)
9925 error ("attribute %<target%> argument not a string");
9926 return false;
9929 size_t len = strlen (TREE_STRING_POINTER (args));
9930 char *str_to_check = (char *) alloca (len + 1);
9931 strcpy (str_to_check, TREE_STRING_POINTER (args));
9933 if (len == 0)
9935 error ("malformed target %s value", pragma_or_attr);
9936 return false;
9939 /* Used to catch empty spaces between commas i.e.
9940 attribute ((target ("attr1,,attr2"))). */
9941 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9943 /* Handle multiple target attributes separated by ','. */
9944 char *token = strtok (str_to_check, ",");
9946 unsigned int num_attrs = 0;
9947 while (token)
9949 num_attrs++;
9950 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9952 error ("target %s %qs is invalid", pragma_or_attr, token);
9953 return false;
9956 token = strtok (NULL, ",");
9959 if (num_attrs != num_commas + 1)
9961 error ("malformed target %s list %qs",
9962 pragma_or_attr, TREE_STRING_POINTER (args));
9963 return false;
9966 return true;
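/* Illustrative note (added commentary, not part of the original source):
   the strings handled above are the ones users write directly, e.g.

     __attribute__ ((target ("arch=armv8-a+crc,tune=cortex-a57")))
     int f (int x) { return x + 1; }

     #pragma GCC target ("cpu=cortex-a57")

   Each comma-separated token goes through aarch64_process_one_target_attr,
   and the num_commas check rejects empty tokens such as the
   "attr1,,attr2" case mentioned above.  The option names shown here are
   only examples.  */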
9969 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9970 process attribute ((target ("..."))). */
9972 static bool
9973 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9975 struct cl_target_option cur_target;
9976 bool ret;
9977 tree old_optimize;
9978 tree new_target, new_optimize;
9979 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9981 /* If what we're processing is the current pragma string then the
9982 target option node is already stored in target_option_current_node
9983 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9984 having to re-parse the string. This is especially useful to keep
9985 arm_neon.h compile times down since that header contains a lot
9986 of intrinsics enclosed in pragmas. */
9987 if (!existing_target && args == current_target_pragma)
9989 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9990 return true;
9992 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9994 old_optimize = build_optimization_node (&global_options);
9995 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9997 /* If the function changed the optimization levels as well as setting
9998 target options, start with the optimizations specified. */
9999 if (func_optimize && func_optimize != old_optimize)
10000 cl_optimization_restore (&global_options,
10001 TREE_OPTIMIZATION (func_optimize));
10003 /* Save the current target options to restore at the end. */
10004 cl_target_option_save (&cur_target, &global_options);
10006 /* If fndecl already has some target attributes applied to it, unpack
10007 them so that we add this attribute on top of them, rather than
10008 overwriting them. */
10009 if (existing_target)
10011 struct cl_target_option *existing_options
10012 = TREE_TARGET_OPTION (existing_target);
10014 if (existing_options)
10015 cl_target_option_restore (&global_options, existing_options);
10017 else
10018 cl_target_option_restore (&global_options,
10019 TREE_TARGET_OPTION (target_option_current_node));
10022 ret = aarch64_process_target_attr (args, "attribute");
10024 /* Set up any additional state. */
10025 if (ret)
10027 aarch64_override_options_internal (&global_options);
10028 /* Initialize SIMD builtins if we haven't already.
10029 Set current_target_pragma to NULL for the duration so that
10030 the builtin initialization code doesn't try to tag the functions
10031 being built with the attributes specified by any current pragma, thus
10032 going into an infinite recursion. */
10033 if (TARGET_SIMD)
10035 tree saved_current_target_pragma = current_target_pragma;
10036 current_target_pragma = NULL;
10037 aarch64_init_simd_builtins ();
10038 current_target_pragma = saved_current_target_pragma;
10040 new_target = build_target_option_node (&global_options);
10042 else
10043 new_target = NULL;
10045 new_optimize = build_optimization_node (&global_options);
10047 if (fndecl && ret)
10049 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10051 if (old_optimize != new_optimize)
10052 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10055 cl_target_option_restore (&global_options, &cur_target);
10057 if (old_optimize != new_optimize)
10058 cl_optimization_restore (&global_options,
10059 TREE_OPTIMIZATION (old_optimize));
10060 return ret;
10063 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10064 tri-bool options (yes, no, don't care) and the default value is
10065 DEF, determine whether to reject inlining. */
10067 static bool
10068 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10069 int dont_care, int def)
10071 /* If the callee doesn't care, always allow inlining. */
10072 if (callee == dont_care)
10073 return true;
10075 /* If the caller doesn't care, always allow inlining. */
10076 if (caller == dont_care)
10077 return true;
10079 /* Otherwise, allow inlining if either the callee and caller values
10080 agree, or if the callee is using the default value. */
10081 return (callee == caller || callee == def);
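/* Illustrative summary (added commentary, not part of the original
   source): with DONT_CARE == 2 the helper above accepts inlining when
   either side does not care, when both sides agree, or when the callee
   merely uses the default:

       caller  callee   inlining allowed?
         2       *      yes
         *       2      yes
         x       x      yes
         x       y      only if y == DEF

   so an explicit, mismatching request in the callee is the only thing
   that blocks inlining here.  */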
10084 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10085 to inline CALLEE into CALLER based on target-specific info.
10086 Make sure that the caller and callee have compatible architectural
10087 features. Then go through the other possible target attributes
10088 and see if they can block inlining. Try not to reject always_inline
10089 callees unless they are incompatible architecturally. */
10091 static bool
10092 aarch64_can_inline_p (tree caller, tree callee)
10094 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10095 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10097 /* If callee has no option attributes, then it is ok to inline. */
10098 if (!callee_tree)
10099 return true;
10101 struct cl_target_option *caller_opts
10102 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10103 : target_option_default_node);
10105 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10108 /* Callee's ISA flags should be a subset of the caller's. */
10109 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10110 != callee_opts->x_aarch64_isa_flags)
10111 return false;
10113 /* Allow a callee compiled without strict alignment to be inlined into
10114 a strict-align caller, but not the reverse. */
10115 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10116 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10117 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10118 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10119 return false;
10121 bool always_inline = lookup_attribute ("always_inline",
10122 DECL_ATTRIBUTES (callee));
10124 /* If the architectural features match up and the callee is always_inline
10125 then the other attributes don't matter. */
10126 if (always_inline)
10127 return true;
10129 if (caller_opts->x_aarch64_cmodel_var
10130 != callee_opts->x_aarch64_cmodel_var)
10131 return false;
10133 if (caller_opts->x_aarch64_tls_dialect
10134 != callee_opts->x_aarch64_tls_dialect)
10135 return false;
10137 /* Honour explicit requests to workaround errata. */
10138 if (!aarch64_tribools_ok_for_inlining_p (
10139 caller_opts->x_aarch64_fix_a53_err835769,
10140 callee_opts->x_aarch64_fix_a53_err835769,
10141 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10142 return false;
10144 if (!aarch64_tribools_ok_for_inlining_p (
10145 caller_opts->x_aarch64_fix_a53_err843419,
10146 callee_opts->x_aarch64_fix_a53_err843419,
10147 2, TARGET_FIX_ERR_A53_843419))
10148 return false;
10150 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10151 caller and callee and they don't match up, reject inlining. */
10152 if (!aarch64_tribools_ok_for_inlining_p (
10153 caller_opts->x_flag_omit_leaf_frame_pointer,
10154 callee_opts->x_flag_omit_leaf_frame_pointer,
10155 2, 1))
10156 return false;
10158 /* If the callee has specific tuning overrides, respect them. */
10159 if (callee_opts->x_aarch64_override_tune_string != NULL
10160 && caller_opts->x_aarch64_override_tune_string == NULL)
10161 return false;
10163 /* If the user specified tuning override strings for the
10164 caller and callee and they don't match up, reject inlining.
10165 We just do a string compare here, we don't analyze the meaning
10166 of the string, as it would be too costly for little gain. */
10167 if (callee_opts->x_aarch64_override_tune_string
10168 && caller_opts->x_aarch64_override_tune_string
10169 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10170 caller_opts->x_aarch64_override_tune_string) != 0))
10171 return false;
10173 return true;
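/* Hedged example (added commentary, not part of the original source):
   the ISA-flag subset check above means that a callee sketched as

     __attribute__ ((target ("+crc")))
     static inline uint32_t step (uint32_t acc, uint32_t data);

   can be inlined only into callers whose target options also include the
   CRC extension (e.g. -march=armv8-a+crc or a matching target attribute);
   with a plain -march=armv8-a caller the callee's ISA flags are not a
   subset of the caller's, so inlining is refused even for always_inline
   callees.  */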
10176 /* Return true if SYMBOL_REF X binds locally. */
10178 static bool
10179 aarch64_symbol_binds_local_p (const_rtx x)
10181 return (SYMBOL_REF_DECL (x)
10182 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10183 : SYMBOL_REF_LOCAL_P (x));
10186 /* Return true if SYMBOL_REF X is thread local */
10187 static bool
10188 aarch64_tls_symbol_p (rtx x)
10190 if (! TARGET_HAVE_TLS)
10191 return false;
10193 if (GET_CODE (x) != SYMBOL_REF)
10194 return false;
10196 return SYMBOL_REF_TLS_MODEL (x) != 0;
10199 /* Classify a TLS symbol into one of the TLS kinds. */
10200 enum aarch64_symbol_type
10201 aarch64_classify_tls_symbol (rtx x)
10203 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10205 switch (tls_kind)
10207 case TLS_MODEL_GLOBAL_DYNAMIC:
10208 case TLS_MODEL_LOCAL_DYNAMIC:
10209 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10211 case TLS_MODEL_INITIAL_EXEC:
10212 switch (aarch64_cmodel)
10214 case AARCH64_CMODEL_TINY:
10215 case AARCH64_CMODEL_TINY_PIC:
10216 return SYMBOL_TINY_TLSIE;
10217 default:
10218 return SYMBOL_SMALL_TLSIE;
10221 case TLS_MODEL_LOCAL_EXEC:
10222 if (aarch64_tls_size == 12)
10223 return SYMBOL_TLSLE12;
10224 else if (aarch64_tls_size == 24)
10225 return SYMBOL_TLSLE24;
10226 else if (aarch64_tls_size == 32)
10227 return SYMBOL_TLSLE32;
10228 else if (aarch64_tls_size == 48)
10229 return SYMBOL_TLSLE48;
10230 else
10231 gcc_unreachable ();
10233 case TLS_MODEL_EMULATED:
10234 case TLS_MODEL_NONE:
10235 return SYMBOL_FORCE_TO_MEM;
10237 default:
10238 gcc_unreachable ();
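/* Illustrative mapping (added commentary, not part of the original
   source): for the default small code model and the default
   -mtls-size=24 the switch above yields

     global/local dynamic  ->  SYMBOL_SMALL_TLSDESC  (-mtls-dialect=desc)
                               or SYMBOL_SMALL_TLSGD (-mtls-dialect=trad)
     initial exec          ->  SYMBOL_SMALL_TLSIE
     local exec            ->  SYMBOL_TLSLE24
     emulated / no TLS     ->  SYMBOL_FORCE_TO_MEM

   The tiny code model swaps in SYMBOL_TINY_TLSIE for initial exec, and
   other -mtls-size values pick the corresponding SYMBOL_TLSLE variant.  */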
10242 /* Return the method that should be used to access SYMBOL_REF or
10243 LABEL_REF X. */
10245 enum aarch64_symbol_type
10246 aarch64_classify_symbol (rtx x, rtx offset)
10248 if (GET_CODE (x) == LABEL_REF)
10250 switch (aarch64_cmodel)
10252 case AARCH64_CMODEL_LARGE:
10253 return SYMBOL_FORCE_TO_MEM;
10255 case AARCH64_CMODEL_TINY_PIC:
10256 case AARCH64_CMODEL_TINY:
10257 return SYMBOL_TINY_ABSOLUTE;
10259 case AARCH64_CMODEL_SMALL_SPIC:
10260 case AARCH64_CMODEL_SMALL_PIC:
10261 case AARCH64_CMODEL_SMALL:
10262 return SYMBOL_SMALL_ABSOLUTE;
10264 default:
10265 gcc_unreachable ();
10269 if (GET_CODE (x) == SYMBOL_REF)
10271 if (aarch64_tls_symbol_p (x))
10272 return aarch64_classify_tls_symbol (x);
10274 switch (aarch64_cmodel)
10276 case AARCH64_CMODEL_TINY:
10277 /* When we retrieve symbol + offset address, we have to make sure
10278 the offset does not cause overflow of the final address. But
10279 we have no way of knowing the address of symbol at compile time
10280 so we can't accurately say if the distance between the PC and
10281 symbol + offset is outside the addressable range of +/-1M in the
10282 TINY code model. So we rely on images not being greater than
10283 1M, cap the offset at 1M, and anything beyond that will have to
10284 be loaded using an alternative mechanism. Furthermore, if the
10285 symbol is a weak reference to something that isn't known to
10286 resolve to a symbol in this module, then force to memory. */
10287 if ((SYMBOL_REF_WEAK (x)
10288 && !aarch64_symbol_binds_local_p (x))
10289 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10290 return SYMBOL_FORCE_TO_MEM;
10291 return SYMBOL_TINY_ABSOLUTE;
10293 case AARCH64_CMODEL_SMALL:
10294 /* Same reasoning as the tiny code model, but the offset cap here is
10295 4G. */
10296 if ((SYMBOL_REF_WEAK (x)
10297 && !aarch64_symbol_binds_local_p (x))
10298 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10299 HOST_WIDE_INT_C (4294967264)))
10300 return SYMBOL_FORCE_TO_MEM;
10301 return SYMBOL_SMALL_ABSOLUTE;
10303 case AARCH64_CMODEL_TINY_PIC:
10304 if (!aarch64_symbol_binds_local_p (x))
10305 return SYMBOL_TINY_GOT;
10306 return SYMBOL_TINY_ABSOLUTE;
10308 case AARCH64_CMODEL_SMALL_SPIC:
10309 case AARCH64_CMODEL_SMALL_PIC:
10310 if (!aarch64_symbol_binds_local_p (x))
10311 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10312 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10313 return SYMBOL_SMALL_ABSOLUTE;
10315 case AARCH64_CMODEL_LARGE:
10316 /* This is alright even in PIC code as the constant
10317 pool reference is always PC relative and within
10318 the same translation unit. */
10319 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10320 return SYMBOL_SMALL_ABSOLUTE;
10321 else
10322 return SYMBOL_FORCE_TO_MEM;
10324 default:
10325 gcc_unreachable ();
10329 /* By default push everything into the constant pool. */
10330 return SYMBOL_FORCE_TO_MEM;
10333 bool
10334 aarch64_constant_address_p (rtx x)
10336 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10339 bool
10340 aarch64_legitimate_pic_operand_p (rtx x)
10342 if (GET_CODE (x) == SYMBOL_REF
10343 || (GET_CODE (x) == CONST
10344 && GET_CODE (XEXP (x, 0)) == PLUS
10345 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10346 return false;
10348 return true;
10351 /* Return true if X holds either a quarter-precision floating-point
10352 constant or floating-point +0.0. */
10353 static bool
10354 aarch64_valid_floating_const (rtx x)
10356 if (!CONST_DOUBLE_P (x))
10357 return false;
10359 /* This call determines which constants can be used in mov<mode>
10360 as integer moves instead of constant loads. */
10361 if (aarch64_float_const_rtx_p (x))
10362 return true;
10364 return aarch64_float_const_representable_p (x);
10367 static bool
10368 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10370 /* Do not allow vector struct mode constants. We could support
10371 0 and -1 easily, but they need support in aarch64-simd.md. */
10372 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10373 return false;
10375 /* For these cases we never want to use a literal load.
10376 As such we have to prevent the compiler from forcing these
10377 to memory. */
10378 if ((GET_CODE (x) == CONST_VECTOR
10379 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10380 || CONST_INT_P (x)
10381 || aarch64_valid_floating_const (x)
10382 || aarch64_can_const_movi_rtx_p (x, mode)
10383 || aarch64_float_const_rtx_p (x))
10384 return !targetm.cannot_force_const_mem (mode, x);
10386 if (GET_CODE (x) == HIGH
10387 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10388 return true;
10390 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10391 so spilling them is better than rematerialization. */
10392 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10393 return true;
10395 return aarch64_constant_address_p (x);
10398 static rtx
10399 aarch64_load_tp (rtx target)
10401 if (!target
10402 || GET_MODE (target) != Pmode
10403 || !register_operand (target, Pmode))
10404 target = gen_reg_rtx (Pmode);
10406 /* Can return in any reg. */
10407 emit_insn (gen_aarch64_load_tp_hard (target));
10408 return target;
10411 /* On AAPCS systems, this is the "struct __va_list". */
10412 static GTY(()) tree va_list_type;
10414 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10415 Return the type to use as __builtin_va_list.
10417 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10419 struct __va_list
10421 void *__stack;
10422 void *__gr_top;
10423 void *__vr_top;
10424 int __gr_offs;
10425 int __vr_offs;
10426 }; */
10428 static tree
10429 aarch64_build_builtin_va_list (void)
10431 tree va_list_name;
10432 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10434 /* Create the type. */
10435 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10436 /* Give it the required name. */
10437 va_list_name = build_decl (BUILTINS_LOCATION,
10438 TYPE_DECL,
10439 get_identifier ("__va_list"),
10440 va_list_type);
10441 DECL_ARTIFICIAL (va_list_name) = 1;
10442 TYPE_NAME (va_list_type) = va_list_name;
10443 TYPE_STUB_DECL (va_list_type) = va_list_name;
10445 /* Create the fields. */
10446 f_stack = build_decl (BUILTINS_LOCATION,
10447 FIELD_DECL, get_identifier ("__stack"),
10448 ptr_type_node);
10449 f_grtop = build_decl (BUILTINS_LOCATION,
10450 FIELD_DECL, get_identifier ("__gr_top"),
10451 ptr_type_node);
10452 f_vrtop = build_decl (BUILTINS_LOCATION,
10453 FIELD_DECL, get_identifier ("__vr_top"),
10454 ptr_type_node);
10455 f_groff = build_decl (BUILTINS_LOCATION,
10456 FIELD_DECL, get_identifier ("__gr_offs"),
10457 integer_type_node);
10458 f_vroff = build_decl (BUILTINS_LOCATION,
10459 FIELD_DECL, get_identifier ("__vr_offs"),
10460 integer_type_node);
10462 /* Tell tree-stdarg pass about our internal offset fields.
10463 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10464 purposes, to identify whether the code is updating the va_list internal
10465 offset fields in an irregular way. */
10466 va_list_gpr_counter_field = f_groff;
10467 va_list_fpr_counter_field = f_vroff;
10469 DECL_ARTIFICIAL (f_stack) = 1;
10470 DECL_ARTIFICIAL (f_grtop) = 1;
10471 DECL_ARTIFICIAL (f_vrtop) = 1;
10472 DECL_ARTIFICIAL (f_groff) = 1;
10473 DECL_ARTIFICIAL (f_vroff) = 1;
10475 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10476 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10477 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10478 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10479 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10481 TYPE_FIELDS (va_list_type) = f_stack;
10482 DECL_CHAIN (f_stack) = f_grtop;
10483 DECL_CHAIN (f_grtop) = f_vrtop;
10484 DECL_CHAIN (f_vrtop) = f_groff;
10485 DECL_CHAIN (f_groff) = f_vroff;
10487 /* Compute its layout. */
10488 layout_type (va_list_type);
10490 return va_list_type;
10493 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10494 static void
10495 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10497 const CUMULATIVE_ARGS *cum;
10498 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10499 tree stack, grtop, vrtop, groff, vroff;
10500 tree t;
10501 int gr_save_area_size = cfun->va_list_gpr_size;
10502 int vr_save_area_size = cfun->va_list_fpr_size;
10503 int vr_offset;
10505 cum = &crtl->args.info;
10506 if (cfun->va_list_gpr_size)
10507 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10508 cfun->va_list_gpr_size);
10509 if (cfun->va_list_fpr_size)
10510 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10511 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10513 if (!TARGET_FLOAT)
10515 gcc_assert (cum->aapcs_nvrn == 0);
10516 vr_save_area_size = 0;
10519 f_stack = TYPE_FIELDS (va_list_type_node);
10520 f_grtop = DECL_CHAIN (f_stack);
10521 f_vrtop = DECL_CHAIN (f_grtop);
10522 f_groff = DECL_CHAIN (f_vrtop);
10523 f_vroff = DECL_CHAIN (f_groff);
10525 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10526 NULL_TREE);
10527 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10528 NULL_TREE);
10529 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10530 NULL_TREE);
10531 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10532 NULL_TREE);
10533 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10534 NULL_TREE);
10536 /* Emit code to initialize STACK, which points to the next varargs stack
10537 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10538 by named arguments. STACK is 8-byte aligned. */
10539 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10540 if (cum->aapcs_stack_size > 0)
10541 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10542 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10543 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10545 /* Emit code to initialize GRTOP, the top of the GR save area.
10546 virtual_incoming_args_rtx should have been 16 byte aligned. */
10547 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10548 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10549 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10551 /* Emit code to initialize VRTOP, the top of the VR save area.
10552 This address is gr_save_area_bytes below GRTOP, rounded
10553 down to the next 16-byte boundary. */
10554 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10555 vr_offset = ROUND_UP (gr_save_area_size,
10556 STACK_BOUNDARY / BITS_PER_UNIT);
10558 if (vr_offset)
10559 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10560 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10561 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10563 /* Emit code to initialize GROFF, the offset from GRTOP of the
10564 next GPR argument. */
10565 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10566 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10567 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10569 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10570 of the next VR argument. */
10571 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10572 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10573 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
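/* Illustrative sketch (added commentary, not part of the original
   source): for a variadic callee such as

     int sum (int n, ...);

   the code above leaves the va_list roughly in this state, with the
   offsets negative and counting up towards zero as argument registers
   are consumed:

     ap.__stack   = <address of the first stack-passed vararg>;
     ap.__gr_top  = <end (highest address) of the saved X-register area>;
     ap.__vr_top  = <end (highest address) of the saved V-register area>;
     ap.__gr_offs = -(number of unused GP argument registers) * 8;
     ap.__vr_offs = -(number of unused FP/SIMD argument registers) * 16;

   This mirrors the AAPCS64 va_list description; the concrete values
   depend on how many named arguments were passed in registers.  */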
10576 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10578 static tree
10579 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10580 gimple_seq *post_p ATTRIBUTE_UNUSED)
10582 tree addr;
10583 bool indirect_p;
10584 bool is_ha; /* is HFA or HVA. */
10585 bool dw_align; /* double-word align. */
10586 machine_mode ag_mode = VOIDmode;
10587 int nregs;
10588 machine_mode mode;
10590 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10591 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10592 HOST_WIDE_INT size, rsize, adjust, align;
10593 tree t, u, cond1, cond2;
10595 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10596 if (indirect_p)
10597 type = build_pointer_type (type);
10599 mode = TYPE_MODE (type);
10601 f_stack = TYPE_FIELDS (va_list_type_node);
10602 f_grtop = DECL_CHAIN (f_stack);
10603 f_vrtop = DECL_CHAIN (f_grtop);
10604 f_groff = DECL_CHAIN (f_vrtop);
10605 f_vroff = DECL_CHAIN (f_groff);
10607 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10608 f_stack, NULL_TREE);
10609 size = int_size_in_bytes (type);
10610 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10612 dw_align = false;
10613 adjust = 0;
10614 if (aarch64_vfp_is_call_or_return_candidate (mode,
10615 type,
10616 &ag_mode,
10617 &nregs,
10618 &is_ha))
10620 /* TYPE passed in fp/simd registers. */
10621 if (!TARGET_FLOAT)
10622 aarch64_err_no_fpadvsimd (mode, "varargs");
10624 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10625 unshare_expr (valist), f_vrtop, NULL_TREE);
10626 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10627 unshare_expr (valist), f_vroff, NULL_TREE);
10629 rsize = nregs * UNITS_PER_VREG;
10631 if (is_ha)
10633 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10634 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10636 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10637 && size < UNITS_PER_VREG)
10639 adjust = UNITS_PER_VREG - size;
10642 else
10644 /* TYPE passed in general registers. */
10645 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10646 unshare_expr (valist), f_grtop, NULL_TREE);
10647 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10648 unshare_expr (valist), f_groff, NULL_TREE);
10649 rsize = ROUND_UP (size, UNITS_PER_WORD);
10650 nregs = rsize / UNITS_PER_WORD;
10652 if (align > 8)
10653 dw_align = true;
10655 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10656 && size < UNITS_PER_WORD)
10658 adjust = UNITS_PER_WORD - size;
10662 /* Get a local temporary for the field value. */
10663 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10665 /* Emit code to branch if off >= 0. */
10666 t = build2 (GE_EXPR, boolean_type_node, off,
10667 build_int_cst (TREE_TYPE (off), 0));
10668 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10670 if (dw_align)
10672 /* Emit: offs = (offs + 15) & -16. */
10673 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10674 build_int_cst (TREE_TYPE (off), 15));
10675 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10676 build_int_cst (TREE_TYPE (off), -16));
10677 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10679 else
10680 roundup = NULL;
10682 /* Update ap.__[g|v]r_offs */
10683 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10684 build_int_cst (TREE_TYPE (off), rsize));
10685 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10687 /* String up. */
10688 if (roundup)
10689 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10691 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10692 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10693 build_int_cst (TREE_TYPE (f_off), 0));
10694 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10696 /* String up: make sure the assignment happens before the use. */
10697 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10698 COND_EXPR_ELSE (cond1) = t;
10700 /* Prepare the trees handling the argument that is passed on the stack;
10701 the top level node will store in ON_STACK. */
10702 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10703 if (align > 8)
10705 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10706 t = fold_convert (intDI_type_node, arg);
10707 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10708 build_int_cst (TREE_TYPE (t), 15));
10709 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10710 build_int_cst (TREE_TYPE (t), -16));
10711 t = fold_convert (TREE_TYPE (arg), t);
10712 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10714 else
10715 roundup = NULL;
10716 /* Advance ap.__stack */
10717 t = fold_convert (intDI_type_node, arg);
10718 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10719 build_int_cst (TREE_TYPE (t), size + 7));
10720 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10721 build_int_cst (TREE_TYPE (t), -8));
10722 t = fold_convert (TREE_TYPE (arg), t);
10723 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10724 /* String up roundup and advance. */
10725 if (roundup)
10726 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10727 /* String up with arg */
10728 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10729 /* Big-endianness related address adjustment. */
10730 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10731 && size < UNITS_PER_WORD)
10733 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10734 size_int (UNITS_PER_WORD - size));
10735 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10738 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10739 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10741 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10742 t = off;
10743 if (adjust)
10744 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10745 build_int_cst (TREE_TYPE (off), adjust));
10747 t = fold_convert (sizetype, t);
10748 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10750 if (is_ha)
10752 /* type ha; // treat as "struct {ftype field[n];}"
10753 ... [computing offs]
10754 for (i = 0; i <nregs; ++i, offs += 16)
10755 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10756 return ha; */
10757 int i;
10758 tree tmp_ha, field_t, field_ptr_t;
10760 /* Declare a local variable. */
10761 tmp_ha = create_tmp_var_raw (type, "ha");
10762 gimple_add_tmp_var (tmp_ha);
10764 /* Establish the base type. */
10765 switch (ag_mode)
10767 case E_SFmode:
10768 field_t = float_type_node;
10769 field_ptr_t = float_ptr_type_node;
10770 break;
10771 case E_DFmode:
10772 field_t = double_type_node;
10773 field_ptr_t = double_ptr_type_node;
10774 break;
10775 case E_TFmode:
10776 field_t = long_double_type_node;
10777 field_ptr_t = long_double_ptr_type_node;
10778 break;
10779 case E_HFmode:
10780 field_t = aarch64_fp16_type_node;
10781 field_ptr_t = aarch64_fp16_ptr_type_node;
10782 break;
10783 case E_V2SImode:
10784 case E_V4SImode:
10786 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10787 field_t = build_vector_type_for_mode (innertype, ag_mode);
10788 field_ptr_t = build_pointer_type (field_t);
10790 break;
10791 default:
10792 gcc_assert (0);
10795 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10796 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10797 addr = t;
10798 t = fold_convert (field_ptr_t, addr);
10799 t = build2 (MODIFY_EXPR, field_t,
10800 build1 (INDIRECT_REF, field_t, tmp_ha),
10801 build1 (INDIRECT_REF, field_t, t));
10803 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10804 for (i = 1; i < nregs; ++i)
10806 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10807 u = fold_convert (field_ptr_t, addr);
10808 u = build2 (MODIFY_EXPR, field_t,
10809 build2 (MEM_REF, field_t, tmp_ha,
10810 build_int_cst (field_ptr_t,
10811 (i *
10812 int_size_in_bytes (field_t)))),
10813 build1 (INDIRECT_REF, field_t, u));
10814 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10817 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10818 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10821 COND_EXPR_ELSE (cond2) = t;
10822 addr = fold_convert (build_pointer_type (type), cond1);
10823 addr = build_va_arg_indirect_ref (addr);
10825 if (indirect_p)
10826 addr = build_va_arg_indirect_ref (addr);
10828 return addr;
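/* Hedged C-level sketch (added commentary, not part of the original
   source) of what the function above gimplifies for an 8-byte integer
   argument, using the struct __va_list layout documented above
   aarch64_build_builtin_va_list; the HFA path, alignment rounding and
   big-endian adjustments are omitted:

     void *va_arg_gr8 (struct __va_list *ap)
     {
       int offs = ap->__gr_offs;
       if (offs >= 0)
         goto on_stack;              // register save area already used up
       ap->__gr_offs = offs + 8;     // rsize of the argument
       if (ap->__gr_offs > 0)
         goto on_stack;              // this argument did not fit
       return (char *) ap->__gr_top + offs;
     on_stack:
       {
         void *addr = ap->__stack;
         ap->__stack = (char *) addr + 8;
         return addr;
       }
     }

   The real code builds the equivalent trees (cond1/cond2 above) rather
   than emitting a call to a helper like this.  */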
10831 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10833 static void
10834 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10835 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10836 int no_rtl)
10838 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10839 CUMULATIVE_ARGS local_cum;
10840 int gr_saved = cfun->va_list_gpr_size;
10841 int vr_saved = cfun->va_list_fpr_size;
10843 /* The caller has advanced CUM up to, but not beyond, the last named
10844 argument. Advance a local copy of CUM past the last "real" named
10845 argument, to find out how many registers are left over. */
10846 local_cum = *cum;
10847 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10849 /* Find out how many registers we need to save.
10850 Honor the tree-stdarg analysis results. */
10851 if (cfun->va_list_gpr_size)
10852 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10853 cfun->va_list_gpr_size / UNITS_PER_WORD);
10854 if (cfun->va_list_fpr_size)
10855 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10856 cfun->va_list_fpr_size / UNITS_PER_VREG);
10858 if (!TARGET_FLOAT)
10860 gcc_assert (local_cum.aapcs_nvrn == 0);
10861 vr_saved = 0;
10864 if (!no_rtl)
10866 if (gr_saved > 0)
10868 rtx ptr, mem;
10870 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10871 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10872 - gr_saved * UNITS_PER_WORD);
10873 mem = gen_frame_mem (BLKmode, ptr);
10874 set_mem_alias_set (mem, get_varargs_alias_set ());
10876 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10877 mem, gr_saved);
10879 if (vr_saved > 0)
10881 /* We can't use move_block_from_reg, because it will use
10882 the wrong mode, storing D regs only. */
10883 machine_mode mode = TImode;
10884 int off, i, vr_start;
10886 /* Set OFF to the offset from virtual_incoming_args_rtx of
10887 the first vector register. The VR save area lies below
10888 the GR one, and is aligned to 16 bytes. */
10889 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10890 STACK_BOUNDARY / BITS_PER_UNIT);
10891 off -= vr_saved * UNITS_PER_VREG;
10893 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10894 for (i = 0; i < vr_saved; ++i)
10896 rtx ptr, mem;
10898 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10899 mem = gen_frame_mem (mode, ptr);
10900 set_mem_alias_set (mem, get_varargs_alias_set ());
10901 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10902 off += UNITS_PER_VREG;
10907 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10908 any complication of having crtl->args.pretend_args_size changed. */
10909 cfun->machine->frame.saved_varargs_size
10910 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10911 STACK_BOUNDARY / BITS_PER_UNIT)
10912 + vr_saved * UNITS_PER_VREG);
10915 static void
10916 aarch64_conditional_register_usage (void)
10918 int i;
10919 if (!TARGET_FLOAT)
10921 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10923 fixed_regs[i] = 1;
10924 call_used_regs[i] = 1;
10929 /* Walk down the type tree of TYPE counting consecutive base elements.
10930 If *MODEP is VOIDmode, then set it to the first valid floating point
10931 type. If a non-floating point type is found, or if a floating point
10932 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10933 otherwise return the count in the sub-tree. */
10934 static int
10935 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10937 machine_mode mode;
10938 HOST_WIDE_INT size;
10940 switch (TREE_CODE (type))
10942 case REAL_TYPE:
10943 mode = TYPE_MODE (type);
10944 if (mode != DFmode && mode != SFmode
10945 && mode != TFmode && mode != HFmode)
10946 return -1;
10948 if (*modep == VOIDmode)
10949 *modep = mode;
10951 if (*modep == mode)
10952 return 1;
10954 break;
10956 case COMPLEX_TYPE:
10957 mode = TYPE_MODE (TREE_TYPE (type));
10958 if (mode != DFmode && mode != SFmode
10959 && mode != TFmode && mode != HFmode)
10960 return -1;
10962 if (*modep == VOIDmode)
10963 *modep = mode;
10965 if (*modep == mode)
10966 return 2;
10968 break;
10970 case VECTOR_TYPE:
10971 /* Use V2SImode and V4SImode as representatives of all 64-bit
10972 and 128-bit vector types. */
10973 size = int_size_in_bytes (type);
10974 switch (size)
10976 case 8:
10977 mode = V2SImode;
10978 break;
10979 case 16:
10980 mode = V4SImode;
10981 break;
10982 default:
10983 return -1;
10986 if (*modep == VOIDmode)
10987 *modep = mode;
10989 /* Vector modes are considered to be opaque: two vectors are
10990 equivalent for the purposes of being homogeneous aggregates
10991 if they are the same size. */
10992 if (*modep == mode)
10993 return 1;
10995 break;
10997 case ARRAY_TYPE:
10999 int count;
11000 tree index = TYPE_DOMAIN (type);
11002 /* Can't handle incomplete types nor sizes that are not
11003 fixed. */
11004 if (!COMPLETE_TYPE_P (type)
11005 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11006 return -1;
11008 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11009 if (count == -1
11010 || !index
11011 || !TYPE_MAX_VALUE (index)
11012 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11013 || !TYPE_MIN_VALUE (index)
11014 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11015 || count < 0)
11016 return -1;
11018 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11019 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11021 /* There must be no padding. */
11022 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11023 return -1;
11025 return count;
11028 case RECORD_TYPE:
11030 int count = 0;
11031 int sub_count;
11032 tree field;
11034 /* Can't handle incomplete types nor sizes that are not
11035 fixed. */
11036 if (!COMPLETE_TYPE_P (type)
11037 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11038 return -1;
11040 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11042 if (TREE_CODE (field) != FIELD_DECL)
11043 continue;
11045 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11046 if (sub_count < 0)
11047 return -1;
11048 count += sub_count;
11051 /* There must be no padding. */
11052 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11053 return -1;
11055 return count;
11058 case UNION_TYPE:
11059 case QUAL_UNION_TYPE:
11061 /* These aren't very interesting except in a degenerate case. */
11062 int count = 0;
11063 int sub_count;
11064 tree field;
11066 /* Can't handle incomplete types nor sizes that are not
11067 fixed. */
11068 if (!COMPLETE_TYPE_P (type)
11069 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11070 return -1;
11072 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11074 if (TREE_CODE (field) != FIELD_DECL)
11075 continue;
11077 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11078 if (sub_count < 0)
11079 return -1;
11080 count = count > sub_count ? count : sub_count;
11083 /* There must be no padding. */
11084 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11085 return -1;
11087 return count;
11090 default:
11091 break;
11094 return -1;
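/* Worked examples for the walker above (added commentary, not part of
   the original source):

     struct hfa3  { float a, b, c; };         // returns 3, *modep = SFmode
     struct hfa4  { double d[4]; };           // returns 4, *modep = DFmode
     _Complex double cd;                      // returns 2, *modep = DFmode
     struct hva2  { int32x4_t v[2]; };        // returns 2, *modep = V4SImode
     struct mixed { float f; double d; };     // returns -1 (modes differ)

   A result between 1 and HA_MAX_NUM_FLDS (4) makes the type a candidate
   for passing or returning in consecutive FP/SIMD registers.  */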
11097 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11098 type as described in AAPCS64 \S 4.1.2.
11100 See the comment above aarch64_composite_type_p for the notes on MODE. */
11102 static bool
11103 aarch64_short_vector_p (const_tree type,
11104 machine_mode mode)
11106 HOST_WIDE_INT size = -1;
11108 if (type && TREE_CODE (type) == VECTOR_TYPE)
11109 size = int_size_in_bytes (type);
11110 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11111 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11112 size = GET_MODE_SIZE (mode);
11114 return (size == 8 || size == 16);
11117 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11118 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11119 array types. The C99 floating-point complex types are also considered
11120 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11121 types, which are GCC extensions and out of the scope of AAPCS64, are
11122 treated as composite types here as well.
11124 Note that MODE itself is not sufficient in determining whether a type
11125 is such a composite type or not. This is because
11126 stor-layout.c:compute_record_mode may have already changed the MODE
11127 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11128 structure with only one field may have its MODE set to the mode of the
11129 field. Also an integer mode whose size matches the size of the
11130 RECORD_TYPE type may be used to substitute the original mode
11131 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11132 solely relied on. */
11134 static bool
11135 aarch64_composite_type_p (const_tree type,
11136 machine_mode mode)
11138 if (aarch64_short_vector_p (type, mode))
11139 return false;
11141 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11142 return true;
11144 if (mode == BLKmode
11145 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11146 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11147 return true;
11149 return false;
11152 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11153 shall be passed or returned in simd/fp register(s) (providing these
11154 parameter passing registers are available).
11156 Upon successful return, *COUNT returns the number of needed registers,
11157 *BASE_MODE returns the mode of the individual register and when IS_HA
11158 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11159 floating-point aggregate or a homogeneous short-vector aggregate. */
11161 static bool
11162 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11163 const_tree type,
11164 machine_mode *base_mode,
11165 int *count,
11166 bool *is_ha)
11168 machine_mode new_mode = VOIDmode;
11169 bool composite_p = aarch64_composite_type_p (type, mode);
11171 if (is_ha != NULL) *is_ha = false;
11173 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11174 || aarch64_short_vector_p (type, mode))
11176 *count = 1;
11177 new_mode = mode;
11179 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11181 if (is_ha != NULL) *is_ha = true;
11182 *count = 2;
11183 new_mode = GET_MODE_INNER (mode);
11185 else if (type && composite_p)
11187 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11189 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11191 if (is_ha != NULL) *is_ha = true;
11192 *count = ag_count;
11194 else
11195 return false;
11197 else
11198 return false;
11200 *base_mode = new_mode;
11201 return true;
11204 /* Implement TARGET_STRUCT_VALUE_RTX. */
11206 static rtx
11207 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11208 int incoming ATTRIBUTE_UNUSED)
11210 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11213 /* Implements target hook vector_mode_supported_p. */
11214 static bool
11215 aarch64_vector_mode_supported_p (machine_mode mode)
11217 if (TARGET_SIMD
11218 && (mode == V4SImode || mode == V8HImode
11219 || mode == V16QImode || mode == V2DImode
11220 || mode == V2SImode || mode == V4HImode
11221 || mode == V8QImode || mode == V2SFmode
11222 || mode == V4SFmode || mode == V2DFmode
11223 || mode == V4HFmode || mode == V8HFmode
11224 || mode == V1DFmode))
11225 return true;
11227 return false;
11230 /* Return appropriate SIMD container
11231 for MODE within a vector of WIDTH bits. */
11232 static machine_mode
11233 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11235 gcc_assert (width == 64 || width == 128);
11236 if (TARGET_SIMD)
11238 if (width == 128)
11239 switch (mode)
11241 case E_DFmode:
11242 return V2DFmode;
11243 case E_SFmode:
11244 return V4SFmode;
11245 case E_HFmode:
11246 return V8HFmode;
11247 case E_SImode:
11248 return V4SImode;
11249 case E_HImode:
11250 return V8HImode;
11251 case E_QImode:
11252 return V16QImode;
11253 case E_DImode:
11254 return V2DImode;
11255 default:
11256 break;
11258 else
11259 switch (mode)
11261 case E_SFmode:
11262 return V2SFmode;
11263 case E_HFmode:
11264 return V4HFmode;
11265 case E_SImode:
11266 return V2SImode;
11267 case E_HImode:
11268 return V4HImode;
11269 case E_QImode:
11270 return V8QImode;
11271 default:
11272 break;
11275 return word_mode;
11278 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11279 static machine_mode
11280 aarch64_preferred_simd_mode (scalar_mode mode)
11282 return aarch64_simd_container_mode (mode, 128);
11285 /* Return the bitmask of possible vector sizes for the vectorizer
11286 to iterate over. */
11287 static unsigned int
11288 aarch64_autovectorize_vector_sizes (void)
11290 return (16 | 8);
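/* Illustrative mapping (added commentary, not part of the original
   source): with TARGET_SIMD enabled the helpers above give the
   vectorizer, for example:

     scalar mode   128-bit container   64-bit container
     QImode        V16QImode           V8QImode
     HImode        V8HImode            V4HImode
     SImode        V4SImode            V2SImode
     DImode        V2DImode            (word_mode)
     SFmode        V4SFmode            V2SFmode
     DFmode        V2DFmode            (word_mode)

   The preferred mode is always the 128-bit container, and the size
   bitmask (16 | 8) tells the vectorizer to also try 64-bit vectors.  */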
11293 /* Implement TARGET_MANGLE_TYPE. */
11295 static const char *
11296 aarch64_mangle_type (const_tree type)
11298 /* The AArch64 ABI documents say that "__va_list" has to be
11299 mangled as if it is in the "std" namespace. */
11300 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11301 return "St9__va_list";
11303 /* Half-precision float. */
11304 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11305 return "Dh";
11307 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11308 builtin types. */
11309 if (TYPE_NAME (type) != NULL)
11310 return aarch64_mangle_builtin_type (type);
11312 /* Use the default mangling. */
11313 return NULL;
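/* Mangling examples (added commentary, not part of the original source):

     void f (__fp16);             the __fp16 parameter mangles as "Dh",
                                  giving _Z1fDh
     void g (__builtin_va_list);  the parameter mangles as "St9__va_list",
                                  i.e. as if __va_list lived in namespace std

   Anything else falls back to the builtin-type table or the default
   mangler.  */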
11316 /* Find the first rtx_insn before insn that will generate an assembly
11317 instruction. */
11319 static rtx_insn *
11320 aarch64_prev_real_insn (rtx_insn *insn)
11322 if (!insn)
11323 return NULL;
11325 do
11327 insn = prev_real_insn (insn);
11329 while (insn && recog_memoized (insn) < 0);
11331 return insn;
11334 static bool
11335 is_madd_op (enum attr_type t1)
11337 unsigned int i;
11338 /* A number of these may be AArch32 only. */
11339 enum attr_type mlatypes[] = {
11340 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11341 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11342 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11345 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11347 if (t1 == mlatypes[i])
11348 return true;
11351 return false;
11354 /* Check if there is a register dependency between a load and the insn
11355 for which we hold recog_data. */
11357 static bool
11358 dep_between_memop_and_curr (rtx memop)
11360 rtx load_reg;
11361 int opno;
11363 gcc_assert (GET_CODE (memop) == SET);
11365 if (!REG_P (SET_DEST (memop)))
11366 return false;
11368 load_reg = SET_DEST (memop);
11369 for (opno = 1; opno < recog_data.n_operands; opno++)
11371 rtx operand = recog_data.operand[opno];
11372 if (REG_P (operand)
11373 && reg_overlap_mentioned_p (load_reg, operand))
11374 return true;
11377 return false;
11381 /* When working around the Cortex-A53 erratum 835769,
11382 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11383 instruction and has a preceding memory instruction such that a NOP
11384 should be inserted between them. */
11386 bool
11387 aarch64_madd_needs_nop (rtx_insn* insn)
11389 enum attr_type attr_type;
11390 rtx_insn *prev;
11391 rtx body;
11393 if (!TARGET_FIX_ERR_A53_835769)
11394 return false;
11396 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11397 return false;
11399 attr_type = get_attr_type (insn);
11400 if (!is_madd_op (attr_type))
11401 return false;
11403 prev = aarch64_prev_real_insn (insn);
11404 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11405 Restore recog state to INSN to avoid state corruption. */
11406 extract_constrain_insn_cached (insn);
11408 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11409 return false;
11411 body = single_set (prev);
11413 /* If the previous insn is a memory op and there is no dependency between
11414 it and the DImode madd, emit a NOP between them. If body is NULL then we
11415 have a complex memory operation, probably a load/store pair.
11416 Be conservative for now and emit a NOP. */
11417 if (GET_MODE (recog_data.operand[0]) == DImode
11418 && (!body || !dep_between_memop_and_curr (body)))
11419 return true;
11421 return false;
11426 /* Implement FINAL_PRESCAN_INSN. */
11428 void
11429 aarch64_final_prescan_insn (rtx_insn *insn)
11431 if (aarch64_madd_needs_nop (insn))
11432 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
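/* Hedged example (added commentary, not part of the original source):
   with -mfix-cortex-a53-835769 the two functions above turn a sequence
   such as

     ldr     x1, [x2]
     madd    x0, x3, x4, x0

   into

     ldr     x1, [x2]
     nop     // between mem op and mult-accumulate
     madd    x0, x3, x4, x0

   The nop is emitted only for a 64-bit multiply-accumulate that does not
   consume the value produced by the preceding memory operation; dependent
   sequences and 32-bit multiply-accumulates are left alone, while complex
   memory ops such as load pairs conservatively get the nop.  */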
11436 /* Return the equivalent letter for size. */
11437 static char
11438 sizetochar (int size)
11440 switch (size)
11442 case 64: return 'd';
11443 case 32: return 's';
11444 case 16: return 'h';
11445 case 8 : return 'b';
11446 default: gcc_unreachable ();
11450 /* Return true iff x is a uniform vector of floating-point
11451 constants, and the constant can be represented in
11452 quarter-precision form. Note, as aarch64_float_const_representable
11453 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11454 static bool
11455 aarch64_vect_float_const_representable_p (rtx x)
11457 rtx elt;
11458 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11459 && const_vec_duplicate_p (x, &elt)
11460 && aarch64_float_const_representable_p (elt));
11463 /* Return true for valid and false for invalid. */
11464 bool
11465 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11466 struct simd_immediate_info *info)
11468 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11469 matches = 1; \
11470 for (i = 0; i < idx; i += (STRIDE)) \
11471 if (!(TEST)) \
11472 matches = 0; \
11473 if (matches) \
11475 immtype = (CLASS); \
11476 elsize = (ELSIZE); \
11477 eshift = (SHIFT); \
11478 emvn = (NEG); \
11479 break; \
11482 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11483 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11484 unsigned char bytes[16];
11485 int immtype = -1, matches;
11486 unsigned int invmask = inverse ? 0xff : 0;
11487 int eshift, emvn;
11489 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11491 if (! (aarch64_simd_imm_zero_p (op, mode)
11492 || aarch64_vect_float_const_representable_p (op)))
11493 return false;
11495 if (info)
11497 rtx elt = CONST_VECTOR_ELT (op, 0);
11498 scalar_float_mode elt_mode
11499 = as_a <scalar_float_mode> (GET_MODE (elt));
11501 info->value = elt;
11502 info->element_width = GET_MODE_BITSIZE (elt_mode);
11503 info->mvn = false;
11504 info->shift = 0;
11507 return true;
11510 /* Splat vector constant out into a byte vector. */
11511 for (i = 0; i < n_elts; i++)
11513 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11514 it must be laid out in the vector register in reverse order. */
11515 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11516 unsigned HOST_WIDE_INT elpart;
11518 gcc_assert (CONST_INT_P (el));
11519 elpart = INTVAL (el);
11521 for (unsigned int byte = 0; byte < innersize; byte++)
11523 bytes[idx++] = (elpart & 0xff) ^ invmask;
11524 elpart >>= BITS_PER_UNIT;
11529 /* Sanity check. */
11530 gcc_assert (idx == GET_MODE_SIZE (mode));
11532 do
11534 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11535 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11537 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11538 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11540 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11541 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11543 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11544 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11546 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11548 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11550 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11551 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11553 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11554 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11556 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11557 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11559 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11560 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11562 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11564 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11566 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11567 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11569 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11570 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11572 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11573 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11575 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11576 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11578 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11580 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11581 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11583 while (0);
11585 if (immtype == -1)
11586 return false;
11588 if (info)
11590 info->element_width = elsize;
11591 info->mvn = emvn != 0;
11592 info->shift = eshift;
11594 unsigned HOST_WIDE_INT imm = 0;
11596 if (immtype >= 12 && immtype <= 15)
11597 info->msl = true;
11599 /* Un-invert bytes of recognized vector, if necessary. */
11600 if (invmask != 0)
11601 for (i = 0; i < idx; i++)
11602 bytes[i] ^= invmask;
11604 if (immtype == 17)
11606 /* FIXME: Broken on 32-bit H_W_I hosts. */
11607 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11609 for (i = 0; i < 8; i++)
11610 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11611 << (i * BITS_PER_UNIT);
11614 info->value = GEN_INT (imm);
11616 else
11618 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11619 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11621 /* Construct 'abcdefgh' because the assembler cannot handle
11622 generic constants. */
11623 if (info->mvn)
11624 imm = ~imm;
11625 imm = (imm >> info->shift) & 0xff;
11626 info->value = GEN_INT (imm);
11630 return true;
11631 #undef CHECK
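/* Worked examples for the classifier above (added commentary, not part
   of the original source):

     {0xab, 0xab, 0xab, 0xab}        as V4SI -> value 0xab, width 32,
                                                shift 0,  mvn false  (MOVI)
     {0xab0000, 0xab0000, ...}       as V4SI -> value 0xab, width 32,
                                                shift 16, mvn false  (MOVI, LSL 16)
     {~0xab, ~0xab, ...}             as V4SI -> value 0xab, width 32,
                                                shift 0,  mvn true   (MVNI)

   The assembler mnemonics are only indicative; the actual output is
   generated elsewhere from the simd_immediate_info fields.  */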
11634 /* Check if immediate shift constants are within range. */
11635 bool
11636 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11638 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11639 if (left)
11640 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11641 else
11642 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11645 /* Return true if X is a uniform vector where all elements
11646 are either the floating-point constant 0.0 or the
11647 integer constant 0. */
11648 bool
11649 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11651 return x == CONST0_RTX (mode);
11655 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11656 operation of width WIDTH at bit position POS. */
11658 rtx
11659 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11661 gcc_assert (CONST_INT_P (width));
11662 gcc_assert (CONST_INT_P (pos));
11664 unsigned HOST_WIDE_INT mask
11665 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11666 return GEN_INT (mask << UINTVAL (pos));
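/* Worked example (added commentary, not part of the original source):
   for a zero_extract of width 8 at bit position 16 the helper above
   returns

     ((HOST_WIDE_INT_1U << 8) - 1) << 16  ==  0x00ff0000

   which is the mask selecting exactly those bits.  */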
11669 bool
11670 aarch64_mov_operand_p (rtx x, machine_mode mode)
11672 if (GET_CODE (x) == HIGH
11673 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11674 return true;
11676 if (CONST_INT_P (x))
11677 return true;
11679 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11680 return true;
11682 return aarch64_classify_symbolic_expression (x)
11683 == SYMBOL_TINY_ABSOLUTE;
11686 /* Return a const_int vector of VAL. */
11687 rtx
11688 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11690 int nunits = GET_MODE_NUNITS (mode);
11691 rtvec v = rtvec_alloc (nunits);
11692 int i;
11694 rtx cache = GEN_INT (val);
11696 for (i=0; i < nunits; i++)
11697 RTVEC_ELT (v, i) = cache;
11699 return gen_rtx_CONST_VECTOR (mode, v);
11702 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11704 bool
11705 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11707 machine_mode vmode;
11709 gcc_assert (!VECTOR_MODE_P (mode));
11710 vmode = aarch64_preferred_simd_mode (as_a <scalar_mode> (mode));
11711 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11712 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11715 /* Construct and return a PARALLEL RTX vector with elements numbering the
11716 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11717 the vector - from the perspective of the architecture. This does not
11718 line up with GCC's perspective on lane numbers, so we end up with
11719 different masks depending on our target endian-ness. The diagram
11720 below may help. We must draw the distinction when building masks
11721 which select one half of the vector. An instruction selecting
11722 architectural low-lanes for a big-endian target, must be described using
11723 a mask selecting GCC high-lanes.
11725 Big-Endian Little-Endian
11727 GCC 0 1 2 3 3 2 1 0
11728 | x | x | x | x | | x | x | x | x |
11729 Architecture 3 2 1 0 3 2 1 0
11731 Low Mask: { 2, 3 } { 0, 1 }
11732 High Mask: { 0, 1 } { 2, 3 }
11733 */
11735 rtx
11736 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11738 int nunits = GET_MODE_NUNITS (mode);
11739 rtvec v = rtvec_alloc (nunits / 2);
11740 int high_base = nunits / 2;
11741 int low_base = 0;
11742 int base;
11743 rtx t1;
11744 int i;
11746 if (BYTES_BIG_ENDIAN)
11747 base = high ? low_base : high_base;
11748 else
11749 base = high ? high_base : low_base;
11751 for (i = 0; i < nunits / 2; i++)
11752 RTVEC_ELT (v, i) = GEN_INT (base + i);
11754 t1 = gen_rtx_PARALLEL (mode, v);
11755 return t1;
11758 /* Check OP for validity as a PARALLEL RTX vector with elements
11759 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11760 from the perspective of the architecture. See the diagram above
11761 aarch64_simd_vect_par_cnst_half for more details. */
11763 bool
11764 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11765 bool high)
11767 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11768 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11769 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11770 int i = 0;
11772 if (!VECTOR_MODE_P (mode))
11773 return false;
11775 if (count_op != count_ideal)
11776 return false;
11778 for (i = 0; i < count_ideal; i++)
11780 rtx elt_op = XVECEXP (op, 0, i);
11781 rtx elt_ideal = XVECEXP (ideal, 0, i);
11783 if (!CONST_INT_P (elt_op)
11784 || INTVAL (elt_ideal) != INTVAL (elt_op))
11785 return false;
11787 return true;
11790 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11791 HIGH (exclusive). */
11792 void
11793 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11794 const_tree exp)
11796 HOST_WIDE_INT lane;
11797 gcc_assert (CONST_INT_P (operand));
11798 lane = INTVAL (operand);
11800 if (lane < low || lane >= high)
11802 if (exp)
11803 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11804 else
11805 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11809 /* Return TRUE if OP is a valid vector addressing mode. */
11810 bool
11811 aarch64_simd_mem_operand_p (rtx op)
11813 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11814 || REG_P (XEXP (op, 0)));
11817 /* Emit a register copy from operand to operand, taking care not to
11818 early-clobber source registers in the process.
11820 COUNT is the number of components into which the copy needs to be
11821 decomposed. */
11822 void
11823 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11824 unsigned int count)
11826 unsigned int i;
11827 int rdest = REGNO (operands[0]);
11828 int rsrc = REGNO (operands[1]);
11830 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11831 || rdest < rsrc)
11832 for (i = 0; i < count; i++)
11833 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11834 gen_rtx_REG (mode, rsrc + i));
11835 else
11836 for (i = 0; i < count; i++)
11837 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11838 gen_rtx_REG (mode, rsrc + count - i - 1));
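/* Illustrative example (added commentary, not part of the original
   source): moving an OImode value held in {V1, V2} into {V2, V3}
   overlaps and the destination REGNO is the higher one, so the loop
   above copies backwards: first V3 <- V2, then V2 <- V1, ensuring V2 is
   read before it is overwritten.  Non-overlapping or downward-moving
   copies go front to back instead.  */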
11841 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11842 one of the VSTRUCT modes: OI, CI, or XI. */
11844 aarch64_simd_attr_length_rglist (machine_mode mode)
11846 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11849 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11850 alignment of a vector to 128 bits. */
11851 static HOST_WIDE_INT
11852 aarch64_simd_vector_alignment (const_tree type)
11854 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11855 return MIN (align, 128);
11858 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11859 static bool
11860 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11862 if (is_packed)
11863 return false;
11865 /* We guarantee alignment for vectors up to 128 bits. */
11866 if (tree_int_cst_compare (TYPE_SIZE (type),
11867 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11868 return false;
11870 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11871 return true;
11874 /* Return true if the vector misalignment factor is supported by the
11875 target. */
11876 static bool
11877 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11878 const_tree type, int misalignment,
11879 bool is_packed)
11881 if (TARGET_SIMD && STRICT_ALIGNMENT)
11883 /* Return false if the movmisalign pattern is not supported for this mode. */
11884 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11885 return false;
11887 if (misalignment == -1)
11889 /* Misalignment factor is unknown at compile time but we know
11890 it's word aligned. */
11891 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11893 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11895 if (element_size != 64)
11896 return true;
11898 return false;
11901 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11902 is_packed);
11905 /* If VALS is a vector constant that can be loaded into a register
11906 using DUP, generate instructions to do so and return an RTX to
11907 assign to the register. Otherwise return NULL_RTX. */
11908 static rtx
11909 aarch64_simd_dup_constant (rtx vals)
11911 machine_mode mode = GET_MODE (vals);
11912 machine_mode inner_mode = GET_MODE_INNER (mode);
11913 rtx x;
11915 if (!const_vec_duplicate_p (vals, &x))
11916 return NULL_RTX;
11918 /* We can load this constant by using DUP and a constant in a
11919 single ARM register. This will be cheaper than a vector
11920 load. */
11921 x = copy_to_mode_reg (inner_mode, x);
11922 return gen_rtx_VEC_DUPLICATE (mode, x);
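/* Illustrative sketch (hypothetical constant): a splat such as

     int32x4_t v = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 };

   is typically not encodable as a single MOVI/MVNI immediate, but
   const_vec_duplicate_p recognises it, so the code above moves 0x12345678
   into a general register and returns (vec_duplicate:V4SI (reg:SI ...)),
   i.e. one DUP from the GP register instead of a literal-pool load.  */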
11926 /* Generate code to load VALS, which is a PARALLEL containing only
11927 constants (for vec_init) or CONST_VECTOR, efficiently into a
11928 register. Returns an RTX to copy into the register, or NULL_RTX
11929 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11930 static rtx
11931 aarch64_simd_make_constant (rtx vals)
11933 machine_mode mode = GET_MODE (vals);
11934 rtx const_dup;
11935 rtx const_vec = NULL_RTX;
11936 int n_elts = GET_MODE_NUNITS (mode);
11937 int n_const = 0;
11938 int i;
11940 if (GET_CODE (vals) == CONST_VECTOR)
11941 const_vec = vals;
11942 else if (GET_CODE (vals) == PARALLEL)
11944 /* A CONST_VECTOR must contain only CONST_INTs and
11945 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11946 Only store valid constants in a CONST_VECTOR. */
11947 for (i = 0; i < n_elts; ++i)
11949 rtx x = XVECEXP (vals, 0, i);
11950 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11951 n_const++;
11953 if (n_const == n_elts)
11954 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11956 else
11957 gcc_unreachable ();
11959 if (const_vec != NULL_RTX
11960 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11961 /* Load using MOVI/MVNI. */
11962 return const_vec;
11963 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11964 /* Loaded using DUP. */
11965 return const_dup;
11966 else if (const_vec != NULL_RTX)
11967 /* Load from constant pool. We cannot take advantage of single-cycle
11968 LD1 because we need a PC-relative addressing mode. */
11969 return const_vec;
11970 else
11971 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11972 We cannot construct an initializer. */
11973 return NULL_RTX;
11976 /* Expand a vector initialisation sequence, such that TARGET is
11977 initialised to contain VALS. */
11979 void
11980 aarch64_expand_vector_init (rtx target, rtx vals)
11982 machine_mode mode = GET_MODE (target);
11983 machine_mode inner_mode = GET_MODE_INNER (mode);
11984 /* The number of vector elements. */
11985 int n_elts = GET_MODE_NUNITS (mode);
11986 /* The number of vector elements which are not constant. */
11987 int n_var = 0;
11988 rtx any_const = NULL_RTX;
11989 /* The first element of vals. */
11990 rtx v0 = XVECEXP (vals, 0, 0);
11991 bool all_same = true;
11993 /* Count the number of variable elements to initialise. */
11994 for (int i = 0; i < n_elts; ++i)
11996 rtx x = XVECEXP (vals, 0, i);
11997 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11998 ++n_var;
11999 else
12000 any_const = x;
12002 all_same &= rtx_equal_p (x, v0);
12005 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
12006 how best to handle this. */
12007 if (n_var == 0)
12009 rtx constant = aarch64_simd_make_constant (vals);
12010 if (constant != NULL_RTX)
12012 emit_move_insn (target, constant);
12013 return;
12017 /* Splat a single non-constant element if we can. */
12018 if (all_same)
12020 rtx x = copy_to_mode_reg (inner_mode, v0);
12021 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12022 return;
12025 enum insn_code icode = optab_handler (vec_set_optab, mode);
12026 gcc_assert (icode != CODE_FOR_nothing);
12028 /* If there are only variable elements, try to optimize
12029 the insertion using dup for the most common element
12030 followed by insertions. */
12032 /* The algorithm will fill matches[*][0] with the earliest matching element,
12033 and matches[X][1] with the count of duplicate elements (if X is the
12034 earliest element which has duplicates). */
12036 if (n_var == n_elts && n_elts <= 16)
12038 int matches[16][2] = {0};
12039 for (int i = 0; i < n_elts; i++)
12041 for (int j = 0; j <= i; j++)
12043 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12045 matches[i][0] = j;
12046 matches[j][1]++;
12047 break;
12051 int maxelement = 0;
12052 int maxv = 0;
12053 for (int i = 0; i < n_elts; i++)
12054 if (matches[i][1] > maxv)
12056 maxelement = i;
12057 maxv = matches[i][1];
12060 /* Create a duplicate of the most common element. */
12061 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12062 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12064 /* Insert the rest. */
12065 for (int i = 0; i < n_elts; i++)
12067 rtx x = XVECEXP (vals, 0, i);
12068 if (matches[i][0] == maxelement)
12069 continue;
12070 x = copy_to_mode_reg (inner_mode, x);
12071 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12073 return;
12076 /* Initialise a vector which is part-variable. We want to first try
12077 to build those lanes which are constant in the most efficient way we
12078 can. */
12079 if (n_var != n_elts)
12081 rtx copy = copy_rtx (vals);
12083 /* Load constant part of vector. We really don't care what goes into the
12084 parts we will overwrite, but we're more likely to be able to load the
12085 constant efficiently if it has fewer, larger, repeating parts
12086 (see aarch64_simd_valid_immediate). */
12087 for (int i = 0; i < n_elts; i++)
12089 rtx x = XVECEXP (vals, 0, i);
12090 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12091 continue;
12092 rtx subst = any_const;
12093 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12095 /* Look in the copied vector, as more elements are const. */
12096 rtx test = XVECEXP (copy, 0, i ^ bit);
12097 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12099 subst = test;
12100 break;
12103 XVECEXP (copy, 0, i) = subst;
12105 aarch64_expand_vector_init (target, copy);
12108 /* Insert the variable lanes directly. */
12109 for (int i = 0; i < n_elts; i++)
12111 rtx x = XVECEXP (vals, 0, i);
12112 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12113 continue;
12114 x = copy_to_mode_reg (inner_mode, x);
12115 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
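/* Illustrative sketch (w_a and w_b are placeholder registers): for an
   all-variable vector such as { a, b, a, a }, the matches[][] scan above
   finds that lane 0's value occurs three times, so the expansion is roughly

     dup  v0.4s, w_a        // splat the most common element
     ins  v0.s[1], w_b      // then patch the remaining lane(s)

   rather than an insertion into every single lane.  */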
12119 static unsigned HOST_WIDE_INT
12120 aarch64_shift_truncation_mask (machine_mode mode)
12122 return
12123 (!SHIFT_COUNT_TRUNCATED
12124 || aarch64_vector_mode_supported_p (mode)
12125 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
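/* Illustrative sketch: when SHIFT_COUNT_TRUNCATED holds, the hook above
   returns 63 for scalar DImode, which lets the middle end simplify e.g.

     x << (n & 63)    into    x << n

   because the scalar shift only looks at the low six bits of the count.
   For vector (and vector-struct) modes it returns 0, since AdvSIMD shifts
   are not assumed to truncate their counts that way.  */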
12128 /* Select a format to encode pointers in exception handling data. */
12130 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12132 int type;
12133 switch (aarch64_cmodel)
12135 case AARCH64_CMODEL_TINY:
12136 case AARCH64_CMODEL_TINY_PIC:
12137 case AARCH64_CMODEL_SMALL:
12138 case AARCH64_CMODEL_SMALL_PIC:
12139 case AARCH64_CMODEL_SMALL_SPIC:
12140 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
12141 for everything. */
12142 type = DW_EH_PE_sdata4;
12143 break;
12144 default:
12145 /* No assumptions here. 8-byte relocs required. */
12146 type = DW_EH_PE_sdata8;
12147 break;
12149 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12152 /* The last .arch and .tune assembly strings that we printed. */
12153 static std::string aarch64_last_printed_arch_string;
12154 static std::string aarch64_last_printed_tune_string;
12156 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12157 by the function fndecl. */
12159 void
12160 aarch64_declare_function_name (FILE *stream, const char* name,
12161 tree fndecl)
12163 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12165 struct cl_target_option *targ_options;
12166 if (target_parts)
12167 targ_options = TREE_TARGET_OPTION (target_parts);
12168 else
12169 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12170 gcc_assert (targ_options);
12172 const struct processor *this_arch
12173 = aarch64_get_arch (targ_options->x_explicit_arch);
12175 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12176 std::string extension
12177 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12178 this_arch->flags);
12179 /* Only update the assembler .arch string if it is distinct from the last
12180 such string we printed. */
12181 std::string to_print = this_arch->name + extension;
12182 if (to_print != aarch64_last_printed_arch_string)
12184 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12185 aarch64_last_printed_arch_string = to_print;
12188 /* Print the cpu name we're tuning for in the comments; it might be
12189 useful to readers of the generated asm. Do it only when it changes
12190 from function to function and verbose assembly is requested. */
12191 const struct processor *this_tune
12192 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12194 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12196 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12197 this_tune->name);
12198 aarch64_last_printed_tune_string = this_tune->name;
12201 /* Don't forget the type directive for ELF. */
12202 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12203 ASM_OUTPUT_LABEL (stream, name);
12206 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12208 static void
12209 aarch64_start_file (void)
12211 struct cl_target_option *default_options
12212 = TREE_TARGET_OPTION (target_option_default_node);
12214 const struct processor *default_arch
12215 = aarch64_get_arch (default_options->x_explicit_arch);
12216 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12217 std::string extension
12218 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12219 default_arch->flags);
12221 aarch64_last_printed_arch_string = default_arch->name + extension;
12222 aarch64_last_printed_tune_string = "";
12223 asm_fprintf (asm_out_file, "\t.arch %s\n",
12224 aarch64_last_printed_arch_string.c_str ());
12226 default_file_start ();
12229 /* Emit load exclusive. */
12231 static void
12232 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12233 rtx mem, rtx model_rtx)
12235 rtx (*gen) (rtx, rtx, rtx);
12237 switch (mode)
12239 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12240 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12241 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12242 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12243 default:
12244 gcc_unreachable ();
12247 emit_insn (gen (rval, mem, model_rtx));
12250 /* Emit store exclusive. */
12252 static void
12253 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12254 rtx rval, rtx mem, rtx model_rtx)
12256 rtx (*gen) (rtx, rtx, rtx, rtx);
12258 switch (mode)
12260 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12261 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12262 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12263 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12264 default:
12265 gcc_unreachable ();
12268 emit_insn (gen (bval, rval, mem, model_rtx));
12271 /* Emit INSN as a jump and mark it as unlikely to be taken. */
12273 static void
12274 aarch64_emit_unlikely_jump (rtx insn)
12276 rtx_insn *jump = emit_jump_insn (insn);
12277 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12280 /* Expand a compare and swap pattern. */
12282 void
12283 aarch64_expand_compare_and_swap (rtx operands[])
12285 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12286 machine_mode mode, cmp_mode;
12287 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12288 int idx;
12289 gen_cas_fn gen;
12290 const gen_cas_fn split_cas[] =
12292 gen_aarch64_compare_and_swapqi,
12293 gen_aarch64_compare_and_swaphi,
12294 gen_aarch64_compare_and_swapsi,
12295 gen_aarch64_compare_and_swapdi
12297 const gen_cas_fn atomic_cas[] =
12299 gen_aarch64_compare_and_swapqi_lse,
12300 gen_aarch64_compare_and_swaphi_lse,
12301 gen_aarch64_compare_and_swapsi_lse,
12302 gen_aarch64_compare_and_swapdi_lse
12305 bval = operands[0];
12306 rval = operands[1];
12307 mem = operands[2];
12308 oldval = operands[3];
12309 newval = operands[4];
12310 is_weak = operands[5];
12311 mod_s = operands[6];
12312 mod_f = operands[7];
12313 mode = GET_MODE (mem);
12314 cmp_mode = mode;
12316 /* Normally the succ memory model must be stronger than fail, but in the
12317 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12318 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12320 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12321 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12322 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12324 switch (mode)
12326 case E_QImode:
12327 case E_HImode:
12328 /* For short modes, we're going to perform the comparison in SImode,
12329 so do the zero-extension now. */
12330 cmp_mode = SImode;
12331 rval = gen_reg_rtx (SImode);
12332 oldval = convert_modes (SImode, mode, oldval, true);
12333 /* Fall through. */
12335 case E_SImode:
12336 case E_DImode:
12337 /* Force the value into a register if needed. */
12338 if (!aarch64_plus_operand (oldval, mode))
12339 oldval = force_reg (cmp_mode, oldval);
12340 break;
12342 default:
12343 gcc_unreachable ();
12346 switch (mode)
12348 case E_QImode: idx = 0; break;
12349 case E_HImode: idx = 1; break;
12350 case E_SImode: idx = 2; break;
12351 case E_DImode: idx = 3; break;
12352 default:
12353 gcc_unreachable ();
12355 if (TARGET_LSE)
12356 gen = atomic_cas[idx];
12357 else
12358 gen = split_cas[idx];
12360 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12362 if (mode == QImode || mode == HImode)
12363 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12365 x = gen_rtx_REG (CCmode, CC_REGNUM);
12366 x = gen_rtx_EQ (SImode, x, const0_rtx);
12367 emit_insn (gen_rtx_SET (bval, x));
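/* Illustrative sketch (x is a hypothetical int object): this expander is
   what a source-level strong compare-exchange such as

     int x = 0, expected = 0;
     __atomic_compare_exchange_n (&x, &expected, 1, 0,
                                  __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);

   reaches.  With TARGET_LSE (e.g. -march=armv8.1-a) it picks the single
   CAS-instruction pattern; otherwise it uses the LDXR/STXR loop that
   aarch64_split_compare_and_swap below splits out.  */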
12370 /* Test whether the target supports using an atomic load-operate instruction
12371 for operation CODE. Returns FALSE if the operation isn't supported by the
12372 architecture. */
12376 bool
12377 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12379 if (!TARGET_LSE)
12380 return false;
12382 switch (code)
12384 case SET:
12385 case AND:
12386 case IOR:
12387 case XOR:
12388 case MINUS:
12389 case PLUS:
12390 return true;
12391 default:
12392 return false;
12396 /* Emit a barrier appropriate for memory model MODEL at the end of a
12397 sequence implementing an atomic operation. */
12399 static void
12400 aarch64_emit_post_barrier (enum memmodel model)
12402 const enum memmodel base_model = memmodel_base (model);
12404 if (is_mm_sync (model)
12405 && (base_model == MEMMODEL_ACQUIRE
12406 || base_model == MEMMODEL_ACQ_REL
12407 || base_model == MEMMODEL_SEQ_CST))
12409 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12413 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12414 for the data in memory. EXPECTED is the value expected to be in memory.
12415 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12416 is the memory ordering to use. */
12418 void
12419 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12420 rtx expected, rtx desired,
12421 rtx model)
12423 rtx (*gen) (rtx, rtx, rtx, rtx);
12424 machine_mode mode;
12426 mode = GET_MODE (mem);
12428 switch (mode)
12430 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12431 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12432 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12433 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12434 default:
12435 gcc_unreachable ();
12438 /* Move the expected value into the CAS destination register. */
12439 emit_insn (gen_rtx_SET (rval, expected));
12441 /* Emit the CAS. */
12442 emit_insn (gen (rval, mem, desired, model));
12444 /* Compare the expected value with the value loaded by the CAS, to establish
12445 whether the swap was made. */
12446 aarch64_gen_compare_reg (EQ, rval, expected);
12449 /* Split a compare and swap pattern. */
12451 void
12452 aarch64_split_compare_and_swap (rtx operands[])
12454 rtx rval, mem, oldval, newval, scratch;
12455 machine_mode mode;
12456 bool is_weak;
12457 rtx_code_label *label1, *label2;
12458 rtx x, cond;
12459 enum memmodel model;
12460 rtx model_rtx;
12462 rval = operands[0];
12463 mem = operands[1];
12464 oldval = operands[2];
12465 newval = operands[3];
12466 is_weak = (operands[4] != const0_rtx);
12467 model_rtx = operands[5];
12468 scratch = operands[7];
12469 mode = GET_MODE (mem);
12470 model = memmodel_from_int (INTVAL (model_rtx));
12472 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12473 loop:
12474 .label1:
12475 LD[A]XR rval, [mem]
12476 CBNZ rval, .label2
12477 ST[L]XR scratch, newval, [mem]
12478 CBNZ scratch, .label1
12479 .label2:
12480 CMP rval, 0. */
12481 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12483 label1 = NULL;
12484 if (!is_weak)
12486 label1 = gen_label_rtx ();
12487 emit_label (label1);
12489 label2 = gen_label_rtx ();
12491 /* The initial load can be relaxed for a __sync operation since a final
12492 barrier will be emitted to stop code hoisting. */
12493 if (is_mm_sync (model))
12494 aarch64_emit_load_exclusive (mode, rval, mem,
12495 GEN_INT (MEMMODEL_RELAXED));
12496 else
12497 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12499 if (strong_zero_p)
12501 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12502 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12503 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12504 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12506 else
12508 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12509 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12510 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12511 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12512 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12515 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12517 if (!is_weak)
12519 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12520 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12521 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12522 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12524 else
12526 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12527 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12528 emit_insn (gen_rtx_SET (cond, x));
12531 emit_label (label2);
12532 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
12533 to set the condition flags. If this is not used, it will be removed by
12534 later passes. */
12535 if (strong_zero_p)
12537 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12538 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12539 emit_insn (gen_rtx_SET (cond, x));
12541 /* Emit any final barrier needed for a __sync operation. */
12542 if (is_mm_sync (model))
12543 aarch64_emit_post_barrier (model);
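/* Illustrative sketch: for the general strong case (OLDVAL not known to be
   zero) the split above produces roughly

     .label1:
        ld[a]xr  rval, [mem]
        cmp      rval, oldval
        b.ne     .label2
        st[l]xr  scratch, newval, [mem]
        cbnz     scratch, .label1
     .label2:

   leaving the comparison result in the condition flags for the caller.  */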
12546 /* Emit a BIC instruction. */
12548 static void
12549 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12551 rtx shift_rtx = GEN_INT (shift);
12552 rtx (*gen) (rtx, rtx, rtx, rtx);
12554 switch (mode)
12556 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12557 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12558 default:
12559 gcc_unreachable ();
12562 emit_insn (gen (dst, s2, shift_rtx, s1));
12565 /* Emit an atomic swap. */
12567 static void
12568 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12569 rtx mem, rtx model)
12571 rtx (*gen) (rtx, rtx, rtx, rtx);
12573 switch (mode)
12575 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12576 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12577 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12578 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12579 default:
12580 gcc_unreachable ();
12583 emit_insn (gen (dst, mem, value, model));
12586 /* Operations supported by aarch64_emit_atomic_load_op. */
12588 enum aarch64_atomic_load_op_code
12590 AARCH64_LDOP_PLUS, /* A + B */
12591 AARCH64_LDOP_XOR, /* A ^ B */
12592 AARCH64_LDOP_OR, /* A | B */
12593 AARCH64_LDOP_BIC /* A & ~B */
12596 /* Emit an atomic load-operate. */
12598 static void
12599 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12600 machine_mode mode, rtx dst, rtx src,
12601 rtx mem, rtx model)
12603 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12604 const aarch64_atomic_load_op_fn plus[] =
12606 gen_aarch64_atomic_loadaddqi,
12607 gen_aarch64_atomic_loadaddhi,
12608 gen_aarch64_atomic_loadaddsi,
12609 gen_aarch64_atomic_loadadddi
12611 const aarch64_atomic_load_op_fn eor[] =
12613 gen_aarch64_atomic_loadeorqi,
12614 gen_aarch64_atomic_loadeorhi,
12615 gen_aarch64_atomic_loadeorsi,
12616 gen_aarch64_atomic_loadeordi
12618 const aarch64_atomic_load_op_fn ior[] =
12620 gen_aarch64_atomic_loadsetqi,
12621 gen_aarch64_atomic_loadsethi,
12622 gen_aarch64_atomic_loadsetsi,
12623 gen_aarch64_atomic_loadsetdi
12625 const aarch64_atomic_load_op_fn bic[] =
12627 gen_aarch64_atomic_loadclrqi,
12628 gen_aarch64_atomic_loadclrhi,
12629 gen_aarch64_atomic_loadclrsi,
12630 gen_aarch64_atomic_loadclrdi
12632 aarch64_atomic_load_op_fn gen;
12633 int idx = 0;
12635 switch (mode)
12637 case E_QImode: idx = 0; break;
12638 case E_HImode: idx = 1; break;
12639 case E_SImode: idx = 2; break;
12640 case E_DImode: idx = 3; break;
12641 default:
12642 gcc_unreachable ();
12645 switch (code)
12647 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12648 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12649 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12650 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12651 default:
12652 gcc_unreachable ();
12655 emit_insn (gen (dst, mem, src, model));
12658 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12659 location to store the data read from memory. OUT_RESULT is the location to
12660 store the result of the operation. MEM is the memory location to read and
12661 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12662 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12663 be NULL. */
12665 void
12666 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12667 rtx mem, rtx value, rtx model_rtx)
12669 machine_mode mode = GET_MODE (mem);
12670 machine_mode wmode = (mode == DImode ? DImode : SImode);
12671 const bool short_mode = (mode < SImode);
12672 aarch64_atomic_load_op_code ldop_code;
12673 rtx src;
12674 rtx x;
12676 if (out_data)
12677 out_data = gen_lowpart (mode, out_data);
12679 if (out_result)
12680 out_result = gen_lowpart (mode, out_result);
12682 /* Make sure the value is in a register, putting it into a destination
12683 register if it needs to be manipulated. */
12684 if (!register_operand (value, mode)
12685 || code == AND || code == MINUS)
12687 src = out_result ? out_result : out_data;
12688 emit_move_insn (src, gen_lowpart (mode, value));
12690 else
12691 src = value;
12692 gcc_assert (register_operand (src, mode));
12694 /* Preprocess the data for the operation as necessary. If the operation is
12695 a SET then emit a swap instruction and finish. */
12696 switch (code)
12698 case SET:
12699 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12700 return;
12702 case MINUS:
12703 /* Negate the value and treat it as a PLUS. */
12705 rtx neg_src;
12707 /* Resize the value if necessary. */
12708 if (short_mode)
12709 src = gen_lowpart (wmode, src);
12711 neg_src = gen_rtx_NEG (wmode, src);
12712 emit_insn (gen_rtx_SET (src, neg_src));
12714 if (short_mode)
12715 src = gen_lowpart (mode, src);
12717 /* Fall-through. */
12718 case PLUS:
12719 ldop_code = AARCH64_LDOP_PLUS;
12720 break;
12722 case IOR:
12723 ldop_code = AARCH64_LDOP_OR;
12724 break;
12726 case XOR:
12727 ldop_code = AARCH64_LDOP_XOR;
12728 break;
12730 case AND:
12732 rtx not_src;
12734 /* Resize the value if necessary. */
12735 if (short_mode)
12736 src = gen_lowpart (wmode, src);
12738 not_src = gen_rtx_NOT (wmode, src);
12739 emit_insn (gen_rtx_SET (src, not_src));
12741 if (short_mode)
12742 src = gen_lowpart (mode, src);
12744 ldop_code = AARCH64_LDOP_BIC;
12745 break;
12747 default:
12748 /* The operation can't be done with atomic instructions. */
12749 gcc_unreachable ();
12752 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12754 /* If necessary, calculate the data in memory after the update by redoing the
12755 operation from values in registers. */
12756 if (!out_result)
12757 return;
12759 if (short_mode)
12761 src = gen_lowpart (wmode, src);
12762 out_data = gen_lowpart (wmode, out_data);
12763 out_result = gen_lowpart (wmode, out_result);
12766 x = NULL_RTX;
12768 switch (code)
12770 case MINUS:
12771 case PLUS:
12772 x = gen_rtx_PLUS (wmode, out_data, src);
12773 break;
12774 case IOR:
12775 x = gen_rtx_IOR (wmode, out_data, src);
12776 break;
12777 case XOR:
12778 x = gen_rtx_XOR (wmode, out_data, src);
12779 break;
12780 case AND:
12781 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12782 return;
12783 default:
12784 gcc_unreachable ();
12787 emit_set_insn (out_result, x);
12789 return;
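/* Illustrative sketch (tmp is a scratch register): LSE has no LDSUB or
   LDAND, so the preprocessing above rewrites, roughly,

     __atomic_fetch_sub (&x, n, ...)   ->   neg tmp, n ; ldadd tmp, old, [x]
     __atomic_fetch_and (&x, m, ...)   ->   mvn tmp, m ; ldclr tmp, old, [x]

   and, when the value after the operation is also wanted (OUT_RESULT), it
   recomputes that value from the loaded data and TMP in registers.  */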
12792 /* Split an atomic operation. */
12794 void
12795 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12796 rtx value, rtx model_rtx, rtx cond)
12798 machine_mode mode = GET_MODE (mem);
12799 machine_mode wmode = (mode == DImode ? DImode : SImode);
12800 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12801 const bool is_sync = is_mm_sync (model);
12802 rtx_code_label *label;
12803 rtx x;
12805 /* Split the atomic operation into a sequence. */
12806 label = gen_label_rtx ();
12807 emit_label (label);
12809 if (new_out)
12810 new_out = gen_lowpart (wmode, new_out);
12811 if (old_out)
12812 old_out = gen_lowpart (wmode, old_out);
12813 else
12814 old_out = new_out;
12815 value = simplify_gen_subreg (wmode, value, mode, 0);
12817 /* The initial load can be relaxed for a __sync operation since a final
12818 barrier will be emitted to stop code hoisting. */
12819 if (is_sync)
12820 aarch64_emit_load_exclusive (mode, old_out, mem,
12821 GEN_INT (MEMMODEL_RELAXED));
12822 else
12823 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12825 switch (code)
12827 case SET:
12828 new_out = value;
12829 break;
12831 case NOT:
12832 x = gen_rtx_AND (wmode, old_out, value);
12833 emit_insn (gen_rtx_SET (new_out, x));
12834 x = gen_rtx_NOT (wmode, new_out);
12835 emit_insn (gen_rtx_SET (new_out, x));
12836 break;
12838 case MINUS:
12839 if (CONST_INT_P (value))
12841 value = GEN_INT (-INTVAL (value));
12842 code = PLUS;
12844 /* Fall through. */
12846 default:
12847 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12848 emit_insn (gen_rtx_SET (new_out, x));
12849 break;
12852 aarch64_emit_store_exclusive (mode, cond, mem,
12853 gen_lowpart (mode, new_out), model_rtx);
12855 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12856 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12857 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12858 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12860 /* Emit any final barrier needed for a __sync operation. */
12861 if (is_sync)
12862 aarch64_emit_post_barrier (model);
12865 static void
12866 aarch64_init_libfuncs (void)
12868 /* Half-precision float operations. The compiler handles all operations
12869 with NULL libfuncs by converting to SFmode. */
12871 /* Conversions. */
12872 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12873 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12875 /* Arithmetic. */
12876 set_optab_libfunc (add_optab, HFmode, NULL);
12877 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12878 set_optab_libfunc (smul_optab, HFmode, NULL);
12879 set_optab_libfunc (neg_optab, HFmode, NULL);
12880 set_optab_libfunc (sub_optab, HFmode, NULL);
12882 /* Comparisons. */
12883 set_optab_libfunc (eq_optab, HFmode, NULL);
12884 set_optab_libfunc (ne_optab, HFmode, NULL);
12885 set_optab_libfunc (lt_optab, HFmode, NULL);
12886 set_optab_libfunc (le_optab, HFmode, NULL);
12887 set_optab_libfunc (ge_optab, HFmode, NULL);
12888 set_optab_libfunc (gt_optab, HFmode, NULL);
12889 set_optab_libfunc (unord_optab, HFmode, NULL);
12892 /* Target hook for c_mode_for_suffix. */
12893 static machine_mode
12894 aarch64_c_mode_for_suffix (char suffix)
12896 if (suffix == 'q')
12897 return TFmode;
12899 return VOIDmode;
12902 /* We can only represent floating point constants which will fit in
12903 "quarter-precision" values. These values are characterised by
12904 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
12907 (-1)^s * (n/16) * 2^r
12909 Where:
12910 's' is the sign bit.
12911 'n' is an integer in the range 16 <= n <= 31.
12912 'r' is an integer in the range -3 <= r <= 4. */
12914 /* Return true iff X can be represented by a quarter-precision
12915 floating point immediate operand. Note that we cannot represent 0.0. */
12916 bool
12917 aarch64_float_const_representable_p (rtx x)
12919 /* This represents our current view of how many bits
12920 make up the mantissa. */
12921 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12922 int exponent;
12923 unsigned HOST_WIDE_INT mantissa, mask;
12924 REAL_VALUE_TYPE r, m;
12925 bool fail;
12927 if (!CONST_DOUBLE_P (x))
12928 return false;
12930 /* We don't support HFmode constants yet. */
12931 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12932 return false;
12934 r = *CONST_DOUBLE_REAL_VALUE (x);
12936 /* We cannot represent infinities, NaNs or +/-zero. We won't
12937 know if we have +zero until we analyse the mantissa, but we
12938 can reject the other invalid values. */
12939 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12940 || REAL_VALUE_MINUS_ZERO (r))
12941 return false;
12943 /* Extract exponent. */
12944 r = real_value_abs (&r);
12945 exponent = REAL_EXP (&r);
12947 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12948 highest (sign) bit, with a fixed binary point at bit point_pos.
12949 W holds the low part of the mantissa in its first element, the high part in its second.
12950 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12951 bits for the mantissa, this can fail (low bits will be lost). */
12952 real_ldexp (&m, &r, point_pos - exponent);
12953 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12955 /* If the low part of the mantissa has bits set we cannot represent
12956 the value. */
12957 if (w.ulow () != 0)
12958 return false;
12959 /* We have rejected the lower HOST_WIDE_INT, so update our
12960 understanding of how many bits lie in the mantissa and
12961 look only at the high HOST_WIDE_INT. */
12962 mantissa = w.elt (1);
12963 point_pos -= HOST_BITS_PER_WIDE_INT;
12965 /* We can only represent values with a mantissa of the form 1.xxxx. */
12966 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12967 if ((mantissa & mask) != 0)
12968 return false;
12970 /* Having filtered unrepresentable values, we may now remove all
12971 but the highest 5 bits. */
12972 mantissa >>= point_pos - 5;
12974 /* We cannot represent the value 0.0, so reject it. This is handled
12975 elsewhere. */
12976 if (mantissa == 0)
12977 return false;
12979 /* Then, as bit 4 is always set, we can mask it off, leaving
12980 the mantissa in the range [0, 15]. */
12981 mantissa &= ~(1 << 4);
12982 gcc_assert (mantissa <= 15);
12984 /* GCC internally does not use IEEE754-like encoding (where normalized
12985 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
12986 Our mantissa values are shifted 4 places to the left relative to
12987 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12988 by 5 places to correct for GCC's representation. */
12989 exponent = 5 - exponent;
12991 return (exponent >= 0 && exponent <= 7);
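/* Illustrative sketch: 0.5 is representable here, since
   0.5 = (-1)^0 * (16/16) * 2^-1 (s = 0, n = 16, r = -1), and so can be
   materialised as "fmov d0, #0.5"; 0.1 has no exact form n/16 * 2^r and is
   rejected above, forcing a constant-pool (or integer move) sequence
   instead.  */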
12994 char*
12995 aarch64_output_simd_mov_immediate (rtx const_vector,
12996 machine_mode mode,
12997 unsigned width)
12999 bool is_valid;
13000 static char templ[40];
13001 const char *mnemonic;
13002 const char *shift_op;
13003 unsigned int lane_count = 0;
13004 char element_char;
13006 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13008 /* This will return true to show CONST_VECTOR is legal for use as
13009 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
13010 also update INFO to show how the immediate should be generated. */
13011 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13012 gcc_assert (is_valid);
13014 element_char = sizetochar (info.element_width);
13015 lane_count = width / info.element_width;
13017 mode = GET_MODE_INNER (mode);
13018 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13020 gcc_assert (info.shift == 0 && ! info.mvn);
13021 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13022 move immediate path. */
13023 if (aarch64_float_const_zero_rtx_p (info.value))
13024 info.value = GEN_INT (0);
13025 else
13027 const unsigned int buf_size = 20;
13028 char float_buf[buf_size] = {'\0'};
13029 real_to_decimal_for_mode (float_buf,
13030 CONST_DOUBLE_REAL_VALUE (info.value),
13031 buf_size, buf_size, 1, mode);
13033 if (lane_count == 1)
13034 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13035 else
13036 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13037 lane_count, element_char, float_buf);
13038 return templ;
13042 mnemonic = info.mvn ? "mvni" : "movi";
13043 shift_op = info.msl ? "msl" : "lsl";
13045 gcc_assert (CONST_INT_P (info.value));
13046 if (lane_count == 1)
13047 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13048 mnemonic, UINTVAL (info.value));
13049 else if (info.shift)
13050 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13051 ", %s %d", mnemonic, lane_count, element_char,
13052 UINTVAL (info.value), shift_op, info.shift);
13053 else
13054 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13055 mnemonic, lane_count, element_char, UINTVAL (info.value));
13056 return templ;
13059 char*
13060 aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
13063 /* If a floating point number was passed and we desire to use it in an
13064 integer mode, do the conversion to integer. */
13065 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13067 unsigned HOST_WIDE_INT ival;
13068 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13069 gcc_unreachable ();
13070 immediate = gen_int_mode (ival, mode);
13073 machine_mode vmode;
13074 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we use
13075 a 128-bit vector mode. */
13076 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13078 gcc_assert (!VECTOR_MODE_P (mode));
13079 vmode = aarch64_simd_container_mode (mode, width);
13080 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13081 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13084 /* Split operands into moves from op[1] + op[2] into op[0]. */
13086 void
13087 aarch64_split_combinev16qi (rtx operands[3])
13089 unsigned int dest = REGNO (operands[0]);
13090 unsigned int src1 = REGNO (operands[1]);
13091 unsigned int src2 = REGNO (operands[2]);
13092 machine_mode halfmode = GET_MODE (operands[1]);
13093 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13094 rtx destlo, desthi;
13096 gcc_assert (halfmode == V16QImode);
13098 if (src1 == dest && src2 == dest + halfregs)
13100 /* No-op move. Can't split to nothing; emit something. */
13101 emit_note (NOTE_INSN_DELETED);
13102 return;
13105 /* Preserve register attributes for variable tracking. */
13106 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13107 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13108 GET_MODE_SIZE (halfmode));
13110 /* Special case of reversed high/low parts. */
13111 if (reg_overlap_mentioned_p (operands[2], destlo)
13112 && reg_overlap_mentioned_p (operands[1], desthi))
13114 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13115 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13116 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13118 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13120 /* Try to avoid unnecessary moves if part of the result
13121 is in the right place already. */
13122 if (src1 != dest)
13123 emit_move_insn (destlo, operands[1]);
13124 if (src2 != dest + halfregs)
13125 emit_move_insn (desthi, operands[2]);
13127 else
13129 if (src2 != dest + halfregs)
13130 emit_move_insn (desthi, operands[2]);
13131 if (src1 != dest)
13132 emit_move_insn (destlo, operands[1]);
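/* Illustrative sketch: the three EORs in the reversed-halves case above are
   the classic scratch-free swap,

     a ^= b;  b ^= a;  a ^= b;

   applied to whole V16QI registers, so the two halves can be exchanged even
   though every destination overlaps a source.  */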
13136 /* vec_perm support. */
13138 #define MAX_VECT_LEN 16
13140 struct expand_vec_perm_d
13142 rtx target, op0, op1;
13143 unsigned char perm[MAX_VECT_LEN];
13144 machine_mode vmode;
13145 unsigned char nelt;
13146 bool one_vector_p;
13147 bool testing_p;
13150 /* Generate a variable permutation. */
13152 static void
13153 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13155 machine_mode vmode = GET_MODE (target);
13156 bool one_vector_p = rtx_equal_p (op0, op1);
13158 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13159 gcc_checking_assert (GET_MODE (op0) == vmode);
13160 gcc_checking_assert (GET_MODE (op1) == vmode);
13161 gcc_checking_assert (GET_MODE (sel) == vmode);
13162 gcc_checking_assert (TARGET_SIMD);
13164 if (one_vector_p)
13166 if (vmode == V8QImode)
13168 /* Expand the argument to V16QImode by duplicating it. */
13169 rtx pair = gen_reg_rtx (V16QImode);
13170 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13171 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13173 else
13175 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13178 else
13180 rtx pair;
13182 if (vmode == V8QImode)
13184 pair = gen_reg_rtx (V16QImode);
13185 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13186 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13188 else
13190 pair = gen_reg_rtx (OImode);
13191 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13192 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13197 void
13198 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13200 machine_mode vmode = GET_MODE (target);
13201 unsigned int nelt = GET_MODE_NUNITS (vmode);
13202 bool one_vector_p = rtx_equal_p (op0, op1);
13203 rtx mask;
13205 /* The TBL instruction does not use a modulo index, so we must take care
13206 of that ourselves. */
13207 mask = aarch64_simd_gen_const_vector_dup (vmode,
13208 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13209 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13211 /* For big-endian, we also need to reverse the index within the vector
13212 (but not which vector). */
13213 if (BYTES_BIG_ENDIAN)
13215 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13216 if (!one_vector_p)
13217 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13218 sel = expand_simple_binop (vmode, XOR, sel, mask,
13219 NULL, 0, OPTAB_LIB_WIDEN);
13221 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
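/* Illustrative sketch: for a two-input V16QI permute the code above
   effectively computes

     sel = sel & 31;      (wrap indices, since TBL writes 0 for out-of-range)
     sel = sel ^ 15;      (big-endian only: reverse the lane numbering
                           within each input vector, but not which vector)

   before handing the selector to the TBL expansion.  */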
13224 /* Recognize patterns suitable for the TRN instructions. */
13225 static bool
13226 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13228 unsigned int i, odd, mask, nelt = d->nelt;
13229 rtx out, in0, in1, x;
13230 rtx (*gen) (rtx, rtx, rtx);
13231 machine_mode vmode = d->vmode;
13233 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13234 return false;
13236 /* Note that these are little-endian tests.
13237 We correct for big-endian later. */
13238 if (d->perm[0] == 0)
13239 odd = 0;
13240 else if (d->perm[0] == 1)
13241 odd = 1;
13242 else
13243 return false;
13244 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13246 for (i = 0; i < nelt; i += 2)
13248 if (d->perm[i] != i + odd)
13249 return false;
13250 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13251 return false;
13254 /* Success! */
13255 if (d->testing_p)
13256 return true;
13258 in0 = d->op0;
13259 in1 = d->op1;
13260 if (BYTES_BIG_ENDIAN)
13262 x = in0, in0 = in1, in1 = x;
13263 odd = !odd;
13265 out = d->target;
13267 if (odd)
13269 switch (vmode)
13271 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13272 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13273 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13274 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13275 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13276 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13277 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13278 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13279 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13280 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13281 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13282 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13283 default:
13284 return false;
13287 else
13289 switch (vmode)
13291 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13292 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13293 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13294 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13295 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13296 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13297 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13298 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13299 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13300 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13301 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13302 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13303 default:
13304 return false;
13308 emit_insn (gen (out, in0, in1));
13309 return true;
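/* Illustrative sketch: for V4SImode the two-vector permutation { 0, 4, 2, 6 }
   passes the test above with odd == 0 and is emitted as TRN1, while
   { 1, 5, 3, 7 } matches with odd == 1 and becomes TRN2 (on big-endian the
   operands and ODD are swapped first).  */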
13312 /* Recognize patterns suitable for the UZP instructions. */
13313 static bool
13314 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13316 unsigned int i, odd, mask, nelt = d->nelt;
13317 rtx out, in0, in1, x;
13318 rtx (*gen) (rtx, rtx, rtx);
13319 machine_mode vmode = d->vmode;
13321 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13322 return false;
13324 /* Note that these are little-endian tests.
13325 We correct for big-endian later. */
13326 if (d->perm[0] == 0)
13327 odd = 0;
13328 else if (d->perm[0] == 1)
13329 odd = 1;
13330 else
13331 return false;
13332 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13334 for (i = 0; i < nelt; i++)
13336 unsigned elt = (i * 2 + odd) & mask;
13337 if (d->perm[i] != elt)
13338 return false;
13341 /* Success! */
13342 if (d->testing_p)
13343 return true;
13345 in0 = d->op0;
13346 in1 = d->op1;
13347 if (BYTES_BIG_ENDIAN)
13349 x = in0, in0 = in1, in1 = x;
13350 odd = !odd;
13352 out = d->target;
13354 if (odd)
13356 switch (vmode)
13358 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13359 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13360 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13361 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13362 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13363 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13364 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13365 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13366 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13367 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13368 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13369 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13370 default:
13371 return false;
13374 else
13376 switch (vmode)
13378 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13379 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13380 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13381 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13382 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13383 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13384 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13385 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13386 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13387 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13388 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13389 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13390 default:
13391 return false;
13395 emit_insn (gen (out, in0, in1));
13396 return true;
13399 /* Recognize patterns suitable for the ZIP instructions. */
13400 static bool
13401 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13403 unsigned int i, high, mask, nelt = d->nelt;
13404 rtx out, in0, in1, x;
13405 rtx (*gen) (rtx, rtx, rtx);
13406 machine_mode vmode = d->vmode;
13408 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13409 return false;
13411 /* Note that these are little-endian tests.
13412 We correct for big-endian later. */
13413 high = nelt / 2;
13414 if (d->perm[0] == high)
13415 /* Do Nothing. */
13417 else if (d->perm[0] == 0)
13418 high = 0;
13419 else
13420 return false;
13421 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13423 for (i = 0; i < nelt / 2; i++)
13425 unsigned elt = (i + high) & mask;
13426 if (d->perm[i * 2] != elt)
13427 return false;
13428 elt = (elt + nelt) & mask;
13429 if (d->perm[i * 2 + 1] != elt)
13430 return false;
13433 /* Success! */
13434 if (d->testing_p)
13435 return true;
13437 in0 = d->op0;
13438 in1 = d->op1;
13439 if (BYTES_BIG_ENDIAN)
13441 x = in0, in0 = in1, in1 = x;
13442 high = !high;
13444 out = d->target;
13446 if (high)
13448 switch (vmode)
13450 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13451 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13452 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13453 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13454 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13455 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13456 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13457 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13458 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13459 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13460 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13461 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13462 default:
13463 return false;
13466 else
13468 switch (vmode)
13470 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13471 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13472 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13473 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13474 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13475 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13476 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13477 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13478 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13479 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13480 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13481 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13482 default:
13483 return false;
13487 emit_insn (gen (out, in0, in1));
13488 return true;
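/* Illustrative sketch: for V4SImode the permutation { 0, 4, 1, 5 }
   interleaves the low halves of the two inputs and matches above with
   high == 0, giving ZIP1; { 2, 6, 3, 7 } interleaves the high halves and
   becomes ZIP2.  */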
13491 /* Recognize patterns for the EXT insn. */
13493 static bool
13494 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13496 unsigned int i, nelt = d->nelt;
13497 rtx (*gen) (rtx, rtx, rtx, rtx);
13498 rtx offset;
13500 unsigned int location = d->perm[0]; /* Always < nelt. */
13502 /* Check if the extracted indices are increasing by one. */
13503 for (i = 1; i < nelt; i++)
13505 unsigned int required = location + i;
13506 if (d->one_vector_p)
13508 /* We'll pass the same vector in twice, so allow indices to wrap. */
13509 required &= (nelt - 1);
13511 if (d->perm[i] != required)
13512 return false;
13515 switch (d->vmode)
13517 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13518 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13519 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13520 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13521 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13522 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13523 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13524 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13525 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13526 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13527 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13528 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13529 default:
13530 return false;
13533 /* Success! */
13534 if (d->testing_p)
13535 return true;
13537 /* The case where (location == 0) is a no-op for both big- and little-endian,
13538 and is removed by the mid-end at optimization levels -O1 and higher. */
13540 if (BYTES_BIG_ENDIAN && (location != 0))
13542 /* After setup, we want the high elements of the first vector (stored
13543 at the LSB end of the register), and the low elements of the second
13544 vector (stored at the MSB end of the register). So swap. */
13545 std::swap (d->op0, d->op1);
13546 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13547 location = nelt - location;
13550 offset = GEN_INT (location);
13551 emit_insn (gen (d->target, d->op0, d->op1, offset));
13552 return true;
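/* Illustrative sketch: for V4SImode the two-vector permutation { 3, 4, 5, 6 }
   has indices increasing by one from location 3, so it is matched above and
   emitted as an EXT with element offset 3 (which the insn pattern scales to
   a byte immediate); on big-endian the operands are swapped and the offset
   becomes nelt - location = 1.  */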
13555 /* Recognize patterns for the REV insns. */
13557 static bool
13558 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13560 unsigned int i, j, diff, nelt = d->nelt;
13561 rtx (*gen) (rtx, rtx);
13563 if (!d->one_vector_p)
13564 return false;
13566 diff = d->perm[0];
13567 switch (diff)
13569 case 7:
13570 switch (d->vmode)
13572 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13573 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13574 default:
13575 return false;
13577 break;
13578 case 3:
13579 switch (d->vmode)
13581 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13582 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13583 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13584 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13585 default:
13586 return false;
13588 break;
13589 case 1:
13590 switch (d->vmode)
13592 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13593 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13594 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13595 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13596 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13597 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13598 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13599 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13600 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13601 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13602 default:
13603 return false;
13605 break;
13606 default:
13607 return false;
13610 for (i = 0; i < nelt ; i += diff + 1)
13611 for (j = 0; j <= diff; j += 1)
13613 /* This is guaranteed to be true as the value of diff
13614 is 7, 3 or 1, and we should have enough elements in the
13615 queue to generate this. Getting a vector mask with a
13616 value of diff other than these values implies that
13617 something is wrong by the time we get here. */
13618 gcc_assert (i + j < nelt);
13619 if (d->perm[i + j] != i + diff - j)
13620 return false;
13623 /* Success! */
13624 if (d->testing_p)
13625 return true;
13627 emit_insn (gen (d->target, d->op0));
13628 return true;
13631 static bool
13632 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13634 rtx (*gen) (rtx, rtx, rtx);
13635 rtx out = d->target;
13636 rtx in0;
13637 machine_mode vmode = d->vmode;
13638 unsigned int i, elt, nelt = d->nelt;
13639 rtx lane;
13641 elt = d->perm[0];
13642 for (i = 1; i < nelt; i++)
13644 if (elt != d->perm[i])
13645 return false;
13648 /* The generic preparation in aarch64_expand_vec_perm_const_1
13649 swaps the operand order and the permute indices if it finds
13650 d->perm[0] to be in the second operand. Thus, we can always
13651 use d->op0 and need not do any extra arithmetic to get the
13652 correct lane number. */
13653 in0 = d->op0;
13654 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13656 switch (vmode)
13658 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13659 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13660 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13661 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13662 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13663 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13664 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13665 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13666 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13667 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13668 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13669 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13670 default:
13671 return false;
13674 emit_insn (gen (out, in0, lane));
13675 return true;
13678 static bool
13679 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13681 rtx rperm[MAX_VECT_LEN], sel;
13682 machine_mode vmode = d->vmode;
13683 unsigned int i, nelt = d->nelt;
13685 if (d->testing_p)
13686 return true;
13688 /* Generic code will try constant permutation twice: once with the
13689 original mode and again with the elements lowered to QImode.
13690 So wait and don't do the selector expansion ourselves. */
13691 if (vmode != V8QImode && vmode != V16QImode)
13692 return false;
13694 for (i = 0; i < nelt; ++i)
13696 int nunits = GET_MODE_NUNITS (vmode);
13698 /* If big-endian and two vectors, we end up with a weird mixed-endian
13699 mode on NEON. Reverse the index within each word but not the word
13700 itself. */
13701 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13702 : d->perm[i]);
13704 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13705 sel = force_reg (vmode, sel);
13707 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13708 return true;
13711 static bool
13712 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13714 /* The pattern matching functions above are written to look for a small
13715 number to begin the sequence (0, 1, N/2). If we begin with an index
13716 from the second operand, we can swap the operands. */
13717 if (d->perm[0] >= d->nelt)
13719 unsigned i, nelt = d->nelt;
13721 gcc_assert (nelt == (nelt & -nelt));
13722 for (i = 0; i < nelt; ++i)
13723 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13725 std::swap (d->op0, d->op1);
13728 if (TARGET_SIMD)
13730 if (aarch64_evpc_rev (d))
13731 return true;
13732 else if (aarch64_evpc_ext (d))
13733 return true;
13734 else if (aarch64_evpc_dup (d))
13735 return true;
13736 else if (aarch64_evpc_zip (d))
13737 return true;
13738 else if (aarch64_evpc_uzp (d))
13739 return true;
13740 else if (aarch64_evpc_trn (d))
13741 return true;
13742 return aarch64_evpc_tbl (d);
13744 return false;
13747 /* Expand a vec_perm_const pattern. */
13749 bool
13750 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13752 struct expand_vec_perm_d d;
13753 int i, nelt, which;
13755 d.target = target;
13756 d.op0 = op0;
13757 d.op1 = op1;
13759 d.vmode = GET_MODE (target);
13760 gcc_assert (VECTOR_MODE_P (d.vmode));
13761 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13762 d.testing_p = false;
13764 for (i = which = 0; i < nelt; ++i)
13766 rtx e = XVECEXP (sel, 0, i);
13767 int ei = INTVAL (e) & (2 * nelt - 1);
13768 which |= (ei < nelt ? 1 : 2);
13769 d.perm[i] = ei;
13772 switch (which)
13774 default:
13775 gcc_unreachable ();
13777 case 3:
13778 d.one_vector_p = false;
13779 if (!rtx_equal_p (op0, op1))
13780 break;
13782 /* The elements of PERM do not suggest that only the first operand
13783 is used, but both operands are identical. Allow easier matching
13784 of the permutation by folding the permutation into the single
13785 input vector. */
13786 /* Fall Through. */
13787 case 2:
13788 for (i = 0; i < nelt; ++i)
13789 d.perm[i] &= nelt - 1;
13790 d.op0 = op1;
13791 d.one_vector_p = true;
13792 break;
13794 case 1:
13795 d.op1 = op0;
13796 d.one_vector_p = true;
13797 break;
13800 return aarch64_expand_vec_perm_const_1 (&d);
13803 static bool
13804 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13805 const unsigned char *sel)
13807 struct expand_vec_perm_d d;
13808 unsigned int i, nelt, which;
13809 bool ret;
13811 d.vmode = vmode;
13812 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13813 d.testing_p = true;
13814 memcpy (d.perm, sel, nelt);
13816 /* Calculate whether all elements are in one vector. */
13817 for (i = which = 0; i < nelt; ++i)
13819 unsigned char e = d.perm[i];
13820 gcc_assert (e < 2 * nelt);
13821 which |= (e < nelt ? 1 : 2);
13824 /* If all elements are from the second vector, reindex as if from the
13825 first vector. */
13826 if (which == 2)
13827 for (i = 0; i < nelt; ++i)
13828 d.perm[i] -= nelt;
13830 /* Check whether the mask can be applied to a single vector. */
13831 d.one_vector_p = (which != 3);
13833 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13834 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13835 if (!d.one_vector_p)
13836 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13838 start_sequence ();
13839 ret = aarch64_expand_vec_perm_const_1 (&d);
13840 end_sequence ();
13842 return ret;
13846 aarch64_reverse_mask (machine_mode mode)
13848 /* We have to reverse each vector because we don't have
13849 a permuted load that can reverse-load according to ABI rules. */
13850 rtx mask;
13851 rtvec v = rtvec_alloc (16);
13852 int i, j;
13853 int nunits = GET_MODE_NUNITS (mode);
13854 int usize = GET_MODE_UNIT_SIZE (mode);
13856 gcc_assert (BYTES_BIG_ENDIAN);
13857 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13859 for (i = 0; i < nunits; i++)
13860 for (j = 0; j < usize; j++)
13861 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13862 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13863 return force_reg (V16QImode, mask);
13866 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13867 However due to issues with register allocation it is preferable to avoid
13868 tying integer scalar and FP scalar modes. Executing integer operations
13869 in general registers is better than treating them as scalar vector
13870 operations. This reduces latency and avoids redundant int<->FP moves.
13871 So tie modes if they are either of the same class, or vector modes with
13872 other vector modes, vector structs or any scalar mode.
13875 bool
13876 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13878 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13879 return true;
13881 /* We specifically want to allow elements of "structure" modes to
13882 be tieable to the structure. This more general condition allows
13883 other rarer situations too. */
13884 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13885 return true;
13887 /* Also allow any scalar modes with vectors. */
13888 if (aarch64_vector_mode_supported_p (mode1)
13889 || aarch64_vector_mode_supported_p (mode2))
13890 return true;
13892 return false;
13895 /* Return a new RTX holding the result of moving POINTER forward by
13896 AMOUNT bytes. */
13898 static rtx
13899 aarch64_move_pointer (rtx pointer, int amount)
13901 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13903 return adjust_automodify_address (pointer, GET_MODE (pointer),
13904 next, amount);
13907 /* Return a new RTX holding the result of moving POINTER forward by the
13908 size of the mode it points to. */
13910 static rtx
13911 aarch64_progress_pointer (rtx pointer)
13913 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13915 return aarch64_move_pointer (pointer, amount);
13918 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13919 MODE bytes. */
13921 static void
13922 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13923 machine_mode mode)
13925 rtx reg = gen_reg_rtx (mode);
13927 /* "Cast" the pointers to the correct mode. */
13928 *src = adjust_address (*src, mode, 0);
13929 *dst = adjust_address (*dst, mode, 0);
13930 /* Emit the memcpy. */
13931 emit_move_insn (reg, *src);
13932 emit_move_insn (*dst, reg);
13933 /* Move the pointers forward. */
13934 *src = aarch64_progress_pointer (*src);
13935 *dst = aarch64_progress_pointer (*dst);
13938 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13939 we succeed, otherwise return false. */
13941 bool
13942 aarch64_expand_movmem (rtx *operands)
13944 unsigned int n;
13945 rtx dst = operands[0];
13946 rtx src = operands[1];
13947 rtx base;
13948 bool speed_p = !optimize_function_for_size_p (cfun);
13950 /* When optimizing for size, give a better estimate of the length of a
13951 memcpy call, but use the default otherwise. */
13952 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13954 /* We can't do anything smart if the amount to copy is not constant. */
13955 if (!CONST_INT_P (operands[2]))
13956 return false;
13958 n = UINTVAL (operands[2]);
13960 /* Try to keep the number of instructions low. For cases below 16 bytes we
13961 need to make at most two moves. For cases above 16 bytes it will be one
13962 move for each 16 byte chunk, then at most two additional moves. */
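/* For example, a 35-byte copy becomes two 16-byte (TImode) copies followed
   by one overlapping 4-byte (SImode) copy of bytes 31..34; a 7-byte copy
   becomes a 4-byte copy of bytes 0..3 followed by an overlapping 4-byte
   copy of bytes 3..6. */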
13963 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13964 return false;
13966 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13967 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13969 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13970 src = adjust_automodify_address (src, VOIDmode, base, 0);
13972 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13973 1-byte chunk. */
13974 if (n < 4)
13976 if (n >= 2)
13978 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13979 n -= 2;
13982 if (n == 1)
13983 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13985 return true;
13988 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13989 4-byte chunk, partially overlapping with the previously copied chunk. */
13990 if (n < 8)
13992 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13993 n -= 4;
13994 if (n > 0)
13996 int move = n - 4;
13998 src = aarch64_move_pointer (src, move);
13999 dst = aarch64_move_pointer (dst, move);
14000 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14002 return true;
14005 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14006 them, then (if applicable) an 8-byte chunk. */
14007 while (n >= 8)
14009 if (n / 16)
14011 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14012 n -= 16;
14014 else
14016 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14017 n -= 8;
14021 /* Finish the final bytes of the copy. We can always do this in one
14022 instruction. We either copy the exact amount we need, or partially
14023 overlap with the previous chunk we copied and copy 4 or 8 bytes. */
14024 if (n == 0)
14025 return true;
14026 else if (n == 1)
14027 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14028 else if (n == 2)
14029 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14030 else if (n == 4)
14031 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14032 else
14034 if (n == 3)
14036 src = aarch64_move_pointer (src, -1);
14037 dst = aarch64_move_pointer (dst, -1);
14038 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14040 else
14042 int move = n - 8;
14044 src = aarch64_move_pointer (src, move);
14045 dst = aarch64_move_pointer (dst, move);
14046 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14050 return true;
14053 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14054 SImode stores. Handle the case when the constant has identical
14055 bottom and top halves. This is beneficial when the two stores can be
14056 merged into an STP and we avoid synthesising potentially expensive
14057 immediates twice. Return true if such a split is possible. */
14059 bool
14060 aarch64_split_dimode_const_store (rtx dst, rtx src)
14062 rtx lo = gen_lowpart (SImode, src);
14063 rtx hi = gen_highpart_mode (SImode, DImode, src);
14065 bool size_p = optimize_function_for_size_p (cfun);
14067 if (!rtx_equal_p (lo, hi))
14068 return false;
14070 unsigned int orig_cost
14071 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14072 unsigned int lo_cost
14073 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14075 /* We want to transform:
14076 MOV x1, 49370
14077 MOVK x1, 0x140, lsl 16
14078 MOVK x1, 0xc0da, lsl 32
14079 MOVK x1, 0x140, lsl 48
14080 STR x1, [x0]
14081 into:
14082 MOV w1, 49370
14083 MOVK w1, 0x140, lsl 16
14084 STP w1, w1, [x0]
14085 So we want to perform this only when we save two instructions
14086 or more. When optimizing for size, however, accept any code size
14087 savings we can. */
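/* In the example above the DImode immediate needs four move instructions
   while the SImode immediate needs only two, so orig_cost (4) exceeds
   lo_cost + 1 (3) and the split is performed whether optimizing for size
   or for speed. */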
14088 if (size_p && orig_cost <= lo_cost)
14089 return false;
14091 if (!size_p
14092 && (orig_cost <= lo_cost + 1))
14093 return false;
14095 rtx mem_lo = adjust_address (dst, SImode, 0);
14096 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14097 return false;
14099 rtx tmp_reg = gen_reg_rtx (SImode);
14100 aarch64_expand_mov_immediate (tmp_reg, lo);
14101 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14102 /* Don't emit an explicit store pair as this may not always be profitable.
14103 Let the sched-fusion logic decide whether to merge them. */
14104 emit_move_insn (mem_lo, tmp_reg);
14105 emit_move_insn (mem_hi, tmp_reg);
14107 return true;
14110 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14112 static unsigned HOST_WIDE_INT
14113 aarch64_asan_shadow_offset (void)
14115 return (HOST_WIDE_INT_1 << 36);
14118 static bool
14119 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14120 unsigned int align,
14121 enum by_pieces_operation op,
14122 bool speed_p)
14124 /* STORE_BY_PIECES can be used when copying a constant string, but
14125 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14126 For now we always fail this and let the move_by_pieces code copy
14127 the string from read-only memory. */
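/* Roughly speaking, storing one arbitrary 64-bit chunk directly needs up to
   MOV + 3 x MOVK to build the constant plus an STR (5 instructions), whereas
   copying the same chunk from its read-only image is just LDR + STR. */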
14128 if (op == STORE_BY_PIECES)
14129 return false;
14131 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14134 static rtx
14135 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14136 int code, tree treeop0, tree treeop1)
14138 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14139 rtx op0, op1;
14140 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14141 insn_code icode;
14142 struct expand_operand ops[4];
14144 start_sequence ();
14145 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14147 op_mode = GET_MODE (op0);
14148 if (op_mode == VOIDmode)
14149 op_mode = GET_MODE (op1);
14151 switch (op_mode)
14153 case E_QImode:
14154 case E_HImode:
14155 case E_SImode:
14156 cmp_mode = SImode;
14157 icode = CODE_FOR_cmpsi;
14158 break;
14160 case E_DImode:
14161 cmp_mode = DImode;
14162 icode = CODE_FOR_cmpdi;
14163 break;
14165 case E_SFmode:
14166 cmp_mode = SFmode;
14167 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14168 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14169 break;
14171 case E_DFmode:
14172 cmp_mode = DFmode;
14173 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14174 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14175 break;
14177 default:
14178 end_sequence ();
14179 return NULL_RTX;
14182 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14183 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14184 if (!op0 || !op1)
14186 end_sequence ();
14187 return NULL_RTX;
14189 *prep_seq = get_insns ();
14190 end_sequence ();
14192 create_fixed_operand (&ops[0], op0);
14193 create_fixed_operand (&ops[1], op1);
14195 start_sequence ();
14196 if (!maybe_expand_insn (icode, 2, ops))
14198 end_sequence ();
14199 return NULL_RTX;
14201 *gen_seq = get_insns ();
14202 end_sequence ();
14204 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14205 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14208 static rtx
14209 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14210 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14212 rtx op0, op1, target;
14213 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14214 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14215 insn_code icode;
14216 struct expand_operand ops[6];
14217 int aarch64_cond;
14219 push_to_sequence (*prep_seq);
14220 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14222 op_mode = GET_MODE (op0);
14223 if (op_mode == VOIDmode)
14224 op_mode = GET_MODE (op1);
14226 switch (op_mode)
14228 case E_QImode:
14229 case E_HImode:
14230 case E_SImode:
14231 cmp_mode = SImode;
14232 icode = CODE_FOR_ccmpsi;
14233 break;
14235 case E_DImode:
14236 cmp_mode = DImode;
14237 icode = CODE_FOR_ccmpdi;
14238 break;
14240 case E_SFmode:
14241 cmp_mode = SFmode;
14242 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14243 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14244 break;
14246 case E_DFmode:
14247 cmp_mode = DFmode;
14248 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14249 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14250 break;
14252 default:
14253 end_sequence ();
14254 return NULL_RTX;
14257 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14258 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14259 if (!op0 || !op1)
14261 end_sequence ();
14262 return NULL_RTX;
14264 *prep_seq = get_insns ();
14265 end_sequence ();
14267 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14268 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14270 if (bit_code != AND)
14272 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14273 GET_MODE (XEXP (prev, 0))),
14274 VOIDmode, XEXP (prev, 0), const0_rtx);
14275 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14278 create_fixed_operand (&ops[0], XEXP (prev, 0));
14279 create_fixed_operand (&ops[1], target);
14280 create_fixed_operand (&ops[2], op0);
14281 create_fixed_operand (&ops[3], op1);
14282 create_fixed_operand (&ops[4], prev);
14283 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14285 push_to_sequence (*gen_seq);
14286 if (!maybe_expand_insn (icode, 6, ops))
14288 end_sequence ();
14289 return NULL_RTX;
14292 *gen_seq = get_insns ();
14293 end_sequence ();
14295 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14298 #undef TARGET_GEN_CCMP_FIRST
14299 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14301 #undef TARGET_GEN_CCMP_NEXT
14302 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14304 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14305 instruction fusion of some sort. */
14307 static bool
14308 aarch64_macro_fusion_p (void)
14310 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14314 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14315 should be kept together during scheduling. */
14317 static bool
14318 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14320 rtx set_dest;
14321 rtx prev_set = single_set (prev);
14322 rtx curr_set = single_set (curr);
14323 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14324 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14326 if (!aarch64_macro_fusion_p ())
14327 return false;
14329 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14331 /* We are trying to match:
14332 prev (mov) == (set (reg r0) (const_int imm16))
14333 curr (movk) == (set (zero_extract (reg r0)
14334 (const_int 16)
14335 (const_int 16))
14336 (const_int imm16_1)) */
14338 set_dest = SET_DEST (curr_set);
14340 if (GET_CODE (set_dest) == ZERO_EXTRACT
14341 && CONST_INT_P (SET_SRC (curr_set))
14342 && CONST_INT_P (SET_SRC (prev_set))
14343 && CONST_INT_P (XEXP (set_dest, 2))
14344 && INTVAL (XEXP (set_dest, 2)) == 16
14345 && REG_P (XEXP (set_dest, 0))
14346 && REG_P (SET_DEST (prev_set))
14347 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14349 return true;
14353 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14356 /* We're trying to match:
14357 prev (adrp) == (set (reg r1)
14358 (high (symbol_ref ("SYM"))))
14359 curr (add) == (set (reg r0)
14360 (lo_sum (reg r1)
14361 (symbol_ref ("SYM"))))
14362 Note that r0 need not be the same as r1, especially
14363 during pre-regalloc scheduling. */
14365 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14366 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14368 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14369 && REG_P (XEXP (SET_SRC (curr_set), 0))
14370 && REGNO (XEXP (SET_SRC (curr_set), 0))
14371 == REGNO (SET_DEST (prev_set))
14372 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14373 XEXP (SET_SRC (curr_set), 1)))
14374 return true;
14378 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14381 /* We're trying to match:
14382 prev (movk) == (set (zero_extract (reg r0)
14383 (const_int 16)
14384 (const_int 32))
14385 (const_int imm16_1))
14386 curr (movk) == (set (zero_extract (reg r0)
14387 (const_int 16)
14388 (const_int 48))
14389 (const_int imm16_2)) */
14391 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14392 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14393 && REG_P (XEXP (SET_DEST (prev_set), 0))
14394 && REG_P (XEXP (SET_DEST (curr_set), 0))
14395 && REGNO (XEXP (SET_DEST (prev_set), 0))
14396 == REGNO (XEXP (SET_DEST (curr_set), 0))
14397 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14398 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14399 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14400 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14401 && CONST_INT_P (SET_SRC (prev_set))
14402 && CONST_INT_P (SET_SRC (curr_set)))
14403 return true;
14406 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14408 /* We're trying to match:
14409 prev (adrp) == (set (reg r0)
14410 (high (symbol_ref ("SYM"))))
14411 curr (ldr) == (set (reg r1)
14412 (mem (lo_sum (reg r0)
14413 (symbol_ref ("SYM")))))
14415 curr (ldr) == (set (reg r1)
14416 (zero_extend (mem
14417 (lo_sum (reg r0)
14418 (symbol_ref ("SYM")))))) */
14419 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14420 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14422 rtx curr_src = SET_SRC (curr_set);
14424 if (GET_CODE (curr_src) == ZERO_EXTEND)
14425 curr_src = XEXP (curr_src, 0);
14427 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14428 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14429 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14430 == REGNO (SET_DEST (prev_set))
14431 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14432 XEXP (SET_SRC (prev_set), 0)))
14433 return true;
14437 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14438 && aarch_crypto_can_dual_issue (prev, curr))
14439 return true;
14441 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14442 && any_condjump_p (curr))
14444 enum attr_type prev_type = get_attr_type (prev);
14446 unsigned int condreg1, condreg2;
14447 rtx cc_reg_1;
14448 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14449 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14451 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14452 && prev
14453 && modified_in_p (cc_reg_1, prev))
14455 /* FIXME: this misses some instructions that ThunderX considers simple
14456 arithmetic; simple shifts are among those missed. */
14457 if (prev_type == TYPE_ALUS_SREG
14458 || prev_type == TYPE_ALUS_IMM
14459 || prev_type == TYPE_LOGICS_REG
14460 || prev_type == TYPE_LOGICS_IMM)
14461 return true;
14465 if (prev_set
14466 && curr_set
14467 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14468 && any_condjump_p (curr))
14470 /* We're trying to match:
14471 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14472 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14473 (const_int 0))
14474 (label_ref ("SYM"))
14475 (pc)) */
14476 if (SET_DEST (curr_set) == (pc_rtx)
14477 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14478 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14479 && REG_P (SET_DEST (prev_set))
14480 && REGNO (SET_DEST (prev_set))
14481 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14483 /* Fuse ALU operations followed by conditional branch instruction. */
14484 switch (get_attr_type (prev))
14486 case TYPE_ALU_IMM:
14487 case TYPE_ALU_SREG:
14488 case TYPE_ADC_REG:
14489 case TYPE_ADC_IMM:
14490 case TYPE_ADCS_REG:
14491 case TYPE_ADCS_IMM:
14492 case TYPE_LOGIC_REG:
14493 case TYPE_LOGIC_IMM:
14494 case TYPE_CSEL:
14495 case TYPE_ADR:
14496 case TYPE_MOV_IMM:
14497 case TYPE_SHIFT_REG:
14498 case TYPE_SHIFT_IMM:
14499 case TYPE_BFM:
14500 case TYPE_RBIT:
14501 case TYPE_REV:
14502 case TYPE_EXTEND:
14503 return true;
14505 default:;
14510 return false;
14513 /* Return true iff the instruction fusion described by OP is enabled. */
14515 bool
14516 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14518 return (aarch64_tune_params.fusible_ops & op) != 0;
14521 /* If MEM is in the form of [base+offset], extract the two parts of the
14522 address into BASE and OFFSET, otherwise return false after clearing
14523 BASE and OFFSET. */
14525 bool
14526 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14528 rtx addr;
14530 gcc_assert (MEM_P (mem));
14532 addr = XEXP (mem, 0);
14534 if (REG_P (addr))
14536 *base = addr;
14537 *offset = const0_rtx;
14538 return true;
14541 if (GET_CODE (addr) == PLUS
14542 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14544 *base = XEXP (addr, 0);
14545 *offset = XEXP (addr, 1);
14546 return true;
14549 *base = NULL_RTX;
14550 *offset = NULL_RTX;
14552 return false;
14555 /* Types for scheduling fusion. */
14556 enum sched_fusion_type
14558 SCHED_FUSION_NONE = 0,
14559 SCHED_FUSION_LD_SIGN_EXTEND,
14560 SCHED_FUSION_LD_ZERO_EXTEND,
14561 SCHED_FUSION_LD,
14562 SCHED_FUSION_ST,
14563 SCHED_FUSION_NUM
14566 /* If INSN is a load or store whose address is in the form of [base+offset],
14567 extract the two parts into BASE and OFFSET. Return the scheduling fusion
14568 type of INSN. */
14570 static enum sched_fusion_type
14571 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14573 rtx x, dest, src;
14574 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14576 gcc_assert (INSN_P (insn));
14577 x = PATTERN (insn);
14578 if (GET_CODE (x) != SET)
14579 return SCHED_FUSION_NONE;
14581 src = SET_SRC (x);
14582 dest = SET_DEST (x);
14584 machine_mode dest_mode = GET_MODE (dest);
14586 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14587 return SCHED_FUSION_NONE;
14589 if (GET_CODE (src) == SIGN_EXTEND)
14591 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14592 src = XEXP (src, 0);
14593 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14594 return SCHED_FUSION_NONE;
14596 else if (GET_CODE (src) == ZERO_EXTEND)
14598 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14599 src = XEXP (src, 0);
14600 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14601 return SCHED_FUSION_NONE;
14604 if (GET_CODE (src) == MEM && REG_P (dest))
14605 extract_base_offset_in_addr (src, base, offset);
14606 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14608 fusion = SCHED_FUSION_ST;
14609 extract_base_offset_in_addr (dest, base, offset);
14611 else
14612 return SCHED_FUSION_NONE;
14614 if (*base == NULL_RTX || *offset == NULL_RTX)
14615 fusion = SCHED_FUSION_NONE;
14617 return fusion;
14620 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14622 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14623 and PRI are only calculated for these instructions. For other instructions,
14624 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
14625 other instruction types can be added by returning different priorities.
14627 It's important that irrelevant instructions get the largest FUSION_PRI. */
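/* For instance, the loads "ldr w0, [x1, 4]" and "ldr w2, [x1, 8]" receive
   the same FUSION_PRI (same fusion type and base register), while the load
   with the smaller offset receives the larger PRI and is scheduled first,
   so the two can later be paired into an ldp. */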
14629 static void
14630 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14631 int *fusion_pri, int *pri)
14633 int tmp, off_val;
14634 rtx base, offset;
14635 enum sched_fusion_type fusion;
14637 gcc_assert (INSN_P (insn));
14639 tmp = max_pri - 1;
14640 fusion = fusion_load_store (insn, &base, &offset);
14641 if (fusion == SCHED_FUSION_NONE)
14643 *pri = tmp;
14644 *fusion_pri = tmp;
14645 return;
14648 /* Set FUSION_PRI according to fusion type and base register. */
14649 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14651 /* Calculate PRI. */
14652 tmp /= 2;
14654 /* INSN with smaller offset goes first. */
14655 off_val = (int)(INTVAL (offset));
14656 if (off_val >= 0)
14657 tmp -= (off_val & 0xfffff);
14658 else
14659 tmp += ((- off_val) & 0xfffff);
14661 *pri = tmp;
14662 return;
14665 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14666 Adjust priority of sha1h instructions so they are scheduled before
14667 other SHA1 instructions. */
14669 static int
14670 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14672 rtx x = PATTERN (insn);
14674 if (GET_CODE (x) == SET)
14676 x = SET_SRC (x);
14678 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14679 return priority + 10;
14682 return priority;
14685 /* Given OPERANDS of consecutive load/store, check if we can merge
14686 them into ldp/stp. LOAD is true if they are load instructions.
14687 MODE is the mode of memory operands. */
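/* For instance, "ldr w0, [x2]" followed by "ldr w1, [x2, 4]" would normally
   satisfy these checks (same base, consecutive SImode offsets, distinct
   destination registers of the same class, no address clobber) and could be
   merged into "ldp w0, w1, [x2]". */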
14689 bool
14690 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14691 machine_mode mode)
14693 HOST_WIDE_INT offval_1, offval_2, msize;
14694 enum reg_class rclass_1, rclass_2;
14695 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14697 if (load)
14699 mem_1 = operands[1];
14700 mem_2 = operands[3];
14701 reg_1 = operands[0];
14702 reg_2 = operands[2];
14703 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14704 if (REGNO (reg_1) == REGNO (reg_2))
14705 return false;
14707 else
14709 mem_1 = operands[0];
14710 mem_2 = operands[2];
14711 reg_1 = operands[1];
14712 reg_2 = operands[3];
14715 /* The mems cannot be volatile. */
14716 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14717 return false;
14719 /* If we have SImode and slow unaligned ldp,
14720 check that the alignment is at least 8 bytes. */
14721 if (mode == SImode
14722 && (aarch64_tune_params.extra_tuning_flags
14723 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14724 && !optimize_size
14725 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14726 return false;
14728 /* Check if the addresses are in the form of [base+offset]. */
14729 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14730 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14731 return false;
14732 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14733 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14734 return false;
14736 /* Check if the bases are same. */
14737 if (!rtx_equal_p (base_1, base_2))
14738 return false;
14740 offval_1 = INTVAL (offset_1);
14741 offval_2 = INTVAL (offset_2);
14742 msize = GET_MODE_SIZE (mode);
14743 /* Check if the offsets are consecutive. */
14744 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14745 return false;
14747 /* Check if the addresses are clobbered by load. */
14748 if (load)
14750 if (reg_mentioned_p (reg_1, mem_1))
14751 return false;
14753 /* In increasing order, the last load can clobber the address. */
14754 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14755 return false;
14758 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14759 rclass_1 = FP_REGS;
14760 else
14761 rclass_1 = GENERAL_REGS;
14763 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14764 rclass_2 = FP_REGS;
14765 else
14766 rclass_2 = GENERAL_REGS;
14768 /* Check if the registers are of same class. */
14769 if (rclass_1 != rclass_2)
14770 return false;
14772 return true;
14775 /* Given OPERANDS of consecutive load/store, check if we can merge
14776 them into ldp/stp by adjusting the offset. LOAD is true if they
14777 are load instructions. MODE is the mode of memory operands.
14779 Given the following consecutive stores:
14781 str w1, [xb, 0x100]
14782 str w1, [xb, 0x104]
14783 str w1, [xb, 0x108]
14784 str w1, [xb, 0x10c]
14786 Though the offsets are out of the range supported by stp, we can
14787 still pair them after adjusting the offset, like:
14789 add scratch, xb, 0x100
14790 stp w1, w1, [scratch]
14791 stp w1, w1, [scratch, 0x8]
14793 The peephole patterns detecting this opportunity should guarantee
14794 the scratch register is available. */
14796 bool
14797 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14798 machine_mode mode)
14800 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14801 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14802 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14803 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14805 if (load)
14807 reg_1 = operands[0];
14808 mem_1 = operands[1];
14809 reg_2 = operands[2];
14810 mem_2 = operands[3];
14811 reg_3 = operands[4];
14812 mem_3 = operands[5];
14813 reg_4 = operands[6];
14814 mem_4 = operands[7];
14815 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14816 && REG_P (reg_3) && REG_P (reg_4));
14817 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14818 return false;
14820 else
14822 mem_1 = operands[0];
14823 reg_1 = operands[1];
14824 mem_2 = operands[2];
14825 reg_2 = operands[3];
14826 mem_3 = operands[4];
14827 reg_3 = operands[5];
14828 mem_4 = operands[6];
14829 reg_4 = operands[7];
14831 /* Skip if the memory operand is by itself valid for ldp/stp. */
14832 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14833 return false;
14835 /* The mems cannot be volatile. */
14836 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14837 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14838 return false;
14840 /* Check if the addresses are in the form of [base+offset]. */
14841 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14842 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14843 return false;
14844 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14845 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14846 return false;
14847 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14848 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14849 return false;
14850 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14851 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14852 return false;
14854 /* Check if the bases are same. */
14855 if (!rtx_equal_p (base_1, base_2)
14856 || !rtx_equal_p (base_2, base_3)
14857 || !rtx_equal_p (base_3, base_4))
14858 return false;
14860 offval_1 = INTVAL (offset_1);
14861 offval_2 = INTVAL (offset_2);
14862 offval_3 = INTVAL (offset_3);
14863 offval_4 = INTVAL (offset_4);
14864 msize = GET_MODE_SIZE (mode);
14865 /* Check if the offsets are consecutive. */
14866 if ((offval_1 != (offval_2 + msize)
14867 || offval_1 != (offval_3 + msize * 2)
14868 || offval_1 != (offval_4 + msize * 3))
14869 && (offval_4 != (offval_3 + msize)
14870 || offval_4 != (offval_2 + msize * 2)
14871 || offval_4 != (offval_1 + msize * 3)))
14872 return false;
14874 /* Check if the addresses are clobbered by load. */
14875 if (load)
14877 if (reg_mentioned_p (reg_1, mem_1)
14878 || reg_mentioned_p (reg_2, mem_2)
14879 || reg_mentioned_p (reg_3, mem_3))
14880 return false;
14882 /* In increasing order, the last load can clobber the address. */
14883 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14884 return false;
14887 /* If we have SImode and slow unaligned ldp,
14888 check that the alignment is at least 8 bytes. */
14889 if (mode == SImode
14890 && (aarch64_tune_params.extra_tuning_flags
14891 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14892 && !optimize_size
14893 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14894 return false;
14896 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14897 rclass_1 = FP_REGS;
14898 else
14899 rclass_1 = GENERAL_REGS;
14901 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14902 rclass_2 = FP_REGS;
14903 else
14904 rclass_2 = GENERAL_REGS;
14906 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14907 rclass_3 = FP_REGS;
14908 else
14909 rclass_3 = GENERAL_REGS;
14911 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14912 rclass_4 = FP_REGS;
14913 else
14914 rclass_4 = GENERAL_REGS;
14916 /* Check if the registers are of same class. */
14917 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14918 return false;
14920 return true;
14923 /* Given OPERANDS of consecutive load/store, this function pairs them
14924 into ldp/stp after adjusting the offset. It depends on the fact
14925 that addresses of load/store instructions are in increasing order.
14926 MODE is the mode of memory operands. CODE is the rtl operator
14927 which should be applied to all memory operands; it is one of SIGN_EXTEND,
14928 ZERO_EXTEND or UNKNOWN. */
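/* Working through the str example given before
   aarch64_operands_adjust_ok_for_ldpstp with SImode operands: msize is 4,
   so the stp offset limit is 4 * 0x40 = 0x100; the first offset 0x100
   splits into adj_off = 0x100 and new_off = 0, giving the
   "add scratch, xb, 0x100" instruction and stp stores at [scratch] and
   [scratch, 8]. */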
14930 bool
14931 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14932 machine_mode mode, RTX_CODE code)
14934 rtx base, offset, t1, t2;
14935 rtx mem_1, mem_2, mem_3, mem_4;
14936 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14938 if (load)
14940 mem_1 = operands[1];
14941 mem_2 = operands[3];
14942 mem_3 = operands[5];
14943 mem_4 = operands[7];
14945 else
14947 mem_1 = operands[0];
14948 mem_2 = operands[2];
14949 mem_3 = operands[4];
14950 mem_4 = operands[6];
14951 gcc_assert (code == UNKNOWN);
14954 extract_base_offset_in_addr (mem_1, &base, &offset);
14955 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14957 /* Adjust the offset so that it fits in an ldp/stp instruction. */
14958 msize = GET_MODE_SIZE (mode);
14959 stp_off_limit = msize * 0x40;
14960 off_val = INTVAL (offset);
14961 abs_off = (off_val < 0) ? -off_val : off_val;
14962 new_off = abs_off % stp_off_limit;
14963 adj_off = abs_off - new_off;
14965 /* Further adjust to make sure all offsets are OK. */
14966 if ((new_off + msize * 2) >= stp_off_limit)
14968 adj_off += stp_off_limit;
14969 new_off -= stp_off_limit;
14972 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14973 if (adj_off >= 0x1000)
14974 return false;
14976 if (off_val < 0)
14978 adj_off = -adj_off;
14979 new_off = -new_off;
14982 /* Create new memory references. */
14983 mem_1 = change_address (mem_1, VOIDmode,
14984 plus_constant (DImode, operands[8], new_off));
14986 /* Check if the adjusted address is OK for ldp/stp. */
14987 if (!aarch64_mem_pair_operand (mem_1, mode))
14988 return false;
14990 msize = GET_MODE_SIZE (mode);
14991 mem_2 = change_address (mem_2, VOIDmode,
14992 plus_constant (DImode,
14993 operands[8],
14994 new_off + msize));
14995 mem_3 = change_address (mem_3, VOIDmode,
14996 plus_constant (DImode,
14997 operands[8],
14998 new_off + msize * 2));
14999 mem_4 = change_address (mem_4, VOIDmode,
15000 plus_constant (DImode,
15001 operands[8],
15002 new_off + msize * 3));
15004 if (code == ZERO_EXTEND)
15006 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15007 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15008 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15009 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15011 else if (code == SIGN_EXTEND)
15013 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15014 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15015 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15016 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15019 if (load)
15021 operands[1] = mem_1;
15022 operands[3] = mem_2;
15023 operands[5] = mem_3;
15024 operands[7] = mem_4;
15026 else
15028 operands[0] = mem_1;
15029 operands[2] = mem_2;
15030 operands[4] = mem_3;
15031 operands[6] = mem_4;
15034 /* Emit adjusting instruction. */
15035 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15036 /* Emit ldp/stp instructions. */
15037 t1 = gen_rtx_SET (operands[0], operands[1]);
15038 t2 = gen_rtx_SET (operands[2], operands[3]);
15039 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15040 t1 = gen_rtx_SET (operands[4], operands[5]);
15041 t2 = gen_rtx_SET (operands[6], operands[7]);
15042 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15043 return true;
15046 /* Return true if a pseudo register should be created and used to hold
15047 the GOT address for PIC code. */
15049 bool
15050 aarch64_use_pseudo_pic_reg (void)
15052 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15055 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15057 static int
15058 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15060 switch (XINT (x, 1))
15062 case UNSPEC_GOTSMALLPIC:
15063 case UNSPEC_GOTSMALLPIC28K:
15064 case UNSPEC_GOTTINYPIC:
15065 return 0;
15066 default:
15067 break;
15070 return default_unspec_may_trap_p (x, flags);
15074 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15075 return the log2 of that value. Otherwise return -1. */
15078 aarch64_fpconst_pow_of_2 (rtx x)
15080 const REAL_VALUE_TYPE *r;
15082 if (!CONST_DOUBLE_P (x))
15083 return -1;
15085 r = CONST_DOUBLE_REAL_VALUE (x);
15087 if (REAL_VALUE_NEGATIVE (*r)
15088 || REAL_VALUE_ISNAN (*r)
15089 || REAL_VALUE_ISINF (*r)
15090 || !real_isinteger (r, DFmode))
15091 return -1;
15093 return exact_log2 (real_to_integer (r));
15096 /* If X is a vector of equal CONST_DOUBLE values and that value is
15097 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15100 aarch64_vec_fpconst_pow_of_2 (rtx x)
15102 if (GET_CODE (x) != CONST_VECTOR)
15103 return -1;
15105 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15106 return -1;
15108 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15109 if (firstval <= 0)
15110 return -1;
15112 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15113 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15114 return -1;
15116 return firstval;
15119 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15120 to float.
15122 __fp16 always promotes through this hook.
15123 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15124 through the generic excess precision logic rather than here. */
15126 static tree
15127 aarch64_promoted_type (const_tree t)
15129 if (SCALAR_FLOAT_TYPE_P (t)
15130 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15131 return float_type_node;
15133 return NULL_TREE;
15136 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15138 static bool
15139 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15140 optimization_type opt_type)
15142 switch (op)
15144 case rsqrt_optab:
15145 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15147 default:
15148 return true;
15152 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15153 if MODE is HFmode, and punt to the generic implementation otherwise. */
15155 static bool
15156 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15158 return (mode == HFmode
15159 ? true
15160 : default_libgcc_floating_mode_supported_p (mode));
15163 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15164 if MODE is HFmode, and punt to the generic implementation otherwise. */
15166 static bool
15167 aarch64_scalar_mode_supported_p (scalar_mode mode)
15169 return (mode == HFmode
15170 ? true
15171 : default_scalar_mode_supported_p (mode));
15174 /* Set the value of FLT_EVAL_METHOD.
15175 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15177 0: evaluate all operations and constants, whose semantic type has at
15178 most the range and precision of type float, to the range and
15179 precision of float; evaluate all other operations and constants to
15180 the range and precision of the semantic type;
15182 N, where _FloatN is a supported interchange floating type:
15183 evaluate all operations and constants, whose semantic type has at
15184 most the range and precision of _FloatN type, to the range and
15185 precision of the _FloatN type; evaluate all other operations and
15186 constants to the range and precision of the semantic type;
15188 If we have the ARMv8.2-A extensions then we support _Float16 in native
15189 precision, so we should set this to 16. Otherwise, we support the type,
15190 but want to evaluate expressions in float precision, so set this to
15191 0. */
15193 static enum flt_eval_method
15194 aarch64_excess_precision (enum excess_precision_type type)
15196 switch (type)
15198 case EXCESS_PRECISION_TYPE_FAST:
15199 case EXCESS_PRECISION_TYPE_STANDARD:
15200 /* We can calculate either in 16-bit range and precision or
15201 32-bit range and precision. Make that decision based on whether
15202 we have native support for the ARMv8.2-A 16-bit floating-point
15203 instructions or not. */
15204 return (TARGET_FP_F16INST
15205 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15206 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15207 case EXCESS_PRECISION_TYPE_IMPLICIT:
15208 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15209 default:
15210 gcc_unreachable ();
15212 return FLT_EVAL_METHOD_UNPREDICTABLE;
15215 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15216 scheduled for speculative execution. Reject the long-running division
15217 and square-root instructions. */
15219 static bool
15220 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15222 switch (get_attr_type (insn))
15224 case TYPE_SDIV:
15225 case TYPE_UDIV:
15226 case TYPE_FDIVS:
15227 case TYPE_FDIVD:
15228 case TYPE_FSQRTS:
15229 case TYPE_FSQRTD:
15230 case TYPE_NEON_FP_SQRT_S:
15231 case TYPE_NEON_FP_SQRT_D:
15232 case TYPE_NEON_FP_SQRT_S_Q:
15233 case TYPE_NEON_FP_SQRT_D_Q:
15234 case TYPE_NEON_FP_DIV_S:
15235 case TYPE_NEON_FP_DIV_D:
15236 case TYPE_NEON_FP_DIV_S_Q:
15237 case TYPE_NEON_FP_DIV_D_Q:
15238 return false;
15239 default:
15240 return true;
15244 /* Target-specific selftests. */
15246 #if CHECKING_P
15248 namespace selftest {
15250 /* Selftest for the RTL loader.
15251 Verify that the RTL loader copes with a dump from
15252 print_rtx_function. This is essentially just a test that class
15253 function_reader can handle a real dump, but it also verifies
15254 that lookup_reg_by_dump_name correctly handles hard regs.
15255 The presence of hard reg names in the dump means that the test is
15256 target-specific, hence it is in this file. */
15258 static void
15259 aarch64_test_loading_full_dump ()
15261 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15263 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15265 rtx_insn *insn_1 = get_insn_by_uid (1);
15266 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15268 rtx_insn *insn_15 = get_insn_by_uid (15);
15269 ASSERT_EQ (INSN, GET_CODE (insn_15));
15270 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15272 /* Verify crtl->return_rtx. */
15273 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15274 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15275 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15278 /* Run all target-specific selftests. */
15280 static void
15281 aarch64_run_selftests (void)
15283 aarch64_test_loading_full_dump ();
15286 } // namespace selftest
15288 #endif /* #if CHECKING_P */
15290 #undef TARGET_ADDRESS_COST
15291 #define TARGET_ADDRESS_COST aarch64_address_cost
15293 /* This hook determines whether unnamed bitfields affect the alignment
15294 of the containing structure. The hook returns true if the structure
15295 should inherit the alignment requirements of an unnamed bitfield's
15296 type. */
15297 #undef TARGET_ALIGN_ANON_BITFIELD
15298 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15300 #undef TARGET_ASM_ALIGNED_DI_OP
15301 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15303 #undef TARGET_ASM_ALIGNED_HI_OP
15304 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15306 #undef TARGET_ASM_ALIGNED_SI_OP
15307 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15309 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15310 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15311 hook_bool_const_tree_hwi_hwi_const_tree_true
15313 #undef TARGET_ASM_FILE_START
15314 #define TARGET_ASM_FILE_START aarch64_start_file
15316 #undef TARGET_ASM_OUTPUT_MI_THUNK
15317 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15319 #undef TARGET_ASM_SELECT_RTX_SECTION
15320 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15322 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15323 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15325 #undef TARGET_BUILD_BUILTIN_VA_LIST
15326 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15328 #undef TARGET_CALLEE_COPIES
15329 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15331 #undef TARGET_CAN_ELIMINATE
15332 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15334 #undef TARGET_CAN_INLINE_P
15335 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15337 #undef TARGET_CANNOT_FORCE_CONST_MEM
15338 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15340 #undef TARGET_CASE_VALUES_THRESHOLD
15341 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15343 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15344 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15346 /* Only the least significant bit is used for initialization guard
15347 variables. */
15348 #undef TARGET_CXX_GUARD_MASK_BIT
15349 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15351 #undef TARGET_C_MODE_FOR_SUFFIX
15352 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15354 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15355 #undef TARGET_DEFAULT_TARGET_FLAGS
15356 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15357 #endif
15359 #undef TARGET_CLASS_MAX_NREGS
15360 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15362 #undef TARGET_BUILTIN_DECL
15363 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15365 #undef TARGET_BUILTIN_RECIPROCAL
15366 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15368 #undef TARGET_C_EXCESS_PRECISION
15369 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15371 #undef TARGET_EXPAND_BUILTIN
15372 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15374 #undef TARGET_EXPAND_BUILTIN_VA_START
15375 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15377 #undef TARGET_FOLD_BUILTIN
15378 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15380 #undef TARGET_FUNCTION_ARG
15381 #define TARGET_FUNCTION_ARG aarch64_function_arg
15383 #undef TARGET_FUNCTION_ARG_ADVANCE
15384 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15386 #undef TARGET_FUNCTION_ARG_BOUNDARY
15387 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15389 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15390 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15392 #undef TARGET_FUNCTION_VALUE
15393 #define TARGET_FUNCTION_VALUE aarch64_function_value
15395 #undef TARGET_FUNCTION_VALUE_REGNO_P
15396 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15398 #undef TARGET_FRAME_POINTER_REQUIRED
15399 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15401 #undef TARGET_GIMPLE_FOLD_BUILTIN
15402 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15404 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15405 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15407 #undef TARGET_INIT_BUILTINS
15408 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15410 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15411 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15412 aarch64_ira_change_pseudo_allocno_class
15414 #undef TARGET_LEGITIMATE_ADDRESS_P
15415 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15417 #undef TARGET_LEGITIMATE_CONSTANT_P
15418 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15420 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15421 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15422 aarch64_legitimize_address_displacement
15424 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15425 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15427 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15428 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15429 aarch64_libgcc_floating_mode_supported_p
15431 #undef TARGET_MANGLE_TYPE
15432 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15434 #undef TARGET_MEMORY_MOVE_COST
15435 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15437 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15438 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15440 #undef TARGET_MUST_PASS_IN_STACK
15441 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15443 /* This target hook should return true if accesses to volatile bitfields
15444 should use the narrowest mode possible. It should return false if these
15445 accesses should use the bitfield container type. */
15446 #undef TARGET_NARROW_VOLATILE_BITFIELD
15447 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15449 #undef TARGET_OPTION_OVERRIDE
15450 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15452 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15453 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15454 aarch64_override_options_after_change
15456 #undef TARGET_OPTION_SAVE
15457 #define TARGET_OPTION_SAVE aarch64_option_save
15459 #undef TARGET_OPTION_RESTORE
15460 #define TARGET_OPTION_RESTORE aarch64_option_restore
15462 #undef TARGET_OPTION_PRINT
15463 #define TARGET_OPTION_PRINT aarch64_option_print
15465 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15466 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15468 #undef TARGET_SET_CURRENT_FUNCTION
15469 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15471 #undef TARGET_PASS_BY_REFERENCE
15472 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15474 #undef TARGET_PREFERRED_RELOAD_CLASS
15475 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15477 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15478 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15480 #undef TARGET_PROMOTED_TYPE
15481 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15483 #undef TARGET_SECONDARY_RELOAD
15484 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15486 #undef TARGET_SHIFT_TRUNCATION_MASK
15487 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15489 #undef TARGET_SETUP_INCOMING_VARARGS
15490 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15492 #undef TARGET_STRUCT_VALUE_RTX
15493 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15495 #undef TARGET_REGISTER_MOVE_COST
15496 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15498 #undef TARGET_RETURN_IN_MEMORY
15499 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15501 #undef TARGET_RETURN_IN_MSB
15502 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15504 #undef TARGET_RTX_COSTS
15505 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15507 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15508 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15510 #undef TARGET_SCHED_ISSUE_RATE
15511 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15513 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15514 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15515 aarch64_sched_first_cycle_multipass_dfa_lookahead
15517 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15518 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15519 aarch64_first_cycle_multipass_dfa_lookahead_guard
15521 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15522 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15523 aarch64_get_separate_components
15525 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15526 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15527 aarch64_components_for_bb
15529 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15530 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15531 aarch64_disqualify_components
15533 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15534 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15535 aarch64_emit_prologue_components
15537 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15538 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15539 aarch64_emit_epilogue_components
15541 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15542 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15543 aarch64_set_handled_components
15545 #undef TARGET_TRAMPOLINE_INIT
15546 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15548 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15549 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15551 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15552 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15554 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15555 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15556 aarch64_builtin_support_vector_misalignment
15558 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15559 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15561 #undef TARGET_VECTORIZE_ADD_STMT_COST
15562 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15564 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15565 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15566 aarch64_builtin_vectorization_cost
15568 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15569 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15571 #undef TARGET_VECTORIZE_BUILTINS
15572 #define TARGET_VECTORIZE_BUILTINS
15574 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15575 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15576 aarch64_builtin_vectorized_function
15578 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15579 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15580 aarch64_autovectorize_vector_sizes
15582 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15583 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15584 aarch64_atomic_assign_expand_fenv
15586 /* Section anchor support. */
15588 #undef TARGET_MIN_ANCHOR_OFFSET
15589 #define TARGET_MIN_ANCHOR_OFFSET -256
15591 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15592 byte offset; we can do much more for larger data types, but have no way
15593 to determine the size of the access. We assume accesses are aligned. */
15594 #undef TARGET_MAX_ANCHOR_OFFSET
15595 #define TARGET_MAX_ANCHOR_OFFSET 4095
15597 #undef TARGET_VECTOR_ALIGNMENT
15598 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15600 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15601 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15602 aarch64_simd_vector_alignment_reachable
15604 /* vec_perm support. */
15606 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15607 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15608 aarch64_vectorize_vec_perm_const_ok
15610 #undef TARGET_INIT_LIBFUNCS
15611 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15613 #undef TARGET_FIXED_CONDITION_CODE_REGS
15614 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15616 #undef TARGET_FLAGS_REGNUM
15617 #define TARGET_FLAGS_REGNUM CC_REGNUM
15619 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15620 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15622 #undef TARGET_ASAN_SHADOW_OFFSET
15623 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15625 #undef TARGET_LEGITIMIZE_ADDRESS
15626 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15628 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15629 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15630 aarch64_use_by_pieces_infrastructure_p
15632 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15633 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15635 #undef TARGET_CAN_USE_DOLOOP_P
15636 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15638 #undef TARGET_SCHED_ADJUST_PRIORITY
15639 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15641 #undef TARGET_SCHED_MACRO_FUSION_P
15642 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15644 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15645 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15647 #undef TARGET_SCHED_FUSION_PRIORITY
15648 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15650 #undef TARGET_UNSPEC_MAY_TRAP_P
15651 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15653 #undef TARGET_USE_PSEUDO_PIC_REG
15654 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15656 #undef TARGET_PRINT_OPERAND
15657 #define TARGET_PRINT_OPERAND aarch64_print_operand
15659 #undef TARGET_PRINT_OPERAND_ADDRESS
15660 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15662 #undef TARGET_OPTAB_SUPPORTED_P
15663 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15665 #undef TARGET_OMIT_STRUCT_RETURN_REG
15666 #define TARGET_OMIT_STRUCT_RETURN_REG true
15668 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15669 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15670 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15672 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15673 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15674 aarch64_hard_regno_call_part_clobbered
15676 #if CHECKING_P
15677 #undef TARGET_RUN_TARGET_SELFTESTS
15678 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15679 #endif /* #if CHECKING_P */
15681 struct gcc_target targetm = TARGET_INITIALIZER;
15683 #include "gt-aarch64.h"