Turn FUNCTION_ARG_PADDING into a target hook
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob65a8df1a1ff47952a2047d78957e445b4f18b7df
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespectively of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
1070 /* Implement HARD_REGNO_NREGS. */
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
1086 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return true;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return
1110 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111 else
1112 return true;
1115 return false;
1118 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1119 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1120 clobbers the top 64 bits when restoring the bottom 64 bits. */
1122 static bool
1123 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1125 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1128 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1129 machine_mode
1130 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1131 machine_mode mode)
1133 /* Handle modes that fit within single registers. */
1134 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1136 if (GET_MODE_SIZE (mode) >= 4)
1137 return mode;
1138 else
1139 return SImode;
1141 /* Fall back to generic for multi-reg and very large modes. */
1142 else
1143 return choose_hard_reg_mode (regno, nregs, false);
1146 /* Return true if calls to DECL should be treated as
1147 long-calls (ie called via a register). */
1148 static bool
1149 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1151 return false;
1154 /* Return true if calls to symbol-ref SYM should be treated as
1155 long-calls (ie called via a register). */
1156 bool
1157 aarch64_is_long_call_p (rtx sym)
1159 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1162 /* Return true if calls to symbol-ref SYM should not go through
1163 plt stubs. */
1165 bool
1166 aarch64_is_noplt_call_p (rtx sym)
1168 const_tree decl = SYMBOL_REF_DECL (sym);
1170 if (flag_pic
1171 && decl
1172 && (!flag_plt
1173 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1174 && !targetm.binds_local_p (decl))
1175 return true;
1177 return false;
1180 /* Return true if the offsets to a zero/sign-extract operation
1181 represent an expression that matches an extend operation. The
1182 operands represent the paramters from
1184 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1185 bool
1186 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1187 rtx extract_imm)
1189 HOST_WIDE_INT mult_val, extract_val;
1191 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1192 return false;
1194 mult_val = INTVAL (mult_imm);
1195 extract_val = INTVAL (extract_imm);
1197 if (extract_val > 8
1198 && extract_val < GET_MODE_BITSIZE (mode)
1199 && exact_log2 (extract_val & ~7) > 0
1200 && (extract_val & 7) <= 4
1201 && mult_val == (1 << (extract_val & 7)))
1202 return true;
1204 return false;
1207 /* Emit an insn that's a simple single-set. Both the operands must be
1208 known to be valid. */
1209 inline static rtx_insn *
1210 emit_set_insn (rtx x, rtx y)
1212 return emit_insn (gen_rtx_SET (x, y));
1215 /* X and Y are two things to compare using CODE. Emit the compare insn and
1216 return the rtx for register 0 in the proper mode. */
1218 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1220 machine_mode mode = SELECT_CC_MODE (code, x, y);
1221 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1223 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1224 return cc_reg;
1227 /* Build the SYMBOL_REF for __tls_get_addr. */
1229 static GTY(()) rtx tls_get_addr_libfunc;
1232 aarch64_tls_get_addr (void)
1234 if (!tls_get_addr_libfunc)
1235 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1236 return tls_get_addr_libfunc;
1239 /* Return the TLS model to use for ADDR. */
1241 static enum tls_model
1242 tls_symbolic_operand_type (rtx addr)
1244 enum tls_model tls_kind = TLS_MODEL_NONE;
1245 rtx sym, addend;
1247 if (GET_CODE (addr) == CONST)
1249 split_const (addr, &sym, &addend);
1250 if (GET_CODE (sym) == SYMBOL_REF)
1251 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1253 else if (GET_CODE (addr) == SYMBOL_REF)
1254 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1256 return tls_kind;
1259 /* We'll allow lo_sum's in addresses in our legitimate addresses
1260 so that combine would take care of combining addresses where
1261 necessary, but for generation purposes, we'll generate the address
1262 as :
1263 RTL Absolute
1264 tmp = hi (symbol_ref); adrp x1, foo
1265 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1268 PIC TLS
1269 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1270 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1271 bl __tls_get_addr
1274 Load TLS symbol, depending on TLS mechanism and TLS access model.
1276 Global Dynamic - Traditional TLS:
1277 adrp tmp, :tlsgd:imm
1278 add dest, tmp, #:tlsgd_lo12:imm
1279 bl __tls_get_addr
1281 Global Dynamic - TLS Descriptors:
1282 adrp dest, :tlsdesc:imm
1283 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1284 add dest, dest, #:tlsdesc_lo12:imm
1285 blr tmp
1286 mrs tp, tpidr_el0
1287 add dest, dest, tp
1289 Initial Exec:
1290 mrs tp, tpidr_el0
1291 adrp tmp, :gottprel:imm
1292 ldr dest, [tmp, #:gottprel_lo12:imm]
1293 add dest, dest, tp
1295 Local Exec:
1296 mrs tp, tpidr_el0
1297 add t0, tp, #:tprel_hi12:imm, lsl #12
1298 add t0, t0, #:tprel_lo12_nc:imm
1301 static void
1302 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1303 enum aarch64_symbol_type type)
1305 switch (type)
1307 case SYMBOL_SMALL_ABSOLUTE:
1309 /* In ILP32, the mode of dest can be either SImode or DImode. */
1310 rtx tmp_reg = dest;
1311 machine_mode mode = GET_MODE (dest);
1313 gcc_assert (mode == Pmode || mode == ptr_mode);
1315 if (can_create_pseudo_p ())
1316 tmp_reg = gen_reg_rtx (mode);
1318 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1319 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1320 return;
1323 case SYMBOL_TINY_ABSOLUTE:
1324 emit_insn (gen_rtx_SET (dest, imm));
1325 return;
1327 case SYMBOL_SMALL_GOT_28K:
1329 machine_mode mode = GET_MODE (dest);
1330 rtx gp_rtx = pic_offset_table_rtx;
1331 rtx insn;
1332 rtx mem;
1334 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1335 here before rtl expand. Tree IVOPT will generate rtl pattern to
1336 decide rtx costs, in which case pic_offset_table_rtx is not
1337 initialized. For that case no need to generate the first adrp
1338 instruction as the final cost for global variable access is
1339 one instruction. */
1340 if (gp_rtx != NULL)
1342 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1343 using the page base as GOT base, the first page may be wasted,
1344 in the worst scenario, there is only 28K space for GOT).
1346 The generate instruction sequence for accessing global variable
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1351 Only one instruction needed. But we must initialize
1352 pic_offset_table_rtx properly. We generate initialize insn for
1353 every global access, and allow CSE to remove all redundant.
1355 The final instruction sequences will look like the following
1356 for multiply global variables access.
1358 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1360 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1361 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1362 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1363 ... */
1365 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1366 crtl->uses_pic_offset_table = 1;
1367 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1369 if (mode != GET_MODE (gp_rtx))
1370 gp_rtx = gen_lowpart (mode, gp_rtx);
1374 if (mode == ptr_mode)
1376 if (mode == DImode)
1377 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1378 else
1379 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1381 mem = XVECEXP (SET_SRC (insn), 0, 0);
1383 else
1385 gcc_assert (mode == Pmode);
1387 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1388 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1391 /* The operand is expected to be MEM. Whenever the related insn
1392 pattern changed, above code which calculate mem should be
1393 updated. */
1394 gcc_assert (GET_CODE (mem) == MEM);
1395 MEM_READONLY_P (mem) = 1;
1396 MEM_NOTRAP_P (mem) = 1;
1397 emit_insn (insn);
1398 return;
1401 case SYMBOL_SMALL_GOT_4G:
1403 /* In ILP32, the mode of dest can be either SImode or DImode,
1404 while the got entry is always of SImode size. The mode of
1405 dest depends on how dest is used: if dest is assigned to a
1406 pointer (e.g. in the memory), it has SImode; it may have
1407 DImode if dest is dereferenced to access the memeory.
1408 This is why we have to handle three different ldr_got_small
1409 patterns here (two patterns for ILP32). */
1411 rtx insn;
1412 rtx mem;
1413 rtx tmp_reg = dest;
1414 machine_mode mode = GET_MODE (dest);
1416 if (can_create_pseudo_p ())
1417 tmp_reg = gen_reg_rtx (mode);
1419 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1420 if (mode == ptr_mode)
1422 if (mode == DImode)
1423 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1424 else
1425 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1427 mem = XVECEXP (SET_SRC (insn), 0, 0);
1429 else
1431 gcc_assert (mode == Pmode);
1433 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1434 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1437 gcc_assert (GET_CODE (mem) == MEM);
1438 MEM_READONLY_P (mem) = 1;
1439 MEM_NOTRAP_P (mem) = 1;
1440 emit_insn (insn);
1441 return;
1444 case SYMBOL_SMALL_TLSGD:
1446 rtx_insn *insns;
1447 machine_mode mode = GET_MODE (dest);
1448 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1450 start_sequence ();
1451 if (TARGET_ILP32)
1452 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1453 else
1454 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1455 insns = get_insns ();
1456 end_sequence ();
1458 RTL_CONST_CALL_P (insns) = 1;
1459 emit_libcall_block (insns, dest, result, imm);
1460 return;
1463 case SYMBOL_SMALL_TLSDESC:
1465 machine_mode mode = GET_MODE (dest);
1466 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1467 rtx tp;
1469 gcc_assert (mode == Pmode || mode == ptr_mode);
1471 /* In ILP32, the got entry is always of SImode size. Unlike
1472 small GOT, the dest is fixed at reg 0. */
1473 if (TARGET_ILP32)
1474 emit_insn (gen_tlsdesc_small_si (imm));
1475 else
1476 emit_insn (gen_tlsdesc_small_di (imm));
1477 tp = aarch64_load_tp (NULL);
1479 if (mode != Pmode)
1480 tp = gen_lowpart (mode, tp);
1482 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1483 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1484 return;
1487 case SYMBOL_SMALL_TLSIE:
1489 /* In ILP32, the mode of dest can be either SImode or DImode,
1490 while the got entry is always of SImode size. The mode of
1491 dest depends on how dest is used: if dest is assigned to a
1492 pointer (e.g. in the memory), it has SImode; it may have
1493 DImode if dest is dereferenced to access the memeory.
1494 This is why we have to handle three different tlsie_small
1495 patterns here (two patterns for ILP32). */
1496 machine_mode mode = GET_MODE (dest);
1497 rtx tmp_reg = gen_reg_rtx (mode);
1498 rtx tp = aarch64_load_tp (NULL);
1500 if (mode == ptr_mode)
1502 if (mode == DImode)
1503 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1504 else
1506 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1507 tp = gen_lowpart (mode, tp);
1510 else
1512 gcc_assert (mode == Pmode);
1513 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1516 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 case SYMBOL_TLSLE12:
1522 case SYMBOL_TLSLE24:
1523 case SYMBOL_TLSLE32:
1524 case SYMBOL_TLSLE48:
1526 machine_mode mode = GET_MODE (dest);
1527 rtx tp = aarch64_load_tp (NULL);
1529 if (mode != Pmode)
1530 tp = gen_lowpart (mode, tp);
1532 switch (type)
1534 case SYMBOL_TLSLE12:
1535 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1536 (dest, tp, imm));
1537 break;
1538 case SYMBOL_TLSLE24:
1539 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1540 (dest, tp, imm));
1541 break;
1542 case SYMBOL_TLSLE32:
1543 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1544 (dest, imm));
1545 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1546 (dest, dest, tp));
1547 break;
1548 case SYMBOL_TLSLE48:
1549 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1550 (dest, imm));
1551 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1552 (dest, dest, tp));
1553 break;
1554 default:
1555 gcc_unreachable ();
1558 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1559 return;
1562 case SYMBOL_TINY_GOT:
1563 emit_insn (gen_ldr_got_tiny (dest, imm));
1564 return;
1566 case SYMBOL_TINY_TLSIE:
1568 machine_mode mode = GET_MODE (dest);
1569 rtx tp = aarch64_load_tp (NULL);
1571 if (mode == ptr_mode)
1573 if (mode == DImode)
1574 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1575 else
1577 tp = gen_lowpart (mode, tp);
1578 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1581 else
1583 gcc_assert (mode == Pmode);
1584 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1587 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1588 return;
1591 default:
1592 gcc_unreachable ();
1596 /* Emit a move from SRC to DEST. Assume that the move expanders can
1597 handle all moves if !can_create_pseudo_p (). The distinction is
1598 important because, unlike emit_move_insn, the move expanders know
1599 how to force Pmode objects into the constant pool even when the
1600 constant pool address is not itself legitimate. */
1601 static rtx
1602 aarch64_emit_move (rtx dest, rtx src)
1604 return (can_create_pseudo_p ()
1605 ? emit_move_insn (dest, src)
1606 : emit_move_insn_1 (dest, src));
1609 /* Split a 128-bit move operation into two 64-bit move operations,
1610 taking care to handle partial overlap of register to register
1611 copies. Special cases are needed when moving between GP regs and
1612 FP regs. SRC can be a register, constant or memory; DST a register
1613 or memory. If either operand is memory it must not have any side
1614 effects. */
1615 void
1616 aarch64_split_128bit_move (rtx dst, rtx src)
1618 rtx dst_lo, dst_hi;
1619 rtx src_lo, src_hi;
1621 machine_mode mode = GET_MODE (dst);
1623 gcc_assert (mode == TImode || mode == TFmode);
1624 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1625 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1627 if (REG_P (dst) && REG_P (src))
1629 int src_regno = REGNO (src);
1630 int dst_regno = REGNO (dst);
1632 /* Handle FP <-> GP regs. */
1633 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1635 src_lo = gen_lowpart (word_mode, src);
1636 src_hi = gen_highpart (word_mode, src);
1638 if (mode == TImode)
1640 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1641 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1643 else
1645 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1646 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1648 return;
1650 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1652 dst_lo = gen_lowpart (word_mode, dst);
1653 dst_hi = gen_highpart (word_mode, dst);
1655 if (mode == TImode)
1657 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1658 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1660 else
1662 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1663 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1665 return;
1669 dst_lo = gen_lowpart (word_mode, dst);
1670 dst_hi = gen_highpart (word_mode, dst);
1671 src_lo = gen_lowpart (word_mode, src);
1672 src_hi = gen_highpart_mode (word_mode, mode, src);
1674 /* At most one pairing may overlap. */
1675 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1677 aarch64_emit_move (dst_hi, src_hi);
1678 aarch64_emit_move (dst_lo, src_lo);
1680 else
1682 aarch64_emit_move (dst_lo, src_lo);
1683 aarch64_emit_move (dst_hi, src_hi);
1687 bool
1688 aarch64_split_128bit_move_p (rtx dst, rtx src)
1690 return (! REG_P (src)
1691 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1694 /* Split a complex SIMD combine. */
1696 void
1697 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1699 machine_mode src_mode = GET_MODE (src1);
1700 machine_mode dst_mode = GET_MODE (dst);
1702 gcc_assert (VECTOR_MODE_P (dst_mode));
1703 gcc_assert (register_operand (dst, dst_mode)
1704 && register_operand (src1, src_mode)
1705 && register_operand (src2, src_mode));
1707 rtx (*gen) (rtx, rtx, rtx);
1709 switch (src_mode)
1711 case E_V8QImode:
1712 gen = gen_aarch64_simd_combinev8qi;
1713 break;
1714 case E_V4HImode:
1715 gen = gen_aarch64_simd_combinev4hi;
1716 break;
1717 case E_V2SImode:
1718 gen = gen_aarch64_simd_combinev2si;
1719 break;
1720 case E_V4HFmode:
1721 gen = gen_aarch64_simd_combinev4hf;
1722 break;
1723 case E_V2SFmode:
1724 gen = gen_aarch64_simd_combinev2sf;
1725 break;
1726 case E_DImode:
1727 gen = gen_aarch64_simd_combinedi;
1728 break;
1729 case E_DFmode:
1730 gen = gen_aarch64_simd_combinedf;
1731 break;
1732 default:
1733 gcc_unreachable ();
1736 emit_insn (gen (dst, src1, src2));
1737 return;
1740 /* Split a complex SIMD move. */
1742 void
1743 aarch64_split_simd_move (rtx dst, rtx src)
1745 machine_mode src_mode = GET_MODE (src);
1746 machine_mode dst_mode = GET_MODE (dst);
1748 gcc_assert (VECTOR_MODE_P (dst_mode));
1750 if (REG_P (dst) && REG_P (src))
1752 rtx (*gen) (rtx, rtx);
1754 gcc_assert (VECTOR_MODE_P (src_mode));
1756 switch (src_mode)
1758 case E_V16QImode:
1759 gen = gen_aarch64_split_simd_movv16qi;
1760 break;
1761 case E_V8HImode:
1762 gen = gen_aarch64_split_simd_movv8hi;
1763 break;
1764 case E_V4SImode:
1765 gen = gen_aarch64_split_simd_movv4si;
1766 break;
1767 case E_V2DImode:
1768 gen = gen_aarch64_split_simd_movv2di;
1769 break;
1770 case E_V8HFmode:
1771 gen = gen_aarch64_split_simd_movv8hf;
1772 break;
1773 case E_V4SFmode:
1774 gen = gen_aarch64_split_simd_movv4sf;
1775 break;
1776 case E_V2DFmode:
1777 gen = gen_aarch64_split_simd_movv2df;
1778 break;
1779 default:
1780 gcc_unreachable ();
1783 emit_insn (gen (dst, src));
1784 return;
1788 bool
1789 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1790 machine_mode ymode, rtx y)
1792 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1793 gcc_assert (r != NULL);
1794 return rtx_equal_p (x, r);
1798 static rtx
1799 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1801 if (can_create_pseudo_p ())
1802 return force_reg (mode, value);
1803 else
1805 x = aarch64_emit_move (x, value);
1806 return x;
1811 static rtx
1812 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1814 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1816 rtx high;
1817 /* Load the full offset into a register. This
1818 might be improvable in the future. */
1819 high = GEN_INT (offset);
1820 offset = 0;
1821 high = aarch64_force_temporary (mode, temp, high);
1822 reg = aarch64_force_temporary (mode, temp,
1823 gen_rtx_PLUS (mode, high, reg));
1825 return plus_constant (mode, reg, offset);
1828 static int
1829 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1830 machine_mode mode)
1832 int i;
1833 unsigned HOST_WIDE_INT val, val2, mask;
1834 int one_match, zero_match;
1835 int num_insns;
1837 val = INTVAL (imm);
1839 if (aarch64_move_imm (val, mode))
1841 if (generate)
1842 emit_insn (gen_rtx_SET (dest, imm));
1843 return 1;
1846 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1847 (with XXXX non-zero). In that case check to see if the move can be done in
1848 a smaller mode. */
1849 val2 = val & 0xffffffff;
1850 if (mode == DImode
1851 && aarch64_move_imm (val2, SImode)
1852 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1854 if (generate)
1855 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1857 /* Check if we have to emit a second instruction by checking to see
1858 if any of the upper 32 bits of the original DI mode value is set. */
1859 if (val == val2)
1860 return 1;
1862 i = (val >> 48) ? 48 : 32;
1864 if (generate)
1865 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1866 GEN_INT ((val >> i) & 0xffff)));
1868 return 2;
1871 if ((val >> 32) == 0 || mode == SImode)
1873 if (generate)
1875 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1876 if (mode == SImode)
1877 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1878 GEN_INT ((val >> 16) & 0xffff)));
1879 else
1880 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1881 GEN_INT ((val >> 16) & 0xffff)));
1883 return 2;
1886 /* Remaining cases are all for DImode. */
1888 mask = 0xffff;
1889 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1890 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1891 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1892 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1894 if (zero_match != 2 && one_match != 2)
1896 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1897 For a 64-bit bitmask try whether changing 16 bits to all ones or
1898 zeroes creates a valid bitmask. To check any repeated bitmask,
1899 try using 16 bits from the other 32-bit half of val. */
1901 for (i = 0; i < 64; i += 16, mask <<= 16)
1903 val2 = val & ~mask;
1904 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1905 break;
1906 val2 = val | mask;
1907 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1908 break;
1909 val2 = val2 & ~mask;
1910 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1911 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1912 break;
1914 if (i != 64)
1916 if (generate)
1918 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1919 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1920 GEN_INT ((val >> i) & 0xffff)));
1922 return 2;
1926 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1927 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1928 otherwise skip zero bits. */
1930 num_insns = 1;
1931 mask = 0xffff;
1932 val2 = one_match > zero_match ? ~val : val;
1933 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1935 if (generate)
1936 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1937 ? (val | ~(mask << i))
1938 : (val & (mask << i)))));
1939 for (i += 16; i < 64; i += 16)
1941 if ((val2 & (mask << i)) == 0)
1942 continue;
1943 if (generate)
1944 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1945 GEN_INT ((val >> i) & 0xffff)));
1946 num_insns ++;
1949 return num_insns;
1953 void
1954 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1956 machine_mode mode = GET_MODE (dest);
1958 gcc_assert (mode == SImode || mode == DImode);
1960 /* Check on what type of symbol it is. */
1961 if (GET_CODE (imm) == SYMBOL_REF
1962 || GET_CODE (imm) == LABEL_REF
1963 || GET_CODE (imm) == CONST)
1965 rtx mem, base, offset;
1966 enum aarch64_symbol_type sty;
1968 /* If we have (const (plus symbol offset)), separate out the offset
1969 before we start classifying the symbol. */
1970 split_const (imm, &base, &offset);
1972 sty = aarch64_classify_symbol (base, offset);
1973 switch (sty)
1975 case SYMBOL_FORCE_TO_MEM:
1976 if (offset != const0_rtx
1977 && targetm.cannot_force_const_mem (mode, imm))
1979 gcc_assert (can_create_pseudo_p ());
1980 base = aarch64_force_temporary (mode, dest, base);
1981 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1982 aarch64_emit_move (dest, base);
1983 return;
1986 mem = force_const_mem (ptr_mode, imm);
1987 gcc_assert (mem);
1989 /* If we aren't generating PC relative literals, then
1990 we need to expand the literal pool access carefully.
1991 This is something that needs to be done in a number
1992 of places, so could well live as a separate function. */
1993 if (!aarch64_pcrelative_literal_loads)
1995 gcc_assert (can_create_pseudo_p ());
1996 base = gen_reg_rtx (ptr_mode);
1997 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1998 if (ptr_mode != Pmode)
1999 base = convert_memory_address (Pmode, base);
2000 mem = gen_rtx_MEM (ptr_mode, base);
2003 if (mode != ptr_mode)
2004 mem = gen_rtx_ZERO_EXTEND (mode, mem);
2006 emit_insn (gen_rtx_SET (dest, mem));
2008 return;
2010 case SYMBOL_SMALL_TLSGD:
2011 case SYMBOL_SMALL_TLSDESC:
2012 case SYMBOL_SMALL_TLSIE:
2013 case SYMBOL_SMALL_GOT_28K:
2014 case SYMBOL_SMALL_GOT_4G:
2015 case SYMBOL_TINY_GOT:
2016 case SYMBOL_TINY_TLSIE:
2017 if (offset != const0_rtx)
2019 gcc_assert(can_create_pseudo_p ());
2020 base = aarch64_force_temporary (mode, dest, base);
2021 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2022 aarch64_emit_move (dest, base);
2023 return;
2025 /* FALLTHRU */
2027 case SYMBOL_SMALL_ABSOLUTE:
2028 case SYMBOL_TINY_ABSOLUTE:
2029 case SYMBOL_TLSLE12:
2030 case SYMBOL_TLSLE24:
2031 case SYMBOL_TLSLE32:
2032 case SYMBOL_TLSLE48:
2033 aarch64_load_symref_appropriately (dest, imm, sty);
2034 return;
2036 default:
2037 gcc_unreachable ();
2041 if (!CONST_INT_P (imm))
2043 if (GET_CODE (imm) == HIGH)
2044 emit_insn (gen_rtx_SET (dest, imm));
2045 else
2047 rtx mem = force_const_mem (mode, imm);
2048 gcc_assert (mem);
2049 emit_insn (gen_rtx_SET (dest, mem));
2052 return;
2055 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2058 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2059 temporary value if necessary. FRAME_RELATED_P should be true if
2060 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2061 to the generated instructions. If SCRATCHREG is known to hold
2062 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2063 immediate again.
2065 Since this function may be used to adjust the stack pointer, we must
2066 ensure that it cannot cause transient stack deallocation (for example
2067 by first incrementing SP and then decrementing when adjusting by a
2068 large immediate). */
2070 static void
2071 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2072 HOST_WIDE_INT delta, bool frame_related_p,
2073 bool emit_move_imm)
2075 HOST_WIDE_INT mdelta = abs_hwi (delta);
2076 rtx this_rtx = gen_rtx_REG (mode, regnum);
2077 rtx_insn *insn;
2079 if (!mdelta)
2080 return;
2082 /* Single instruction adjustment. */
2083 if (aarch64_uimm12_shift (mdelta))
2085 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2086 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2087 return;
2090 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2091 Only do this if mdelta is not a 16-bit move as adjusting using a move
2092 is better. */
2093 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2095 HOST_WIDE_INT low_off = mdelta & 0xfff;
2097 low_off = delta < 0 ? -low_off : low_off;
2098 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2099 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2100 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2101 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2102 return;
2105 /* Emit a move immediate if required and an addition/subtraction. */
2106 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2107 if (emit_move_imm)
2108 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2109 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2110 : gen_add2_insn (this_rtx, scratch_rtx));
2111 if (frame_related_p)
2113 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2114 rtx adj = plus_constant (mode, this_rtx, delta);
2115 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2119 static inline void
2120 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2121 HOST_WIDE_INT delta)
2123 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2126 static inline void
2127 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2129 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2130 true, emit_move_imm);
2133 static inline void
2134 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2136 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2137 frame_related_p, true);
2140 static bool
2141 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2142 tree exp ATTRIBUTE_UNUSED)
2144 /* Currently, always true. */
2145 return true;
2148 /* Implement TARGET_PASS_BY_REFERENCE. */
2150 static bool
2151 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2152 machine_mode mode,
2153 const_tree type,
2154 bool named ATTRIBUTE_UNUSED)
2156 HOST_WIDE_INT size;
2157 machine_mode dummymode;
2158 int nregs;
2160 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2161 size = (mode == BLKmode && type)
2162 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2164 /* Aggregates are passed by reference based on their size. */
2165 if (type && AGGREGATE_TYPE_P (type))
2167 size = int_size_in_bytes (type);
2170 /* Variable sized arguments are always returned by reference. */
2171 if (size < 0)
2172 return true;
2174 /* Can this be a candidate to be passed in fp/simd register(s)? */
2175 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2176 &dummymode, &nregs,
2177 NULL))
2178 return false;
2180 /* Arguments which are variable sized or larger than 2 registers are
2181 passed by reference unless they are a homogenous floating point
2182 aggregate. */
2183 return size > 2 * UNITS_PER_WORD;
2186 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2187 static bool
2188 aarch64_return_in_msb (const_tree valtype)
2190 machine_mode dummy_mode;
2191 int dummy_int;
2193 /* Never happens in little-endian mode. */
2194 if (!BYTES_BIG_ENDIAN)
2195 return false;
2197 /* Only composite types smaller than or equal to 16 bytes can
2198 be potentially returned in registers. */
2199 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2200 || int_size_in_bytes (valtype) <= 0
2201 || int_size_in_bytes (valtype) > 16)
2202 return false;
2204 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2205 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2206 is always passed/returned in the least significant bits of fp/simd
2207 register(s). */
2208 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2209 &dummy_mode, &dummy_int, NULL))
2210 return false;
2212 return true;
2215 /* Implement TARGET_FUNCTION_VALUE.
2216 Define how to find the value returned by a function. */
2218 static rtx
2219 aarch64_function_value (const_tree type, const_tree func,
2220 bool outgoing ATTRIBUTE_UNUSED)
2222 machine_mode mode;
2223 int unsignedp;
2224 int count;
2225 machine_mode ag_mode;
2227 mode = TYPE_MODE (type);
2228 if (INTEGRAL_TYPE_P (type))
2229 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2231 if (aarch64_return_in_msb (type))
2233 HOST_WIDE_INT size = int_size_in_bytes (type);
2235 if (size % UNITS_PER_WORD != 0)
2237 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2238 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2242 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2243 &ag_mode, &count, NULL))
2245 if (!aarch64_composite_type_p (type, mode))
2247 gcc_assert (count == 1 && mode == ag_mode);
2248 return gen_rtx_REG (mode, V0_REGNUM);
2250 else
2252 int i;
2253 rtx par;
2255 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2256 for (i = 0; i < count; i++)
2258 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2259 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2260 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2261 XVECEXP (par, 0, i) = tmp;
2263 return par;
2266 else
2267 return gen_rtx_REG (mode, R0_REGNUM);
2270 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2271 Return true if REGNO is the number of a hard register in which the values
2272 of called function may come back. */
2274 static bool
2275 aarch64_function_value_regno_p (const unsigned int regno)
2277 /* Maximum of 16 bytes can be returned in the general registers. Examples
2278 of 16-byte return values are: 128-bit integers and 16-byte small
2279 structures (excluding homogeneous floating-point aggregates). */
2280 if (regno == R0_REGNUM || regno == R1_REGNUM)
2281 return true;
2283 /* Up to four fp/simd registers can return a function value, e.g. a
2284 homogeneous floating-point aggregate having four members. */
2285 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2286 return TARGET_FLOAT;
2288 return false;
2291 /* Implement TARGET_RETURN_IN_MEMORY.
2293 If the type T of the result of a function is such that
2294 void func (T arg)
2295 would require that arg be passed as a value in a register (or set of
2296 registers) according to the parameter passing rules, then the result
2297 is returned in the same registers as would be used for such an
2298 argument. */
2300 static bool
2301 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2303 HOST_WIDE_INT size;
2304 machine_mode ag_mode;
2305 int count;
2307 if (!AGGREGATE_TYPE_P (type)
2308 && TREE_CODE (type) != COMPLEX_TYPE
2309 && TREE_CODE (type) != VECTOR_TYPE)
2310 /* Simple scalar types always returned in registers. */
2311 return false;
2313 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2314 type,
2315 &ag_mode,
2316 &count,
2317 NULL))
2318 return false;
2320 /* Types larger than 2 registers returned in memory. */
2321 size = int_size_in_bytes (type);
2322 return (size < 0 || size > 2 * UNITS_PER_WORD);
2325 static bool
2326 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2327 const_tree type, int *nregs)
2329 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2330 return aarch64_vfp_is_call_or_return_candidate (mode,
2331 type,
2332 &pcum->aapcs_vfp_rmode,
2333 nregs,
2334 NULL);
2337 /* Given MODE and TYPE of a function argument, return the alignment in
2338 bits. The idea is to suppress any stronger alignment requested by
2339 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2340 This is a helper function for local use only. */
2342 static unsigned int
2343 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2345 if (!type)
2346 return GET_MODE_ALIGNMENT (mode);
2348 if (integer_zerop (TYPE_SIZE (type)))
2349 return 0;
2351 gcc_assert (TYPE_MODE (type) == mode);
2353 if (!AGGREGATE_TYPE_P (type))
2354 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2356 if (TREE_CODE (type) == ARRAY_TYPE)
2357 return TYPE_ALIGN (TREE_TYPE (type));
2359 unsigned int alignment = 0;
2360 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2361 if (TREE_CODE (field) == FIELD_DECL)
2362 alignment = std::max (alignment, DECL_ALIGN (field));
2364 return alignment;
2367 /* Layout a function argument according to the AAPCS64 rules. The rule
2368 numbers refer to the rule numbers in the AAPCS64. */
2370 static void
2371 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2372 const_tree type,
2373 bool named ATTRIBUTE_UNUSED)
2375 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2376 int ncrn, nvrn, nregs;
2377 bool allocate_ncrn, allocate_nvrn;
2378 HOST_WIDE_INT size;
2380 /* We need to do this once per argument. */
2381 if (pcum->aapcs_arg_processed)
2382 return;
2384 pcum->aapcs_arg_processed = true;
2386 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2387 size
2388 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2389 UNITS_PER_WORD);
2391 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2392 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2393 mode,
2394 type,
2395 &nregs);
2397 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2398 The following code thus handles passing by SIMD/FP registers first. */
2400 nvrn = pcum->aapcs_nvrn;
2402 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2403 and homogenous short-vector aggregates (HVA). */
2404 if (allocate_nvrn)
2406 if (!TARGET_FLOAT)
2407 aarch64_err_no_fpadvsimd (mode, "argument");
2409 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2411 pcum->aapcs_nextnvrn = nvrn + nregs;
2412 if (!aarch64_composite_type_p (type, mode))
2414 gcc_assert (nregs == 1);
2415 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2417 else
2419 rtx par;
2420 int i;
2421 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2422 for (i = 0; i < nregs; i++)
2424 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2425 V0_REGNUM + nvrn + i);
2426 tmp = gen_rtx_EXPR_LIST
2427 (VOIDmode, tmp,
2428 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2429 XVECEXP (par, 0, i) = tmp;
2431 pcum->aapcs_reg = par;
2433 return;
2435 else
2437 /* C.3 NSRN is set to 8. */
2438 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2439 goto on_stack;
2443 ncrn = pcum->aapcs_ncrn;
2444 nregs = size / UNITS_PER_WORD;
2446 /* C6 - C9. though the sign and zero extension semantics are
2447 handled elsewhere. This is the case where the argument fits
2448 entirely general registers. */
2449 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2452 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2454 /* C.8 if the argument has an alignment of 16 then the NGRN is
2455 rounded up to the next even number. */
2456 if (nregs == 2
2457 && ncrn % 2
2458 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2459 comparison is there because for > 16 * BITS_PER_UNIT
2460 alignment nregs should be > 2 and therefore it should be
2461 passed by reference rather than value. */
2462 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2464 ++ncrn;
2465 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2468 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2469 A reg is still generated for it, but the caller should be smart
2470 enough not to use it. */
2471 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2472 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2473 else
2475 rtx par;
2476 int i;
2478 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2479 for (i = 0; i < nregs; i++)
2481 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2482 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2483 GEN_INT (i * UNITS_PER_WORD));
2484 XVECEXP (par, 0, i) = tmp;
2486 pcum->aapcs_reg = par;
2489 pcum->aapcs_nextncrn = ncrn + nregs;
2490 return;
2493 /* C.11 */
2494 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2496 /* The argument is passed on stack; record the needed number of words for
2497 this argument and align the total size if necessary. */
2498 on_stack:
2499 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2501 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2502 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2503 16 / UNITS_PER_WORD);
2504 return;
2507 /* Implement TARGET_FUNCTION_ARG. */
2509 static rtx
2510 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2511 const_tree type, bool named)
2513 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2514 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2516 if (mode == VOIDmode)
2517 return NULL_RTX;
2519 aarch64_layout_arg (pcum_v, mode, type, named);
2520 return pcum->aapcs_reg;
2523 void
2524 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2525 const_tree fntype ATTRIBUTE_UNUSED,
2526 rtx libname ATTRIBUTE_UNUSED,
2527 const_tree fndecl ATTRIBUTE_UNUSED,
2528 unsigned n_named ATTRIBUTE_UNUSED)
2530 pcum->aapcs_ncrn = 0;
2531 pcum->aapcs_nvrn = 0;
2532 pcum->aapcs_nextncrn = 0;
2533 pcum->aapcs_nextnvrn = 0;
2534 pcum->pcs_variant = ARM_PCS_AAPCS64;
2535 pcum->aapcs_reg = NULL_RTX;
2536 pcum->aapcs_arg_processed = false;
2537 pcum->aapcs_stack_words = 0;
2538 pcum->aapcs_stack_size = 0;
2540 if (!TARGET_FLOAT
2541 && fndecl && TREE_PUBLIC (fndecl)
2542 && fntype && fntype != error_mark_node)
2544 const_tree type = TREE_TYPE (fntype);
2545 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2546 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2547 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2548 &mode, &nregs, NULL))
2549 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2551 return;
2554 static void
2555 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2556 machine_mode mode,
2557 const_tree type,
2558 bool named)
2560 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2561 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2563 aarch64_layout_arg (pcum_v, mode, type, named);
2564 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2565 != (pcum->aapcs_stack_words != 0));
2566 pcum->aapcs_arg_processed = false;
2567 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2568 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2569 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2570 pcum->aapcs_stack_words = 0;
2571 pcum->aapcs_reg = NULL_RTX;
2575 bool
2576 aarch64_function_arg_regno_p (unsigned regno)
2578 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2579 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2582 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2583 PARM_BOUNDARY bits of alignment, but will be given anything up
2584 to STACK_BOUNDARY bits if the type requires it. This makes sure
2585 that both before and after the layout of each argument, the Next
2586 Stacked Argument Address (NSAA) will have a minimum alignment of
2587 8 bytes. */
2589 static unsigned int
2590 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2592 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2593 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2596 /* Implement TARGET_FUNCTION_ARG_PADDING.
2598 Small aggregate types are placed in the lowest memory address.
2600 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2602 static pad_direction
2603 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2605 /* On little-endian targets, the least significant byte of every stack
2606 argument is passed at the lowest byte address of the stack slot. */
2607 if (!BYTES_BIG_ENDIAN)
2608 return PAD_UPWARD;
2610 /* Otherwise, integral, floating-point and pointer types are padded downward:
2611 the least significant byte of a stack argument is passed at the highest
2612 byte address of the stack slot. */
2613 if (type
2614 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2615 || POINTER_TYPE_P (type))
2616 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2617 return PAD_DOWNWARD;
2619 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2620 return PAD_UPWARD;
2623 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2625 It specifies padding for the last (may also be the only)
2626 element of a block move between registers and memory. If
2627 assuming the block is in the memory, padding upward means that
2628 the last element is padded after its highest significant byte,
2629 while in downward padding, the last element is padded at the
2630 its least significant byte side.
2632 Small aggregates and small complex types are always padded
2633 upwards.
2635 We don't need to worry about homogeneous floating-point or
2636 short-vector aggregates; their move is not affected by the
2637 padding direction determined here. Regardless of endianness,
2638 each element of such an aggregate is put in the least
2639 significant bits of a fp/simd register.
2641 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2642 register has useful data, and return the opposite if the most
2643 significant byte does. */
2645 bool
2646 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2647 bool first ATTRIBUTE_UNUSED)
2650 /* Small composite types are always padded upward. */
2651 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2653 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2654 : GET_MODE_SIZE (mode));
2655 if (size < 2 * UNITS_PER_WORD)
2656 return true;
2659 /* Otherwise, use the default padding. */
2660 return !BYTES_BIG_ENDIAN;
2663 static scalar_int_mode
2664 aarch64_libgcc_cmp_return_mode (void)
2666 return SImode;
2669 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2671 /* We use the 12-bit shifted immediate arithmetic instructions so values
2672 must be multiple of (1 << 12), i.e. 4096. */
2673 #define ARITH_FACTOR 4096
2675 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2676 #error Cannot use simple address calculation for stack probing
2677 #endif
2679 /* The pair of scratch registers used for stack probing. */
2680 #define PROBE_STACK_FIRST_REG 9
2681 #define PROBE_STACK_SECOND_REG 10
2683 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2684 inclusive. These are offsets from the current stack pointer. */
2686 static void
2687 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2689 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2691 /* See the same assertion on PROBE_INTERVAL above. */
2692 gcc_assert ((first % ARITH_FACTOR) == 0);
2694 /* See if we have a constant small number of probes to generate. If so,
2695 that's the easy case. */
2696 if (size <= PROBE_INTERVAL)
2698 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2700 emit_set_insn (reg1,
2701 plus_constant (Pmode,
2702 stack_pointer_rtx, -(first + base)));
2703 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2706 /* The run-time loop is made up of 8 insns in the generic case while the
2707 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2708 else if (size <= 4 * PROBE_INTERVAL)
2710 HOST_WIDE_INT i, rem;
2712 emit_set_insn (reg1,
2713 plus_constant (Pmode,
2714 stack_pointer_rtx,
2715 -(first + PROBE_INTERVAL)));
2716 emit_stack_probe (reg1);
2718 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2719 it exceeds SIZE. If only two probes are needed, this will not
2720 generate any code. Then probe at FIRST + SIZE. */
2721 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2723 emit_set_insn (reg1,
2724 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2725 emit_stack_probe (reg1);
2728 rem = size - (i - PROBE_INTERVAL);
2729 if (rem > 256)
2731 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2733 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2734 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2736 else
2737 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2740 /* Otherwise, do the same as above, but in a loop. Note that we must be
2741 extra careful with variables wrapping around because we might be at
2742 the very top (or the very bottom) of the address space and we have
2743 to be able to handle this case properly; in particular, we use an
2744 equality test for the loop condition. */
2745 else
2747 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2749 /* Step 1: round SIZE to the previous multiple of the interval. */
2751 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2754 /* Step 2: compute initial and final value of the loop counter. */
2756 /* TEST_ADDR = SP + FIRST. */
2757 emit_set_insn (reg1,
2758 plus_constant (Pmode, stack_pointer_rtx, -first));
2760 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2761 HOST_WIDE_INT adjustment = - (first + rounded_size);
2762 if (! aarch64_uimm12_shift (adjustment))
2764 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2765 true, Pmode);
2766 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2768 else
2770 emit_set_insn (reg2,
2771 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2774 /* Step 3: the loop
2778 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2779 probe at TEST_ADDR
2781 while (TEST_ADDR != LAST_ADDR)
2783 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2784 until it is equal to ROUNDED_SIZE. */
2786 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2789 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2790 that SIZE is equal to ROUNDED_SIZE. */
2792 if (size != rounded_size)
2794 HOST_WIDE_INT rem = size - rounded_size;
2796 if (rem > 256)
2798 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2800 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2801 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2803 else
2804 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2808 /* Make sure nothing is scheduled before we are done. */
2809 emit_insn (gen_blockage ());
2812 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2813 absolute addresses. */
2815 const char *
2816 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2818 static int labelno = 0;
2819 char loop_lab[32];
2820 rtx xops[2];
2822 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2824 /* Loop. */
2825 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2827 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2828 xops[0] = reg1;
2829 xops[1] = GEN_INT (PROBE_INTERVAL);
2830 output_asm_insn ("sub\t%0, %0, %1", xops);
2832 /* Probe at TEST_ADDR. */
2833 output_asm_insn ("str\txzr, [%0]", xops);
2835 /* Test if TEST_ADDR == LAST_ADDR. */
2836 xops[1] = reg2;
2837 output_asm_insn ("cmp\t%0, %1", xops);
2839 /* Branch. */
2840 fputs ("\tb.ne\t", asm_out_file);
2841 assemble_name_raw (asm_out_file, loop_lab);
2842 fputc ('\n', asm_out_file);
2844 return "";
2847 static bool
2848 aarch64_frame_pointer_required (void)
2850 /* In aarch64_override_options_after_change
2851 flag_omit_leaf_frame_pointer turns off the frame pointer by
2852 default. Turn it back on now if we've not got a leaf
2853 function. */
2854 if (flag_omit_leaf_frame_pointer
2855 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2856 return true;
2858 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2859 if (crtl->calls_eh_return)
2860 return true;
2862 return false;
2865 /* Mark the registers that need to be saved by the callee and calculate
2866 the size of the callee-saved registers area and frame record (both FP
2867 and LR may be omitted). */
2868 static void
2869 aarch64_layout_frame (void)
2871 HOST_WIDE_INT offset = 0;
2872 int regno, last_fp_reg = INVALID_REGNUM;
2874 if (reload_completed && cfun->machine->frame.laid_out)
2875 return;
2877 #define SLOT_NOT_REQUIRED (-2)
2878 #define SLOT_REQUIRED (-1)
2880 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2881 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2883 /* First mark all the registers that really need to be saved... */
2884 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2885 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2887 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2888 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2890 /* ... that includes the eh data registers (if needed)... */
2891 if (crtl->calls_eh_return)
2892 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2893 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2894 = SLOT_REQUIRED;
2896 /* ... and any callee saved register that dataflow says is live. */
2897 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2898 if (df_regs_ever_live_p (regno)
2899 && (regno == R30_REGNUM
2900 || !call_used_regs[regno]))
2901 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2903 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2904 if (df_regs_ever_live_p (regno)
2905 && !call_used_regs[regno])
2907 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2908 last_fp_reg = regno;
2911 if (frame_pointer_needed)
2913 /* FP and LR are placed in the linkage record. */
2914 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2915 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2916 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2917 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2918 offset += 2 * UNITS_PER_WORD;
2921 /* Now assign stack slots for them. */
2922 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2923 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2925 cfun->machine->frame.reg_offset[regno] = offset;
2926 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2927 cfun->machine->frame.wb_candidate1 = regno;
2928 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2929 cfun->machine->frame.wb_candidate2 = regno;
2930 offset += UNITS_PER_WORD;
2933 HOST_WIDE_INT max_int_offset = offset;
2934 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2935 bool has_align_gap = offset != max_int_offset;
2937 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2938 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2940 /* If there is an alignment gap between integer and fp callee-saves,
2941 allocate the last fp register to it if possible. */
2942 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2944 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2945 break;
2948 cfun->machine->frame.reg_offset[regno] = offset;
2949 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2950 cfun->machine->frame.wb_candidate1 = regno;
2951 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2952 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2953 cfun->machine->frame.wb_candidate2 = regno;
2954 offset += UNITS_PER_WORD;
2957 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2959 cfun->machine->frame.saved_regs_size = offset;
2961 HOST_WIDE_INT varargs_and_saved_regs_size
2962 = offset + cfun->machine->frame.saved_varargs_size;
2964 cfun->machine->frame.hard_fp_offset
2965 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2966 STACK_BOUNDARY / BITS_PER_UNIT);
2968 cfun->machine->frame.frame_size
2969 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2970 + crtl->outgoing_args_size,
2971 STACK_BOUNDARY / BITS_PER_UNIT);
2973 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2975 cfun->machine->frame.initial_adjust = 0;
2976 cfun->machine->frame.final_adjust = 0;
2977 cfun->machine->frame.callee_adjust = 0;
2978 cfun->machine->frame.callee_offset = 0;
2980 HOST_WIDE_INT max_push_offset = 0;
2981 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2982 max_push_offset = 512;
2983 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2984 max_push_offset = 256;
2986 if (cfun->machine->frame.frame_size < max_push_offset
2987 && crtl->outgoing_args_size == 0)
2989 /* Simple, small frame with no outgoing arguments:
2990 stp reg1, reg2, [sp, -frame_size]!
2991 stp reg3, reg4, [sp, 16] */
2992 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2994 else if ((crtl->outgoing_args_size
2995 + cfun->machine->frame.saved_regs_size < 512)
2996 && !(cfun->calls_alloca
2997 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2999 /* Frame with small outgoing arguments:
3000 sub sp, sp, frame_size
3001 stp reg1, reg2, [sp, outgoing_args_size]
3002 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3003 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3004 cfun->machine->frame.callee_offset
3005 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3007 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3009 /* Frame with large outgoing arguments but a small local area:
3010 stp reg1, reg2, [sp, -hard_fp_offset]!
3011 stp reg3, reg4, [sp, 16]
3012 sub sp, sp, outgoing_args_size */
3013 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3014 cfun->machine->frame.final_adjust
3015 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3017 else if (!frame_pointer_needed
3018 && varargs_and_saved_regs_size < max_push_offset)
3020 /* Frame with large local area and outgoing arguments (this pushes the
3021 callee-saves first, followed by the locals and outgoing area):
3022 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3023 stp reg3, reg4, [sp, 16]
3024 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3025 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3026 cfun->machine->frame.final_adjust
3027 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3028 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3029 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3031 else
3033 /* Frame with large local area and outgoing arguments using frame pointer:
3034 sub sp, sp, hard_fp_offset
3035 stp x29, x30, [sp, 0]
3036 add x29, sp, 0
3037 stp reg3, reg4, [sp, 16]
3038 sub sp, sp, outgoing_args_size */
3039 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3040 cfun->machine->frame.final_adjust
3041 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3044 cfun->machine->frame.laid_out = true;
3047 /* Return true if the register REGNO is saved on entry to
3048 the current function. */
3050 static bool
3051 aarch64_register_saved_on_entry (int regno)
3053 return cfun->machine->frame.reg_offset[regno] >= 0;
3056 /* Return the next register up from REGNO up to LIMIT for the callee
3057 to save. */
3059 static unsigned
3060 aarch64_next_callee_save (unsigned regno, unsigned limit)
3062 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3063 regno ++;
3064 return regno;
3067 /* Push the register number REGNO of mode MODE to the stack with write-back
3068 adjusting the stack by ADJUSTMENT. */
3070 static void
3071 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3072 HOST_WIDE_INT adjustment)
3074 rtx base_rtx = stack_pointer_rtx;
3075 rtx insn, reg, mem;
3077 reg = gen_rtx_REG (mode, regno);
3078 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3079 plus_constant (Pmode, base_rtx, -adjustment));
3080 mem = gen_frame_mem (mode, mem);
3082 insn = emit_move_insn (mem, reg);
3083 RTX_FRAME_RELATED_P (insn) = 1;
3086 /* Generate and return an instruction to store the pair of registers
3087 REG and REG2 of mode MODE to location BASE with write-back adjusting
3088 the stack location BASE by ADJUSTMENT. */
3090 static rtx
3091 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3092 HOST_WIDE_INT adjustment)
3094 switch (mode)
3096 case E_DImode:
3097 return gen_storewb_pairdi_di (base, base, reg, reg2,
3098 GEN_INT (-adjustment),
3099 GEN_INT (UNITS_PER_WORD - adjustment));
3100 case E_DFmode:
3101 return gen_storewb_pairdf_di (base, base, reg, reg2,
3102 GEN_INT (-adjustment),
3103 GEN_INT (UNITS_PER_WORD - adjustment));
3104 default:
3105 gcc_unreachable ();
3109 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3110 stack pointer by ADJUSTMENT. */
3112 static void
3113 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3115 rtx_insn *insn;
3116 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3118 if (regno2 == INVALID_REGNUM)
3119 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3121 rtx reg1 = gen_rtx_REG (mode, regno1);
3122 rtx reg2 = gen_rtx_REG (mode, regno2);
3124 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3125 reg2, adjustment));
3126 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3127 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3128 RTX_FRAME_RELATED_P (insn) = 1;
3131 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3132 adjusting it by ADJUSTMENT afterwards. */
3134 static rtx
3135 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3136 HOST_WIDE_INT adjustment)
3138 switch (mode)
3140 case E_DImode:
3141 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3142 GEN_INT (UNITS_PER_WORD));
3143 case E_DFmode:
3144 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3145 GEN_INT (UNITS_PER_WORD));
3146 default:
3147 gcc_unreachable ();
3151 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3152 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3153 into CFI_OPS. */
3155 static void
3156 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3157 rtx *cfi_ops)
3159 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3160 rtx reg1 = gen_rtx_REG (mode, regno1);
3162 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3164 if (regno2 == INVALID_REGNUM)
3166 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3167 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3168 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3170 else
3172 rtx reg2 = gen_rtx_REG (mode, regno2);
3173 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3174 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3175 reg2, adjustment));
3179 /* Generate and return a store pair instruction of mode MODE to store
3180 register REG1 to MEM1 and register REG2 to MEM2. */
3182 static rtx
3183 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3184 rtx reg2)
3186 switch (mode)
3188 case E_DImode:
3189 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3191 case E_DFmode:
3192 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3194 default:
3195 gcc_unreachable ();
3199 /* Generate and regurn a load pair isntruction of mode MODE to load register
3200 REG1 from MEM1 and register REG2 from MEM2. */
3202 static rtx
3203 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3204 rtx mem2)
3206 switch (mode)
3208 case E_DImode:
3209 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3211 case E_DFmode:
3212 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3214 default:
3215 gcc_unreachable ();
3219 /* Return TRUE if return address signing should be enabled for the current
3220 function, otherwise return FALSE. */
3222 bool
3223 aarch64_return_address_signing_enabled (void)
3225 /* This function should only be called after frame laid out. */
3226 gcc_assert (cfun->machine->frame.laid_out);
3228 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3229 if it's LR is pushed onto stack. */
3230 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3231 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3232 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3235 /* Emit code to save the callee-saved registers from register number START
3236 to LIMIT to the stack at the location starting at offset START_OFFSET,
3237 skipping any write-back candidates if SKIP_WB is true. */
3239 static void
3240 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3241 unsigned start, unsigned limit, bool skip_wb)
3243 rtx_insn *insn;
3244 unsigned regno;
3245 unsigned regno2;
3247 for (regno = aarch64_next_callee_save (start, limit);
3248 regno <= limit;
3249 regno = aarch64_next_callee_save (regno + 1, limit))
3251 rtx reg, mem;
3252 HOST_WIDE_INT offset;
3254 if (skip_wb
3255 && (regno == cfun->machine->frame.wb_candidate1
3256 || regno == cfun->machine->frame.wb_candidate2))
3257 continue;
3259 if (cfun->machine->reg_is_wrapped_separately[regno])
3260 continue;
3262 reg = gen_rtx_REG (mode, regno);
3263 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3264 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3265 offset));
3267 regno2 = aarch64_next_callee_save (regno + 1, limit);
3269 if (regno2 <= limit
3270 && !cfun->machine->reg_is_wrapped_separately[regno2]
3271 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3272 == cfun->machine->frame.reg_offset[regno2]))
3275 rtx reg2 = gen_rtx_REG (mode, regno2);
3276 rtx mem2;
3278 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3279 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3280 offset));
3281 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3282 reg2));
3284 /* The first part of a frame-related parallel insn is
3285 always assumed to be relevant to the frame
3286 calculations; subsequent parts, are only
3287 frame-related if explicitly marked. */
3288 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3289 regno = regno2;
3291 else
3292 insn = emit_move_insn (mem, reg);
3294 RTX_FRAME_RELATED_P (insn) = 1;
3298 /* Emit code to restore the callee registers of mode MODE from register
3299 number START up to and including LIMIT. Restore from the stack offset
3300 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3301 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3303 static void
3304 aarch64_restore_callee_saves (machine_mode mode,
3305 HOST_WIDE_INT start_offset, unsigned start,
3306 unsigned limit, bool skip_wb, rtx *cfi_ops)
3308 rtx base_rtx = stack_pointer_rtx;
3309 unsigned regno;
3310 unsigned regno2;
3311 HOST_WIDE_INT offset;
3313 for (regno = aarch64_next_callee_save (start, limit);
3314 regno <= limit;
3315 regno = aarch64_next_callee_save (regno + 1, limit))
3317 if (cfun->machine->reg_is_wrapped_separately[regno])
3318 continue;
3320 rtx reg, mem;
3322 if (skip_wb
3323 && (regno == cfun->machine->frame.wb_candidate1
3324 || regno == cfun->machine->frame.wb_candidate2))
3325 continue;
3327 reg = gen_rtx_REG (mode, regno);
3328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3329 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3331 regno2 = aarch64_next_callee_save (regno + 1, limit);
3333 if (regno2 <= limit
3334 && !cfun->machine->reg_is_wrapped_separately[regno2]
3335 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3336 == cfun->machine->frame.reg_offset[regno2]))
3338 rtx reg2 = gen_rtx_REG (mode, regno2);
3339 rtx mem2;
3341 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3342 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3343 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3345 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3346 regno = regno2;
3348 else
3349 emit_move_insn (reg, mem);
3350 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3354 static inline bool
3355 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3356 HOST_WIDE_INT offset)
3358 return offset >= -256 && offset < 256;
3361 static inline bool
3362 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3364 return (offset >= 0
3365 && offset < 4096 * GET_MODE_SIZE (mode)
3366 && offset % GET_MODE_SIZE (mode) == 0);
3369 bool
3370 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3372 return (offset >= -64 * GET_MODE_SIZE (mode)
3373 && offset < 64 * GET_MODE_SIZE (mode)
3374 && offset % GET_MODE_SIZE (mode) == 0);
3377 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3379 static sbitmap
3380 aarch64_get_separate_components (void)
3382 aarch64_layout_frame ();
3384 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3385 bitmap_clear (components);
3387 /* The registers we need saved to the frame. */
3388 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3389 if (aarch64_register_saved_on_entry (regno))
3391 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3392 if (!frame_pointer_needed)
3393 offset += cfun->machine->frame.frame_size
3394 - cfun->machine->frame.hard_fp_offset;
3395 /* Check that we can access the stack slot of the register with one
3396 direct load with no adjustments needed. */
3397 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3398 bitmap_set_bit (components, regno);
3401 /* Don't mess with the hard frame pointer. */
3402 if (frame_pointer_needed)
3403 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3405 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3406 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3407 /* If aarch64_layout_frame has chosen registers to store/restore with
3408 writeback don't interfere with them to avoid having to output explicit
3409 stack adjustment instructions. */
3410 if (reg2 != INVALID_REGNUM)
3411 bitmap_clear_bit (components, reg2);
3412 if (reg1 != INVALID_REGNUM)
3413 bitmap_clear_bit (components, reg1);
3415 bitmap_clear_bit (components, LR_REGNUM);
3416 bitmap_clear_bit (components, SP_REGNUM);
3418 return components;
3421 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3423 static sbitmap
3424 aarch64_components_for_bb (basic_block bb)
3426 bitmap in = DF_LIVE_IN (bb);
3427 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3428 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3430 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3431 bitmap_clear (components);
3433 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3434 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3435 if ((!call_used_regs[regno])
3436 && (bitmap_bit_p (in, regno)
3437 || bitmap_bit_p (gen, regno)
3438 || bitmap_bit_p (kill, regno)))
3439 bitmap_set_bit (components, regno);
3441 return components;
3444 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3445 Nothing to do for aarch64. */
3447 static void
3448 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3452 /* Return the next set bit in BMP from START onwards. Return the total number
3453 of bits in BMP if no set bit is found at or after START. */
3455 static unsigned int
3456 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3458 unsigned int nbits = SBITMAP_SIZE (bmp);
3459 if (start == nbits)
3460 return start;
3462 gcc_assert (start < nbits);
3463 for (unsigned int i = start; i < nbits; i++)
3464 if (bitmap_bit_p (bmp, i))
3465 return i;
3467 return nbits;
3470 /* Do the work for aarch64_emit_prologue_components and
3471 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3472 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3473 for these components or the epilogue sequence. That is, it determines
3474 whether we should emit stores or loads and what kind of CFA notes to attach
3475 to the insns. Otherwise the logic for the two sequences is very
3476 similar. */
3478 static void
3479 aarch64_process_components (sbitmap components, bool prologue_p)
3481 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3482 ? HARD_FRAME_POINTER_REGNUM
3483 : STACK_POINTER_REGNUM);
3485 unsigned last_regno = SBITMAP_SIZE (components);
3486 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3487 rtx_insn *insn = NULL;
3489 while (regno != last_regno)
3491 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3492 so DFmode for the vector registers is enough. */
3493 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3494 rtx reg = gen_rtx_REG (mode, regno);
3495 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3496 if (!frame_pointer_needed)
3497 offset += cfun->machine->frame.frame_size
3498 - cfun->machine->frame.hard_fp_offset;
3499 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3500 rtx mem = gen_frame_mem (mode, addr);
3502 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3503 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3504 /* No more registers to handle after REGNO.
3505 Emit a single save/restore and exit. */
3506 if (regno2 == last_regno)
3508 insn = emit_insn (set);
3509 RTX_FRAME_RELATED_P (insn) = 1;
3510 if (prologue_p)
3511 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3512 else
3513 add_reg_note (insn, REG_CFA_RESTORE, reg);
3514 break;
3517 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3518 /* The next register is not of the same class or its offset is not
3519 mergeable with the current one into a pair. */
3520 if (!satisfies_constraint_Ump (mem)
3521 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3522 || (offset2 - cfun->machine->frame.reg_offset[regno])
3523 != GET_MODE_SIZE (mode))
3525 insn = emit_insn (set);
3526 RTX_FRAME_RELATED_P (insn) = 1;
3527 if (prologue_p)
3528 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3529 else
3530 add_reg_note (insn, REG_CFA_RESTORE, reg);
3532 regno = regno2;
3533 continue;
3536 /* REGNO2 can be saved/restored in a pair with REGNO. */
3537 rtx reg2 = gen_rtx_REG (mode, regno2);
3538 if (!frame_pointer_needed)
3539 offset2 += cfun->machine->frame.frame_size
3540 - cfun->machine->frame.hard_fp_offset;
3541 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3542 rtx mem2 = gen_frame_mem (mode, addr2);
3543 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3544 : gen_rtx_SET (reg2, mem2);
3546 if (prologue_p)
3547 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3548 else
3549 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3551 RTX_FRAME_RELATED_P (insn) = 1;
3552 if (prologue_p)
3554 add_reg_note (insn, REG_CFA_OFFSET, set);
3555 add_reg_note (insn, REG_CFA_OFFSET, set2);
3557 else
3559 add_reg_note (insn, REG_CFA_RESTORE, reg);
3560 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3563 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3567 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3569 static void
3570 aarch64_emit_prologue_components (sbitmap components)
3572 aarch64_process_components (components, true);
3575 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3577 static void
3578 aarch64_emit_epilogue_components (sbitmap components)
3580 aarch64_process_components (components, false);
3583 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3585 static void
3586 aarch64_set_handled_components (sbitmap components)
3588 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3589 if (bitmap_bit_p (components, regno))
3590 cfun->machine->reg_is_wrapped_separately[regno] = true;
3593 /* AArch64 stack frames generated by this compiler look like:
3595 +-------------------------------+
3597 | incoming stack arguments |
3599 +-------------------------------+
3600 | | <-- incoming stack pointer (aligned)
3601 | callee-allocated save area |
3602 | for register varargs |
3604 +-------------------------------+
3605 | local variables | <-- frame_pointer_rtx
3607 +-------------------------------+
3608 | padding0 | \
3609 +-------------------------------+ |
3610 | callee-saved registers | | frame.saved_regs_size
3611 +-------------------------------+ |
3612 | LR' | |
3613 +-------------------------------+ |
3614 | FP' | / <- hard_frame_pointer_rtx (aligned)
3615 +-------------------------------+
3616 | dynamic allocation |
3617 +-------------------------------+
3618 | padding |
3619 +-------------------------------+
3620 | outgoing stack arguments | <-- arg_pointer
3622 +-------------------------------+
3623 | | <-- stack_pointer_rtx (aligned)
3625 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3626 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3627 unchanged. */
3629 /* Generate the prologue instructions for entry into a function.
3630 Establish the stack frame by decreasing the stack pointer with a
3631 properly calculated size and, if necessary, create a frame record
3632 filled with the values of LR and previous frame pointer. The
3633 current FP is also set up if it is in use. */
3635 void
3636 aarch64_expand_prologue (void)
3638 aarch64_layout_frame ();
3640 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3641 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3642 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3643 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3644 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3645 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3646 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3647 rtx_insn *insn;
3649 /* Sign return address for functions. */
3650 if (aarch64_return_address_signing_enabled ())
3652 insn = emit_insn (gen_pacisp ());
3653 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3654 RTX_FRAME_RELATED_P (insn) = 1;
3657 if (flag_stack_usage_info)
3658 current_function_static_stack_size = frame_size;
3660 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3662 if (crtl->is_leaf && !cfun->calls_alloca)
3664 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3665 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3666 frame_size - STACK_CHECK_PROTECT);
3668 else if (frame_size > 0)
3669 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3672 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3674 if (callee_adjust != 0)
3675 aarch64_push_regs (reg1, reg2, callee_adjust);
3677 if (frame_pointer_needed)
3679 if (callee_adjust == 0)
3680 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3681 R30_REGNUM, false);
3682 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3683 stack_pointer_rtx,
3684 GEN_INT (callee_offset)));
3685 RTX_FRAME_RELATED_P (insn) = 1;
3686 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3689 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3690 callee_adjust != 0 || frame_pointer_needed);
3691 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3692 callee_adjust != 0 || frame_pointer_needed);
3693 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3696 /* Return TRUE if we can use a simple_return insn.
3698 This function checks whether the callee saved stack is empty, which
3699 means no restore actions are need. The pro_and_epilogue will use
3700 this to check whether shrink-wrapping opt is feasible. */
3702 bool
3703 aarch64_use_return_insn_p (void)
3705 if (!reload_completed)
3706 return false;
3708 if (crtl->profile)
3709 return false;
3711 aarch64_layout_frame ();
3713 return cfun->machine->frame.frame_size == 0;
3716 /* Generate the epilogue instructions for returning from a function.
3717 This is almost exactly the reverse of the prolog sequence, except
3718 that we need to insert barriers to avoid scheduling loads that read
3719 from a deallocated stack, and we optimize the unwind records by
3720 emitting them all together if possible. */
3721 void
3722 aarch64_expand_epilogue (bool for_sibcall)
3724 aarch64_layout_frame ();
3726 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3727 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3728 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3729 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3730 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3731 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3732 rtx cfi_ops = NULL;
3733 rtx_insn *insn;
3735 /* We need to add memory barrier to prevent read from deallocated stack. */
3736 bool need_barrier_p = (get_frame_size ()
3737 + cfun->machine->frame.saved_varargs_size) != 0;
3739 /* Emit a barrier to prevent loads from a deallocated stack. */
3740 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3741 || crtl->calls_eh_return)
3743 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3744 need_barrier_p = false;
3747 /* Restore the stack pointer from the frame pointer if it may not
3748 be the same as the stack pointer. */
3749 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3751 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3752 hard_frame_pointer_rtx,
3753 GEN_INT (-callee_offset)));
3754 /* If writeback is used when restoring callee-saves, the CFA
3755 is restored on the instruction doing the writeback. */
3756 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3758 else
3759 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3761 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3762 callee_adjust != 0, &cfi_ops);
3763 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3764 callee_adjust != 0, &cfi_ops);
3766 if (need_barrier_p)
3767 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3769 if (callee_adjust != 0)
3770 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3772 if (callee_adjust != 0 || initial_adjust > 65536)
3774 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3775 insn = get_last_insn ();
3776 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3777 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3778 RTX_FRAME_RELATED_P (insn) = 1;
3779 cfi_ops = NULL;
3782 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3784 if (cfi_ops)
3786 /* Emit delayed restores and reset the CFA to be SP. */
3787 insn = get_last_insn ();
3788 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3789 REG_NOTES (insn) = cfi_ops;
3790 RTX_FRAME_RELATED_P (insn) = 1;
3793 /* We prefer to emit the combined return/authenticate instruction RETAA,
3794 however there are three cases in which we must instead emit an explicit
3795 authentication instruction.
3797 1) Sibcalls don't return in a normal way, so if we're about to call one
3798 we must authenticate.
3800 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3801 generating code for !TARGET_ARMV8_3 we can't use it and must
3802 explicitly authenticate.
3804 3) On an eh_return path we make extra stack adjustments to update the
3805 canonical frame address to be the exception handler's CFA. We want
3806 to authenticate using the CFA of the function which calls eh_return.
3808 if (aarch64_return_address_signing_enabled ()
3809 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3811 insn = emit_insn (gen_autisp ());
3812 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3813 RTX_FRAME_RELATED_P (insn) = 1;
3816 /* Stack adjustment for exception handler. */
3817 if (crtl->calls_eh_return)
3819 /* We need to unwind the stack by the offset computed by
3820 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3821 to be SP; letting the CFA move during this adjustment
3822 is just as correct as retaining the CFA from the body
3823 of the function. Therefore, do nothing special. */
3824 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3827 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3828 if (!for_sibcall)
3829 emit_jump_insn (ret_rtx);
3832 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3833 normally or return to a previous frame after unwinding.
3835 An EH return uses a single shared return sequence. The epilogue is
3836 exactly like a normal epilogue except that it has an extra input
3837 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3838 that must be applied after the frame has been destroyed. An extra label
3839 is inserted before the epilogue which initializes this register to zero,
3840 and this is the entry point for a normal return.
3842 An actual EH return updates the return address, initializes the stack
3843 adjustment and jumps directly into the epilogue (bypassing the zeroing
3844 of the adjustment). Since the return address is typically saved on the
3845 stack when a function makes a call, the saved LR must be updated outside
3846 the epilogue.
3848 This poses problems as the store is generated well before the epilogue,
3849 so the offset of LR is not known yet. Also optimizations will remove the
3850 store as it appears dead, even after the epilogue is generated (as the
3851 base or offset for loading LR is different in many cases).
3853 To avoid these problems this implementation forces the frame pointer
3854 in eh_return functions so that the location of LR is fixed and known early.
3855 It also marks the store volatile, so no optimization is permitted to
3856 remove the store. */
3858 aarch64_eh_return_handler_rtx (void)
3860 rtx tmp = gen_frame_mem (Pmode,
3861 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3863 /* Mark the store volatile, so no optimization is permitted to remove it. */
3864 MEM_VOLATILE_P (tmp) = true;
3865 return tmp;
3868 /* Output code to add DELTA to the first argument, and then jump
3869 to FUNCTION. Used for C++ multiple inheritance. */
3870 static void
3871 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3872 HOST_WIDE_INT delta,
3873 HOST_WIDE_INT vcall_offset,
3874 tree function)
3876 /* The this pointer is always in x0. Note that this differs from
3877 Arm where the this pointer maybe bumped to r1 if r0 is required
3878 to return a pointer to an aggregate. On AArch64 a result value
3879 pointer will be in x8. */
3880 int this_regno = R0_REGNUM;
3881 rtx this_rtx, temp0, temp1, addr, funexp;
3882 rtx_insn *insn;
3884 reload_completed = 1;
3885 emit_note (NOTE_INSN_PROLOGUE_END);
3887 if (vcall_offset == 0)
3888 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3889 else
3891 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3893 this_rtx = gen_rtx_REG (Pmode, this_regno);
3894 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3895 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3897 addr = this_rtx;
3898 if (delta != 0)
3900 if (delta >= -256 && delta < 256)
3901 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3902 plus_constant (Pmode, this_rtx, delta));
3903 else
3904 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3907 if (Pmode == ptr_mode)
3908 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3909 else
3910 aarch64_emit_move (temp0,
3911 gen_rtx_ZERO_EXTEND (Pmode,
3912 gen_rtx_MEM (ptr_mode, addr)));
3914 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3915 addr = plus_constant (Pmode, temp0, vcall_offset);
3916 else
3918 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3919 Pmode);
3920 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3923 if (Pmode == ptr_mode)
3924 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3925 else
3926 aarch64_emit_move (temp1,
3927 gen_rtx_SIGN_EXTEND (Pmode,
3928 gen_rtx_MEM (ptr_mode, addr)));
3930 emit_insn (gen_add2_insn (this_rtx, temp1));
3933 /* Generate a tail call to the target function. */
3934 if (!TREE_USED (function))
3936 assemble_external (function);
3937 TREE_USED (function) = 1;
3939 funexp = XEXP (DECL_RTL (function), 0);
3940 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3941 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3942 SIBLING_CALL_P (insn) = 1;
3944 insn = get_insns ();
3945 shorten_branches (insn);
3946 final_start_function (insn, file, 1);
3947 final (insn, file, 1);
3948 final_end_function ();
3950 /* Stop pretending to be a post-reload pass. */
3951 reload_completed = 0;
3954 static bool
3955 aarch64_tls_referenced_p (rtx x)
3957 if (!TARGET_HAVE_TLS)
3958 return false;
3959 subrtx_iterator::array_type array;
3960 FOR_EACH_SUBRTX (iter, array, x, ALL)
3962 const_rtx x = *iter;
3963 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3964 return true;
3965 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3966 TLS offsets, not real symbol references. */
3967 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3968 iter.skip_subrtxes ();
3970 return false;
3974 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3975 a left shift of 0 or 12 bits. */
3976 bool
3977 aarch64_uimm12_shift (HOST_WIDE_INT val)
3979 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3980 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3985 /* Return true if val is an immediate that can be loaded into a
3986 register by a MOVZ instruction. */
3987 static bool
3988 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3990 if (GET_MODE_SIZE (mode) > 4)
3992 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3993 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3994 return 1;
3996 else
3998 /* Ignore sign extension. */
3999 val &= (HOST_WIDE_INT) 0xffffffff;
4001 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4002 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4005 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4007 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4009 0x0000000100000001ull,
4010 0x0001000100010001ull,
4011 0x0101010101010101ull,
4012 0x1111111111111111ull,
4013 0x5555555555555555ull,
4017 /* Return true if val is a valid bitmask immediate. */
4019 bool
4020 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4022 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4023 int bits;
4025 /* Check for a single sequence of one bits and return quickly if so.
4026 The special cases of all ones and all zeroes returns false. */
4027 val = (unsigned HOST_WIDE_INT) val_in;
4028 tmp = val + (val & -val);
4030 if (tmp == (tmp & -tmp))
4031 return (val + 1) > 1;
4033 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4034 if (mode == SImode)
4035 val = (val << 32) | (val & 0xffffffff);
4037 /* Invert if the immediate doesn't start with a zero bit - this means we
4038 only need to search for sequences of one bits. */
4039 if (val & 1)
4040 val = ~val;
4042 /* Find the first set bit and set tmp to val with the first sequence of one
4043 bits removed. Return success if there is a single sequence of ones. */
4044 first_one = val & -val;
4045 tmp = val & (val + first_one);
4047 if (tmp == 0)
4048 return true;
4050 /* Find the next set bit and compute the difference in bit position. */
4051 next_one = tmp & -tmp;
4052 bits = clz_hwi (first_one) - clz_hwi (next_one);
4053 mask = val ^ tmp;
4055 /* Check the bit position difference is a power of 2, and that the first
4056 sequence of one bits fits within 'bits' bits. */
4057 if ((mask >> bits) != 0 || bits != (bits & -bits))
4058 return false;
4060 /* Check the sequence of one bits is repeated 64/bits times. */
4061 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4064 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4065 Assumed precondition: VAL_IN Is not zero. */
4067 unsigned HOST_WIDE_INT
4068 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4070 int lowest_bit_set = ctz_hwi (val_in);
4071 int highest_bit_set = floor_log2 (val_in);
4072 gcc_assert (val_in != 0);
4074 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4075 (HOST_WIDE_INT_1U << lowest_bit_set));
4078 /* Create constant where bits outside of lowest bit set to highest bit set
4079 are set to 1. */
4081 unsigned HOST_WIDE_INT
4082 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4084 return val_in | ~aarch64_and_split_imm1 (val_in);
4087 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4089 bool
4090 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4092 if (aarch64_bitmask_imm (val_in, mode))
4093 return false;
4095 if (aarch64_move_imm (val_in, mode))
4096 return false;
4098 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4100 return aarch64_bitmask_imm (imm2, mode);
4103 /* Return true if val is an immediate that can be loaded into a
4104 register in a single instruction. */
4105 bool
4106 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4108 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4109 return 1;
4110 return aarch64_bitmask_imm (val, mode);
4113 static bool
4114 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4116 rtx base, offset;
4118 if (GET_CODE (x) == HIGH)
4119 return true;
4121 split_const (x, &base, &offset);
4122 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4124 if (aarch64_classify_symbol (base, offset)
4125 != SYMBOL_FORCE_TO_MEM)
4126 return true;
4127 else
4128 /* Avoid generating a 64-bit relocation in ILP32; leave
4129 to aarch64_expand_mov_immediate to handle it properly. */
4130 return mode != ptr_mode;
4133 return aarch64_tls_referenced_p (x);
4136 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4137 The expansion for a table switch is quite expensive due to the number
4138 of instructions, the table lookup and hard to predict indirect jump.
4139 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4140 set, otherwise use tables for > 16 cases as a tradeoff between size and
4141 performance. When optimizing for size, use the default setting. */
4143 static unsigned int
4144 aarch64_case_values_threshold (void)
4146 /* Use the specified limit for the number of cases before using jump
4147 tables at higher optimization levels. */
4148 if (optimize > 2
4149 && selected_cpu->tune->max_case_values != 0)
4150 return selected_cpu->tune->max_case_values;
4151 else
4152 return optimize_size ? default_case_values_threshold () : 17;
4155 /* Return true if register REGNO is a valid index register.
4156 STRICT_P is true if REG_OK_STRICT is in effect. */
4158 bool
4159 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4161 if (!HARD_REGISTER_NUM_P (regno))
4163 if (!strict_p)
4164 return true;
4166 if (!reg_renumber)
4167 return false;
4169 regno = reg_renumber[regno];
4171 return GP_REGNUM_P (regno);
4174 /* Return true if register REGNO is a valid base register for mode MODE.
4175 STRICT_P is true if REG_OK_STRICT is in effect. */
4177 bool
4178 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4180 if (!HARD_REGISTER_NUM_P (regno))
4182 if (!strict_p)
4183 return true;
4185 if (!reg_renumber)
4186 return false;
4188 regno = reg_renumber[regno];
4191 /* The fake registers will be eliminated to either the stack or
4192 hard frame pointer, both of which are usually valid base registers.
4193 Reload deals with the cases where the eliminated form isn't valid. */
4194 return (GP_REGNUM_P (regno)
4195 || regno == SP_REGNUM
4196 || regno == FRAME_POINTER_REGNUM
4197 || regno == ARG_POINTER_REGNUM);
4200 /* Return true if X is a valid base register for mode MODE.
4201 STRICT_P is true if REG_OK_STRICT is in effect. */
4203 static bool
4204 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4206 if (!strict_p
4207 && GET_CODE (x) == SUBREG
4208 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4209 x = SUBREG_REG (x);
4211 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4214 /* Return true if address offset is a valid index. If it is, fill in INFO
4215 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4217 static bool
4218 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4219 machine_mode mode, bool strict_p)
4221 enum aarch64_address_type type;
4222 rtx index;
4223 int shift;
4225 /* (reg:P) */
4226 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4227 && GET_MODE (x) == Pmode)
4229 type = ADDRESS_REG_REG;
4230 index = x;
4231 shift = 0;
4233 /* (sign_extend:DI (reg:SI)) */
4234 else if ((GET_CODE (x) == SIGN_EXTEND
4235 || GET_CODE (x) == ZERO_EXTEND)
4236 && GET_MODE (x) == DImode
4237 && GET_MODE (XEXP (x, 0)) == SImode)
4239 type = (GET_CODE (x) == SIGN_EXTEND)
4240 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4241 index = XEXP (x, 0);
4242 shift = 0;
4244 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4245 else if (GET_CODE (x) == MULT
4246 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4247 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4248 && GET_MODE (XEXP (x, 0)) == DImode
4249 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4250 && CONST_INT_P (XEXP (x, 1)))
4252 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4253 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4254 index = XEXP (XEXP (x, 0), 0);
4255 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4257 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4258 else if (GET_CODE (x) == ASHIFT
4259 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4260 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4261 && GET_MODE (XEXP (x, 0)) == DImode
4262 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4263 && CONST_INT_P (XEXP (x, 1)))
4265 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4266 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267 index = XEXP (XEXP (x, 0), 0);
4268 shift = INTVAL (XEXP (x, 1));
4270 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4271 else if ((GET_CODE (x) == SIGN_EXTRACT
4272 || GET_CODE (x) == ZERO_EXTRACT)
4273 && GET_MODE (x) == DImode
4274 && GET_CODE (XEXP (x, 0)) == MULT
4275 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4276 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4278 type = (GET_CODE (x) == SIGN_EXTRACT)
4279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280 index = XEXP (XEXP (x, 0), 0);
4281 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4282 if (INTVAL (XEXP (x, 1)) != 32 + shift
4283 || INTVAL (XEXP (x, 2)) != 0)
4284 shift = -1;
4286 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4287 (const_int 0xffffffff<<shift)) */
4288 else if (GET_CODE (x) == AND
4289 && GET_MODE (x) == DImode
4290 && GET_CODE (XEXP (x, 0)) == MULT
4291 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4292 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4293 && CONST_INT_P (XEXP (x, 1)))
4295 type = ADDRESS_REG_UXTW;
4296 index = XEXP (XEXP (x, 0), 0);
4297 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4298 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4299 shift = -1;
4301 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4302 else if ((GET_CODE (x) == SIGN_EXTRACT
4303 || GET_CODE (x) == ZERO_EXTRACT)
4304 && GET_MODE (x) == DImode
4305 && GET_CODE (XEXP (x, 0)) == ASHIFT
4306 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4307 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4309 type = (GET_CODE (x) == SIGN_EXTRACT)
4310 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4311 index = XEXP (XEXP (x, 0), 0);
4312 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4313 if (INTVAL (XEXP (x, 1)) != 32 + shift
4314 || INTVAL (XEXP (x, 2)) != 0)
4315 shift = -1;
4317 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4318 (const_int 0xffffffff<<shift)) */
4319 else if (GET_CODE (x) == AND
4320 && GET_MODE (x) == DImode
4321 && GET_CODE (XEXP (x, 0)) == ASHIFT
4322 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4323 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4324 && CONST_INT_P (XEXP (x, 1)))
4326 type = ADDRESS_REG_UXTW;
4327 index = XEXP (XEXP (x, 0), 0);
4328 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4329 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4330 shift = -1;
4332 /* (mult:P (reg:P) (const_int scale)) */
4333 else if (GET_CODE (x) == MULT
4334 && GET_MODE (x) == Pmode
4335 && GET_MODE (XEXP (x, 0)) == Pmode
4336 && CONST_INT_P (XEXP (x, 1)))
4338 type = ADDRESS_REG_REG;
4339 index = XEXP (x, 0);
4340 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4342 /* (ashift:P (reg:P) (const_int shift)) */
4343 else if (GET_CODE (x) == ASHIFT
4344 && GET_MODE (x) == Pmode
4345 && GET_MODE (XEXP (x, 0)) == Pmode
4346 && CONST_INT_P (XEXP (x, 1)))
4348 type = ADDRESS_REG_REG;
4349 index = XEXP (x, 0);
4350 shift = INTVAL (XEXP (x, 1));
4352 else
4353 return false;
4355 if (!strict_p
4356 && GET_CODE (index) == SUBREG
4357 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4358 index = SUBREG_REG (index);
4360 if ((shift == 0 ||
4361 (shift > 0 && shift <= 3
4362 && (1 << shift) == GET_MODE_SIZE (mode)))
4363 && REG_P (index)
4364 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4366 info->type = type;
4367 info->offset = index;
4368 info->shift = shift;
4369 return true;
4372 return false;
4375 /* Return true if MODE is one of the modes for which we
4376 support LDP/STP operations. */
4378 static bool
4379 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4381 return mode == SImode || mode == DImode
4382 || mode == SFmode || mode == DFmode
4383 || (aarch64_vector_mode_supported_p (mode)
4384 && GET_MODE_SIZE (mode) == 8);
4387 /* Return true if REGNO is a virtual pointer register, or an eliminable
4388 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4389 include stack_pointer or hard_frame_pointer. */
4390 static bool
4391 virt_or_elim_regno_p (unsigned regno)
4393 return ((regno >= FIRST_VIRTUAL_REGISTER
4394 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4395 || regno == FRAME_POINTER_REGNUM
4396 || regno == ARG_POINTER_REGNUM);
4399 /* Return true if X is a valid address for machine mode MODE. If it is,
4400 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4401 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4403 static bool
4404 aarch64_classify_address (struct aarch64_address_info *info,
4405 rtx x, machine_mode mode,
4406 RTX_CODE outer_code, bool strict_p)
4408 enum rtx_code code = GET_CODE (x);
4409 rtx op0, op1;
4411 /* On BE, we use load/store pair for all large int mode load/stores.
4412 TI/TFmode may also use a load/store pair. */
4413 bool load_store_pair_p = (outer_code == PARALLEL
4414 || mode == TImode
4415 || mode == TFmode
4416 || (BYTES_BIG_ENDIAN
4417 && aarch64_vect_struct_mode_p (mode)));
4419 bool allow_reg_index_p =
4420 !load_store_pair_p
4421 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4422 && !aarch64_vect_struct_mode_p (mode);
4424 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4425 REG addressing. */
4426 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4427 && (code != POST_INC && code != REG))
4428 return false;
4430 switch (code)
4432 case REG:
4433 case SUBREG:
4434 info->type = ADDRESS_REG_IMM;
4435 info->base = x;
4436 info->offset = const0_rtx;
4437 return aarch64_base_register_rtx_p (x, strict_p);
4439 case PLUS:
4440 op0 = XEXP (x, 0);
4441 op1 = XEXP (x, 1);
4443 if (! strict_p
4444 && REG_P (op0)
4445 && virt_or_elim_regno_p (REGNO (op0))
4446 && CONST_INT_P (op1))
4448 info->type = ADDRESS_REG_IMM;
4449 info->base = op0;
4450 info->offset = op1;
4452 return true;
4455 if (GET_MODE_SIZE (mode) != 0
4456 && CONST_INT_P (op1)
4457 && aarch64_base_register_rtx_p (op0, strict_p))
4459 HOST_WIDE_INT offset = INTVAL (op1);
4461 info->type = ADDRESS_REG_IMM;
4462 info->base = op0;
4463 info->offset = op1;
4465 /* TImode and TFmode values are allowed in both pairs of X
4466 registers and individual Q registers. The available
4467 address modes are:
4468 X,X: 7-bit signed scaled offset
4469 Q: 9-bit signed offset
4470 We conservatively require an offset representable in either mode.
4471 When performing the check for pairs of X registers i.e. LDP/STP
4472 pass down DImode since that is the natural size of the LDP/STP
4473 instruction memory accesses. */
4474 if (mode == TImode || mode == TFmode)
4475 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4476 && (offset_9bit_signed_unscaled_p (mode, offset)
4477 || offset_12bit_unsigned_scaled_p (mode, offset)));
4479 /* A 7bit offset check because OImode will emit a ldp/stp
4480 instruction (only big endian will get here).
4481 For ldp/stp instructions, the offset is scaled for the size of a
4482 single element of the pair. */
4483 if (mode == OImode)
4484 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4486 /* Three 9/12 bit offsets checks because CImode will emit three
4487 ldr/str instructions (only big endian will get here). */
4488 if (mode == CImode)
4489 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4490 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4491 || offset_12bit_unsigned_scaled_p (V16QImode,
4492 offset + 32)));
4494 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4495 instructions (only big endian will get here). */
4496 if (mode == XImode)
4497 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4498 && aarch64_offset_7bit_signed_scaled_p (TImode,
4499 offset + 32));
4501 if (load_store_pair_p)
4502 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4503 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4504 else
4505 return (offset_9bit_signed_unscaled_p (mode, offset)
4506 || offset_12bit_unsigned_scaled_p (mode, offset));
4509 if (allow_reg_index_p)
4511 /* Look for base + (scaled/extended) index register. */
4512 if (aarch64_base_register_rtx_p (op0, strict_p)
4513 && aarch64_classify_index (info, op1, mode, strict_p))
4515 info->base = op0;
4516 return true;
4518 if (aarch64_base_register_rtx_p (op1, strict_p)
4519 && aarch64_classify_index (info, op0, mode, strict_p))
4521 info->base = op1;
4522 return true;
4526 return false;
4528 case POST_INC:
4529 case POST_DEC:
4530 case PRE_INC:
4531 case PRE_DEC:
4532 info->type = ADDRESS_REG_WB;
4533 info->base = XEXP (x, 0);
4534 info->offset = NULL_RTX;
4535 return aarch64_base_register_rtx_p (info->base, strict_p);
4537 case POST_MODIFY:
4538 case PRE_MODIFY:
4539 info->type = ADDRESS_REG_WB;
4540 info->base = XEXP (x, 0);
4541 if (GET_CODE (XEXP (x, 1)) == PLUS
4542 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4543 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4544 && aarch64_base_register_rtx_p (info->base, strict_p))
4546 HOST_WIDE_INT offset;
4547 info->offset = XEXP (XEXP (x, 1), 1);
4548 offset = INTVAL (info->offset);
4550 /* TImode and TFmode values are allowed in both pairs of X
4551 registers and individual Q registers. The available
4552 address modes are:
4553 X,X: 7-bit signed scaled offset
4554 Q: 9-bit signed offset
4555 We conservatively require an offset representable in either mode.
4557 if (mode == TImode || mode == TFmode)
4558 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4559 && offset_9bit_signed_unscaled_p (mode, offset));
4561 if (load_store_pair_p)
4562 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4563 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4564 else
4565 return offset_9bit_signed_unscaled_p (mode, offset);
4567 return false;
4569 case CONST:
4570 case SYMBOL_REF:
4571 case LABEL_REF:
4572 /* load literal: pc-relative constant pool entry. Only supported
4573 for SI mode or larger. */
4574 info->type = ADDRESS_SYMBOLIC;
4576 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4578 rtx sym, addend;
4580 split_const (x, &sym, &addend);
4581 return ((GET_CODE (sym) == LABEL_REF
4582 || (GET_CODE (sym) == SYMBOL_REF
4583 && CONSTANT_POOL_ADDRESS_P (sym)
4584 && aarch64_pcrelative_literal_loads)));
4586 return false;
4588 case LO_SUM:
4589 info->type = ADDRESS_LO_SUM;
4590 info->base = XEXP (x, 0);
4591 info->offset = XEXP (x, 1);
4592 if (allow_reg_index_p
4593 && aarch64_base_register_rtx_p (info->base, strict_p))
4595 rtx sym, offs;
4596 split_const (info->offset, &sym, &offs);
4597 if (GET_CODE (sym) == SYMBOL_REF
4598 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4600 /* The symbol and offset must be aligned to the access size. */
4601 unsigned int align;
4602 unsigned int ref_size;
4604 if (CONSTANT_POOL_ADDRESS_P (sym))
4605 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4606 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4608 tree exp = SYMBOL_REF_DECL (sym);
4609 align = TYPE_ALIGN (TREE_TYPE (exp));
4610 align = CONSTANT_ALIGNMENT (exp, align);
4612 else if (SYMBOL_REF_DECL (sym))
4613 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4614 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4615 && SYMBOL_REF_BLOCK (sym) != NULL)
4616 align = SYMBOL_REF_BLOCK (sym)->alignment;
4617 else
4618 align = BITS_PER_UNIT;
4620 ref_size = GET_MODE_SIZE (mode);
4621 if (ref_size == 0)
4622 ref_size = GET_MODE_SIZE (DImode);
4624 return ((INTVAL (offs) & (ref_size - 1)) == 0
4625 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4628 return false;
4630 default:
4631 return false;
4635 /* Return true if the address X is valid for a PRFM instruction.
4636 STRICT_P is true if we should do strict checking with
4637 aarch64_classify_address. */
4639 bool
4640 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4642 struct aarch64_address_info addr;
4644 /* PRFM accepts the same addresses as DImode... */
4645 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4646 if (!res)
4647 return false;
4649 /* ... except writeback forms. */
4650 return addr.type != ADDRESS_REG_WB;
4653 bool
4654 aarch64_symbolic_address_p (rtx x)
4656 rtx offset;
4658 split_const (x, &x, &offset);
4659 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4662 /* Classify the base of symbolic expression X. */
4664 enum aarch64_symbol_type
4665 aarch64_classify_symbolic_expression (rtx x)
4667 rtx offset;
4669 split_const (x, &x, &offset);
4670 return aarch64_classify_symbol (x, offset);
4674 /* Return TRUE if X is a legitimate address for accessing memory in
4675 mode MODE. */
4676 static bool
4677 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4679 struct aarch64_address_info addr;
4681 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4684 /* Return TRUE if X is a legitimate address for accessing memory in
4685 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4686 pair operation. */
4687 bool
4688 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4689 RTX_CODE outer_code, bool strict_p)
4691 struct aarch64_address_info addr;
4693 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4696 /* Split an out-of-range address displacement into a base and offset.
4697 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4698 to increase opportunities for sharing the base address of different sizes.
4699 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4700 static bool
4701 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4703 HOST_WIDE_INT offset = INTVAL (*disp);
4704 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4706 if (mode == TImode || mode == TFmode
4707 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4708 base = (offset + 0x100) & ~0x1ff;
4710 *off = GEN_INT (base);
4711 *disp = GEN_INT (offset - base);
4712 return true;
4715 /* Return the binary representation of floating point constant VALUE in INTVAL.
4716 If the value cannot be converted, return false without setting INTVAL.
4717 The conversion is done in the given MODE. */
4718 bool
4719 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4722 /* We make a general exception for 0. */
4723 if (aarch64_float_const_zero_rtx_p (value))
4725 *intval = 0;
4726 return true;
4729 machine_mode mode = GET_MODE (value);
4730 if (GET_CODE (value) != CONST_DOUBLE
4731 || !SCALAR_FLOAT_MODE_P (mode)
4732 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4733 /* Only support up to DF mode. */
4734 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4735 return false;
4737 unsigned HOST_WIDE_INT ival = 0;
4739 long res[2];
4740 real_to_target (res,
4741 CONST_DOUBLE_REAL_VALUE (value),
4742 REAL_MODE_FORMAT (mode));
4744 if (mode == DFmode)
4746 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4747 ival = zext_hwi (res[order], 32);
4748 ival |= (zext_hwi (res[1 - order], 32) << 32);
4750 else
4751 ival = zext_hwi (res[0], 32);
4753 *intval = ival;
4754 return true;
4757 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4758 single MOV(+MOVK) followed by an FMOV. */
4759 bool
4760 aarch64_float_const_rtx_p (rtx x)
4762 machine_mode mode = GET_MODE (x);
4763 if (mode == VOIDmode)
4764 return false;
4766 /* Determine whether it's cheaper to write float constants as
4767 mov/movk pairs over ldr/adrp pairs. */
4768 unsigned HOST_WIDE_INT ival;
4770 if (GET_CODE (x) == CONST_DOUBLE
4771 && SCALAR_FLOAT_MODE_P (mode)
4772 && aarch64_reinterpret_float_as_int (x, &ival))
4774 machine_mode imode = (mode == HFmode
4775 ? SImode
4776 : int_mode_for_mode (mode).require ());
4777 int num_instr = aarch64_internal_mov_immediate
4778 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4779 return num_instr < 3;
4782 return false;
4785 /* Return TRUE if rtx X is immediate constant 0.0 */
4786 bool
4787 aarch64_float_const_zero_rtx_p (rtx x)
4789 if (GET_MODE (x) == VOIDmode)
4790 return false;
4792 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4793 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4794 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4797 /* Return TRUE if rtx X is immediate constant that fits in a single
4798 MOVI immediate operation. */
4799 bool
4800 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4802 if (!TARGET_SIMD)
4803 return false;
4805 machine_mode vmode, imode;
4806 unsigned HOST_WIDE_INT ival;
4808 if (GET_CODE (x) == CONST_DOUBLE
4809 && SCALAR_FLOAT_MODE_P (mode))
4811 if (!aarch64_reinterpret_float_as_int (x, &ival))
4812 return false;
4814 /* We make a general exception for 0. */
4815 if (aarch64_float_const_zero_rtx_p (x))
4816 return true;
4818 imode = int_mode_for_mode (mode).require ();
4820 else if (GET_CODE (x) == CONST_INT
4821 && SCALAR_INT_MODE_P (mode))
4823 imode = mode;
4824 ival = INTVAL (x);
4826 else
4827 return false;
4829 /* use a 64 bit mode for everything except for DI/DF mode, where we use
4830 a 128 bit vector mode. */
4831 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4833 vmode = aarch64_simd_container_mode (imode, width);
4834 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4836 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4840 /* Return the fixed registers used for condition codes. */
4842 static bool
4843 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4845 *p1 = CC_REGNUM;
4846 *p2 = INVALID_REGNUM;
4847 return true;
4850 /* This function is used by the call expanders of the machine description.
4851 RESULT is the register in which the result is returned. It's NULL for
4852 "call" and "sibcall".
4853 MEM is the location of the function call.
4854 SIBCALL indicates whether this function call is normal call or sibling call.
4855 It will generate different pattern accordingly. */
4857 void
4858 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4860 rtx call, callee, tmp;
4861 rtvec vec;
4862 machine_mode mode;
4864 gcc_assert (MEM_P (mem));
4865 callee = XEXP (mem, 0);
4866 mode = GET_MODE (callee);
4867 gcc_assert (mode == Pmode);
4869 /* Decide if we should generate indirect calls by loading the
4870 address of the callee into a register before performing
4871 the branch-and-link. */
4872 if (SYMBOL_REF_P (callee)
4873 ? (aarch64_is_long_call_p (callee)
4874 || aarch64_is_noplt_call_p (callee))
4875 : !REG_P (callee))
4876 XEXP (mem, 0) = force_reg (mode, callee);
4878 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4880 if (result != NULL_RTX)
4881 call = gen_rtx_SET (result, call);
4883 if (sibcall)
4884 tmp = ret_rtx;
4885 else
4886 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4888 vec = gen_rtvec (2, call, tmp);
4889 call = gen_rtx_PARALLEL (VOIDmode, vec);
4891 aarch64_emit_call_insn (call);
4894 /* Emit call insn with PAT and do aarch64-specific handling. */
4896 void
4897 aarch64_emit_call_insn (rtx pat)
4899 rtx insn = emit_call_insn (pat);
4901 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4902 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4903 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4906 machine_mode
4907 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4909 /* All floating point compares return CCFP if it is an equality
4910 comparison, and CCFPE otherwise. */
4911 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4913 switch (code)
4915 case EQ:
4916 case NE:
4917 case UNORDERED:
4918 case ORDERED:
4919 case UNLT:
4920 case UNLE:
4921 case UNGT:
4922 case UNGE:
4923 case UNEQ:
4924 case LTGT:
4925 return CCFPmode;
4927 case LT:
4928 case LE:
4929 case GT:
4930 case GE:
4931 return CCFPEmode;
4933 default:
4934 gcc_unreachable ();
4938 /* Equality comparisons of short modes against zero can be performed
4939 using the TST instruction with the appropriate bitmask. */
4940 if (y == const0_rtx && REG_P (x)
4941 && (code == EQ || code == NE)
4942 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4943 return CC_NZmode;
4945 /* Similarly, comparisons of zero_extends from shorter modes can
4946 be performed using an ANDS with an immediate mask. */
4947 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4948 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4949 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4950 && (code == EQ || code == NE))
4951 return CC_NZmode;
4953 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4954 && y == const0_rtx
4955 && (code == EQ || code == NE || code == LT || code == GE)
4956 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4957 || GET_CODE (x) == NEG
4958 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4959 && CONST_INT_P (XEXP (x, 2)))))
4960 return CC_NZmode;
4962 /* A compare with a shifted operand. Because of canonicalization,
4963 the comparison will have to be swapped when we emit the assembly
4964 code. */
4965 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4966 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4967 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4968 || GET_CODE (x) == LSHIFTRT
4969 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4970 return CC_SWPmode;
4972 /* Similarly for a negated operand, but we can only do this for
4973 equalities. */
4974 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4975 && (REG_P (y) || GET_CODE (y) == SUBREG)
4976 && (code == EQ || code == NE)
4977 && GET_CODE (x) == NEG)
4978 return CC_Zmode;
4980 /* A test for unsigned overflow. */
4981 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4982 && code == NE
4983 && GET_CODE (x) == PLUS
4984 && GET_CODE (y) == ZERO_EXTEND)
4985 return CC_Cmode;
4987 /* For everything else, return CCmode. */
4988 return CCmode;
4991 static int
4992 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4995 aarch64_get_condition_code (rtx x)
4997 machine_mode mode = GET_MODE (XEXP (x, 0));
4998 enum rtx_code comp_code = GET_CODE (x);
5000 if (GET_MODE_CLASS (mode) != MODE_CC)
5001 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5002 return aarch64_get_condition_code_1 (mode, comp_code);
5005 static int
5006 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5008 switch (mode)
5010 case E_CCFPmode:
5011 case E_CCFPEmode:
5012 switch (comp_code)
5014 case GE: return AARCH64_GE;
5015 case GT: return AARCH64_GT;
5016 case LE: return AARCH64_LS;
5017 case LT: return AARCH64_MI;
5018 case NE: return AARCH64_NE;
5019 case EQ: return AARCH64_EQ;
5020 case ORDERED: return AARCH64_VC;
5021 case UNORDERED: return AARCH64_VS;
5022 case UNLT: return AARCH64_LT;
5023 case UNLE: return AARCH64_LE;
5024 case UNGT: return AARCH64_HI;
5025 case UNGE: return AARCH64_PL;
5026 default: return -1;
5028 break;
5030 case E_CCmode:
5031 switch (comp_code)
5033 case NE: return AARCH64_NE;
5034 case EQ: return AARCH64_EQ;
5035 case GE: return AARCH64_GE;
5036 case GT: return AARCH64_GT;
5037 case LE: return AARCH64_LE;
5038 case LT: return AARCH64_LT;
5039 case GEU: return AARCH64_CS;
5040 case GTU: return AARCH64_HI;
5041 case LEU: return AARCH64_LS;
5042 case LTU: return AARCH64_CC;
5043 default: return -1;
5045 break;
5047 case E_CC_SWPmode:
5048 switch (comp_code)
5050 case NE: return AARCH64_NE;
5051 case EQ: return AARCH64_EQ;
5052 case GE: return AARCH64_LE;
5053 case GT: return AARCH64_LT;
5054 case LE: return AARCH64_GE;
5055 case LT: return AARCH64_GT;
5056 case GEU: return AARCH64_LS;
5057 case GTU: return AARCH64_CC;
5058 case LEU: return AARCH64_CS;
5059 case LTU: return AARCH64_HI;
5060 default: return -1;
5062 break;
5064 case E_CC_NZmode:
5065 switch (comp_code)
5067 case NE: return AARCH64_NE;
5068 case EQ: return AARCH64_EQ;
5069 case GE: return AARCH64_PL;
5070 case LT: return AARCH64_MI;
5071 default: return -1;
5073 break;
5075 case E_CC_Zmode:
5076 switch (comp_code)
5078 case NE: return AARCH64_NE;
5079 case EQ: return AARCH64_EQ;
5080 default: return -1;
5082 break;
5084 case E_CC_Cmode:
5085 switch (comp_code)
5087 case NE: return AARCH64_CS;
5088 case EQ: return AARCH64_CC;
5089 default: return -1;
5091 break;
5093 default:
5094 return -1;
5097 return -1;
5100 bool
5101 aarch64_const_vec_all_same_in_range_p (rtx x,
5102 HOST_WIDE_INT minval,
5103 HOST_WIDE_INT maxval)
5105 HOST_WIDE_INT firstval;
5106 int count, i;
5108 if (GET_CODE (x) != CONST_VECTOR
5109 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5110 return false;
5112 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5113 if (firstval < minval || firstval > maxval)
5114 return false;
5116 count = CONST_VECTOR_NUNITS (x);
5117 for (i = 1; i < count; i++)
5118 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5119 return false;
5121 return true;
5124 bool
5125 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5127 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5131 /* N Z C V. */
5132 #define AARCH64_CC_V 1
5133 #define AARCH64_CC_C (1 << 1)
5134 #define AARCH64_CC_Z (1 << 2)
5135 #define AARCH64_CC_N (1 << 3)
5137 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5138 static const int aarch64_nzcv_codes[] =
5140 0, /* EQ, Z == 1. */
5141 AARCH64_CC_Z, /* NE, Z == 0. */
5142 0, /* CS, C == 1. */
5143 AARCH64_CC_C, /* CC, C == 0. */
5144 0, /* MI, N == 1. */
5145 AARCH64_CC_N, /* PL, N == 0. */
5146 0, /* VS, V == 1. */
5147 AARCH64_CC_V, /* VC, V == 0. */
5148 0, /* HI, C ==1 && Z == 0. */
5149 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5150 AARCH64_CC_V, /* GE, N == V. */
5151 0, /* LT, N != V. */
5152 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5153 0, /* LE, !(Z == 0 && N == V). */
5154 0, /* AL, Any. */
5155 0 /* NV, Any. */
5158 /* Print operand X to file F in a target specific manner according to CODE.
5159 The acceptable formatting commands given by CODE are:
5160 'c': An integer or symbol address without a preceding #
5161 sign.
5162 'e': Print the sign/zero-extend size as a character 8->b,
5163 16->h, 32->w.
5164 'p': Prints N such that 2^N == X (X must be power of 2 and
5165 const int).
5166 'P': Print the number of non-zero bits in X (a const_int).
5167 'H': Print the higher numbered register of a pair (TImode)
5168 of regs.
5169 'm': Print a condition (eq, ne, etc).
5170 'M': Same as 'm', but invert condition.
5171 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5172 'S/T/U/V': Print a FP/SIMD register name for a register list.
5173 The register printed is the FP/SIMD register name
5174 of X + 0/1/2/3 for S/T/U/V.
5175 'R': Print a scalar FP/SIMD register name + 1.
5176 'X': Print bottom 16 bits of integer constant in hex.
5177 'w/x': Print a general register name or the zero register
5178 (32-bit or 64-bit).
5179 '0': Print a normal operand, if it's a general register,
5180 then we assume DImode.
5181 'k': Print NZCV for conditional compare instructions.
5182 'A': Output address constant representing the first
5183 argument of X, specifying a relocation offset
5184 if appropriate.
5185 'L': Output constant address specified by X
5186 with a relocation offset if appropriate.
5187 'G': Prints address of X, specifying a PC relative
5188 relocation mode if appropriate. */
5190 static void
5191 aarch64_print_operand (FILE *f, rtx x, int code)
5193 switch (code)
5195 case 'c':
5196 switch (GET_CODE (x))
5198 case CONST_INT:
5199 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5200 break;
5202 case SYMBOL_REF:
5203 output_addr_const (f, x);
5204 break;
5206 case CONST:
5207 if (GET_CODE (XEXP (x, 0)) == PLUS
5208 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5210 output_addr_const (f, x);
5211 break;
5213 /* Fall through. */
5215 default:
5216 output_operand_lossage ("Unsupported operand for code '%c'", code);
5218 break;
5220 case 'e':
5222 int n;
5224 if (!CONST_INT_P (x)
5225 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5227 output_operand_lossage ("invalid operand for '%%%c'", code);
5228 return;
5231 switch (n)
5233 case 3:
5234 fputc ('b', f);
5235 break;
5236 case 4:
5237 fputc ('h', f);
5238 break;
5239 case 5:
5240 fputc ('w', f);
5241 break;
5242 default:
5243 output_operand_lossage ("invalid operand for '%%%c'", code);
5244 return;
5247 break;
5249 case 'p':
5251 int n;
5253 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5255 output_operand_lossage ("invalid operand for '%%%c'", code);
5256 return;
5259 asm_fprintf (f, "%d", n);
5261 break;
5263 case 'P':
5264 if (!CONST_INT_P (x))
5266 output_operand_lossage ("invalid operand for '%%%c'", code);
5267 return;
5270 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5271 break;
5273 case 'H':
5274 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5276 output_operand_lossage ("invalid operand for '%%%c'", code);
5277 return;
5280 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5281 break;
5283 case 'M':
5284 case 'm':
5286 int cond_code;
5287 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5288 if (x == const_true_rtx)
5290 if (code == 'M')
5291 fputs ("nv", f);
5292 return;
5295 if (!COMPARISON_P (x))
5297 output_operand_lossage ("invalid operand for '%%%c'", code);
5298 return;
5301 cond_code = aarch64_get_condition_code (x);
5302 gcc_assert (cond_code >= 0);
5303 if (code == 'M')
5304 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5305 fputs (aarch64_condition_codes[cond_code], f);
5307 break;
5309 case 'b':
5310 case 'h':
5311 case 's':
5312 case 'd':
5313 case 'q':
5314 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5316 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5317 return;
5319 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5320 break;
5322 case 'S':
5323 case 'T':
5324 case 'U':
5325 case 'V':
5326 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5328 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5329 return;
5331 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5332 break;
5334 case 'R':
5335 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5337 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5338 return;
5340 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5341 break;
5343 case 'X':
5344 if (!CONST_INT_P (x))
5346 output_operand_lossage ("invalid operand for '%%%c'", code);
5347 return;
5349 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5350 break;
5352 case 'w':
5353 case 'x':
5354 if (x == const0_rtx
5355 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5357 asm_fprintf (f, "%czr", code);
5358 break;
5361 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5363 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5364 break;
5367 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5369 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5370 break;
5373 /* Fall through */
5375 case 0:
5376 if (x == NULL)
5378 output_operand_lossage ("missing operand");
5379 return;
5382 switch (GET_CODE (x))
5384 case REG:
5385 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5386 break;
5388 case MEM:
5389 output_address (GET_MODE (x), XEXP (x, 0));
5390 /* Check all memory references are Pmode - even with ILP32. */
5391 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5392 break;
5394 case CONST:
5395 case LABEL_REF:
5396 case SYMBOL_REF:
5397 output_addr_const (asm_out_file, x);
5398 break;
5400 case CONST_INT:
5401 asm_fprintf (f, "%wd", INTVAL (x));
5402 break;
5404 case CONST_VECTOR:
5405 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5407 gcc_assert (
5408 aarch64_const_vec_all_same_in_range_p (x,
5409 HOST_WIDE_INT_MIN,
5410 HOST_WIDE_INT_MAX));
5411 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5413 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5415 fputc ('0', f);
5417 else
5418 gcc_unreachable ();
5419 break;
5421 case CONST_DOUBLE:
5422 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5423 be getting CONST_DOUBLEs holding integers. */
5424 gcc_assert (GET_MODE (x) != VOIDmode);
5425 if (aarch64_float_const_zero_rtx_p (x))
5427 fputc ('0', f);
5428 break;
5430 else if (aarch64_float_const_representable_p (x))
5432 #define buf_size 20
5433 char float_buf[buf_size] = {'\0'};
5434 real_to_decimal_for_mode (float_buf,
5435 CONST_DOUBLE_REAL_VALUE (x),
5436 buf_size, buf_size,
5437 1, GET_MODE (x));
5438 asm_fprintf (asm_out_file, "%s", float_buf);
5439 break;
5440 #undef buf_size
5442 output_operand_lossage ("invalid constant");
5443 return;
5444 default:
5445 output_operand_lossage ("invalid operand");
5446 return;
5448 break;
5450 case 'A':
5451 if (GET_CODE (x) == HIGH)
5452 x = XEXP (x, 0);
5454 switch (aarch64_classify_symbolic_expression (x))
5456 case SYMBOL_SMALL_GOT_4G:
5457 asm_fprintf (asm_out_file, ":got:");
5458 break;
5460 case SYMBOL_SMALL_TLSGD:
5461 asm_fprintf (asm_out_file, ":tlsgd:");
5462 break;
5464 case SYMBOL_SMALL_TLSDESC:
5465 asm_fprintf (asm_out_file, ":tlsdesc:");
5466 break;
5468 case SYMBOL_SMALL_TLSIE:
5469 asm_fprintf (asm_out_file, ":gottprel:");
5470 break;
5472 case SYMBOL_TLSLE24:
5473 asm_fprintf (asm_out_file, ":tprel:");
5474 break;
5476 case SYMBOL_TINY_GOT:
5477 gcc_unreachable ();
5478 break;
5480 default:
5481 break;
5483 output_addr_const (asm_out_file, x);
5484 break;
5486 case 'L':
5487 switch (aarch64_classify_symbolic_expression (x))
5489 case SYMBOL_SMALL_GOT_4G:
5490 asm_fprintf (asm_out_file, ":lo12:");
5491 break;
5493 case SYMBOL_SMALL_TLSGD:
5494 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5495 break;
5497 case SYMBOL_SMALL_TLSDESC:
5498 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5499 break;
5501 case SYMBOL_SMALL_TLSIE:
5502 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5503 break;
5505 case SYMBOL_TLSLE12:
5506 asm_fprintf (asm_out_file, ":tprel_lo12:");
5507 break;
5509 case SYMBOL_TLSLE24:
5510 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5511 break;
5513 case SYMBOL_TINY_GOT:
5514 asm_fprintf (asm_out_file, ":got:");
5515 break;
5517 case SYMBOL_TINY_TLSIE:
5518 asm_fprintf (asm_out_file, ":gottprel:");
5519 break;
5521 default:
5522 break;
5524 output_addr_const (asm_out_file, x);
5525 break;
5527 case 'G':
5528 switch (aarch64_classify_symbolic_expression (x))
5530 case SYMBOL_TLSLE24:
5531 asm_fprintf (asm_out_file, ":tprel_hi12:");
5532 break;
5533 default:
5534 break;
5536 output_addr_const (asm_out_file, x);
5537 break;
5539 case 'k':
5541 HOST_WIDE_INT cond_code;
5543 if (!CONST_INT_P (x))
5545 output_operand_lossage ("invalid operand for '%%%c'", code);
5546 return;
5549 cond_code = INTVAL (x);
5550 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5551 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5553 break;
5555 default:
5556 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5557 return;
5561 static void
5562 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5564 struct aarch64_address_info addr;
5566 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5567 switch (addr.type)
5569 case ADDRESS_REG_IMM:
5570 if (addr.offset == const0_rtx)
5571 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5572 else
5573 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5574 INTVAL (addr.offset));
5575 return;
5577 case ADDRESS_REG_REG:
5578 if (addr.shift == 0)
5579 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5580 reg_names [REGNO (addr.offset)]);
5581 else
5582 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5583 reg_names [REGNO (addr.offset)], addr.shift);
5584 return;
5586 case ADDRESS_REG_UXTW:
5587 if (addr.shift == 0)
5588 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5589 REGNO (addr.offset) - R0_REGNUM);
5590 else
5591 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5592 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5593 return;
5595 case ADDRESS_REG_SXTW:
5596 if (addr.shift == 0)
5597 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5598 REGNO (addr.offset) - R0_REGNUM);
5599 else
5600 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5601 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5602 return;
5604 case ADDRESS_REG_WB:
5605 switch (GET_CODE (x))
5607 case PRE_INC:
5608 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5609 GET_MODE_SIZE (mode));
5610 return;
5611 case POST_INC:
5612 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5613 GET_MODE_SIZE (mode));
5614 return;
5615 case PRE_DEC:
5616 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5617 GET_MODE_SIZE (mode));
5618 return;
5619 case POST_DEC:
5620 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5621 GET_MODE_SIZE (mode));
5622 return;
5623 case PRE_MODIFY:
5624 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5625 INTVAL (addr.offset));
5626 return;
5627 case POST_MODIFY:
5628 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5629 INTVAL (addr.offset));
5630 return;
5631 default:
5632 break;
5634 break;
5636 case ADDRESS_LO_SUM:
5637 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5638 output_addr_const (f, addr.offset);
5639 asm_fprintf (f, "]");
5640 return;
5642 case ADDRESS_SYMBOLIC:
5643 break;
5646 output_addr_const (f, x);
5649 bool
5650 aarch64_label_mentioned_p (rtx x)
5652 const char *fmt;
5653 int i;
5655 if (GET_CODE (x) == LABEL_REF)
5656 return true;
5658 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5659 referencing instruction, but they are constant offsets, not
5660 symbols. */
5661 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5662 return false;
5664 fmt = GET_RTX_FORMAT (GET_CODE (x));
5665 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5667 if (fmt[i] == 'E')
5669 int j;
5671 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5672 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5673 return 1;
5675 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5676 return 1;
5679 return 0;
5682 /* Implement REGNO_REG_CLASS. */
5684 enum reg_class
5685 aarch64_regno_regclass (unsigned regno)
5687 if (GP_REGNUM_P (regno))
5688 return GENERAL_REGS;
5690 if (regno == SP_REGNUM)
5691 return STACK_REG;
5693 if (regno == FRAME_POINTER_REGNUM
5694 || regno == ARG_POINTER_REGNUM)
5695 return POINTER_REGS;
5697 if (FP_REGNUM_P (regno))
5698 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5700 return NO_REGS;
5703 static rtx
5704 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5706 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5707 where mask is selected by alignment and size of the offset.
5708 We try to pick as large a range for the offset as possible to
5709 maximize the chance of a CSE. However, for aligned addresses
5710 we limit the range to 4k so that structures with different sized
5711 elements are likely to use the same base. We need to be careful
5712 not to split a CONST for some forms of address expression, otherwise
5713 it will generate sub-optimal code. */
5715 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5717 rtx base = XEXP (x, 0);
5718 rtx offset_rtx = XEXP (x, 1);
5719 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5721 if (GET_CODE (base) == PLUS)
5723 rtx op0 = XEXP (base, 0);
5724 rtx op1 = XEXP (base, 1);
5726 /* Force any scaling into a temp for CSE. */
5727 op0 = force_reg (Pmode, op0);
5728 op1 = force_reg (Pmode, op1);
5730 /* Let the pointer register be in op0. */
5731 if (REG_POINTER (op1))
5732 std::swap (op0, op1);
5734 /* If the pointer is virtual or frame related, then we know that
5735 virtual register instantiation or register elimination is going
5736 to apply a second constant. We want the two constants folded
5737 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5738 if (virt_or_elim_regno_p (REGNO (op0)))
5740 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5741 NULL_RTX, true, OPTAB_DIRECT);
5742 return gen_rtx_PLUS (Pmode, base, op1);
5745 /* Otherwise, in order to encourage CSE (and thence loop strength
5746 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5747 base = expand_binop (Pmode, add_optab, op0, op1,
5748 NULL_RTX, true, OPTAB_DIRECT);
5749 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5752 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5753 HOST_WIDE_INT base_offset;
5754 if (GET_MODE_SIZE (mode) > 16)
5755 base_offset = (offset + 0x400) & ~0x7f0;
5756 /* For offsets aren't a multiple of the access size, the limit is
5757 -256...255. */
5758 else if (offset & (GET_MODE_SIZE (mode) - 1))
5760 base_offset = (offset + 0x100) & ~0x1ff;
5762 /* BLKmode typically uses LDP of X-registers. */
5763 if (mode == BLKmode)
5764 base_offset = (offset + 512) & ~0x3ff;
5766 /* Small negative offsets are supported. */
5767 else if (IN_RANGE (offset, -256, 0))
5768 base_offset = 0;
5769 else if (mode == TImode || mode == TFmode)
5770 base_offset = (offset + 0x100) & ~0x1ff;
5771 /* Use 12-bit offset by access size. */
5772 else
5773 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5775 if (base_offset != 0)
5777 base = plus_constant (Pmode, base, base_offset);
5778 base = force_operand (base, NULL_RTX);
5779 return plus_constant (Pmode, base, offset - base_offset);
5783 return x;
5786 /* Return the reload icode required for a constant pool in mode. */
5787 static enum insn_code
5788 aarch64_constant_pool_reload_icode (machine_mode mode)
5790 switch (mode)
5792 case E_SFmode:
5793 return CODE_FOR_aarch64_reload_movcpsfdi;
5795 case E_DFmode:
5796 return CODE_FOR_aarch64_reload_movcpdfdi;
5798 case E_TFmode:
5799 return CODE_FOR_aarch64_reload_movcptfdi;
5801 case E_V8QImode:
5802 return CODE_FOR_aarch64_reload_movcpv8qidi;
5804 case E_V16QImode:
5805 return CODE_FOR_aarch64_reload_movcpv16qidi;
5807 case E_V4HImode:
5808 return CODE_FOR_aarch64_reload_movcpv4hidi;
5810 case E_V8HImode:
5811 return CODE_FOR_aarch64_reload_movcpv8hidi;
5813 case E_V2SImode:
5814 return CODE_FOR_aarch64_reload_movcpv2sidi;
5816 case E_V4SImode:
5817 return CODE_FOR_aarch64_reload_movcpv4sidi;
5819 case E_V2DImode:
5820 return CODE_FOR_aarch64_reload_movcpv2didi;
5822 case E_V2DFmode:
5823 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5825 default:
5826 gcc_unreachable ();
5829 gcc_unreachable ();
5831 static reg_class_t
5832 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5833 reg_class_t rclass,
5834 machine_mode mode,
5835 secondary_reload_info *sri)
5838 /* If we have to disable direct literal pool loads and stores because the
5839 function is too big, then we need a scratch register. */
5840 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5841 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5842 || targetm.vector_mode_supported_p (GET_MODE (x)))
5843 && !aarch64_pcrelative_literal_loads)
5845 sri->icode = aarch64_constant_pool_reload_icode (mode);
5846 return NO_REGS;
5849 /* Without the TARGET_SIMD instructions we cannot move a Q register
5850 to a Q register directly. We need a scratch. */
5851 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5852 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5853 && reg_class_subset_p (rclass, FP_REGS))
5855 if (mode == TFmode)
5856 sri->icode = CODE_FOR_aarch64_reload_movtf;
5857 else if (mode == TImode)
5858 sri->icode = CODE_FOR_aarch64_reload_movti;
5859 return NO_REGS;
5862 /* A TFmode or TImode memory access should be handled via an FP_REGS
5863 because AArch64 has richer addressing modes for LDR/STR instructions
5864 than LDP/STP instructions. */
5865 if (TARGET_FLOAT && rclass == GENERAL_REGS
5866 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5867 return FP_REGS;
5869 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5870 return GENERAL_REGS;
5872 return NO_REGS;
5875 static bool
5876 aarch64_can_eliminate (const int from, const int to)
5878 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5879 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5881 if (frame_pointer_needed)
5883 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5884 return true;
5885 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5886 return false;
5887 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5888 && !cfun->calls_alloca)
5889 return true;
5890 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5891 return true;
5893 return false;
5895 else
5897 /* If we decided that we didn't need a leaf frame pointer but then used
5898 LR in the function, then we'll want a frame pointer after all, so
5899 prevent this elimination to ensure a frame pointer is used. */
5900 if (to == STACK_POINTER_REGNUM
5901 && flag_omit_leaf_frame_pointer
5902 && df_regs_ever_live_p (LR_REGNUM))
5903 return false;
5906 return true;
5909 HOST_WIDE_INT
5910 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5912 aarch64_layout_frame ();
5914 if (to == HARD_FRAME_POINTER_REGNUM)
5916 if (from == ARG_POINTER_REGNUM)
5917 return cfun->machine->frame.hard_fp_offset;
5919 if (from == FRAME_POINTER_REGNUM)
5920 return cfun->machine->frame.hard_fp_offset
5921 - cfun->machine->frame.locals_offset;
5924 if (to == STACK_POINTER_REGNUM)
5926 if (from == FRAME_POINTER_REGNUM)
5927 return cfun->machine->frame.frame_size
5928 - cfun->machine->frame.locals_offset;
5931 return cfun->machine->frame.frame_size;
5934 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5935 previous frame. */
5938 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5940 if (count != 0)
5941 return const0_rtx;
5942 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5946 static void
5947 aarch64_asm_trampoline_template (FILE *f)
5949 if (TARGET_ILP32)
5951 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5952 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5954 else
5956 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5957 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5959 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5960 assemble_aligned_integer (4, const0_rtx);
5961 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5962 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5965 static void
5966 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5968 rtx fnaddr, mem, a_tramp;
5969 const int tramp_code_sz = 16;
5971 /* Don't need to copy the trailing D-words, we fill those in below. */
5972 emit_block_move (m_tramp, assemble_trampoline_template (),
5973 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5974 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5975 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5976 if (GET_MODE (fnaddr) != ptr_mode)
5977 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5978 emit_move_insn (mem, fnaddr);
5980 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5981 emit_move_insn (mem, chain_value);
5983 /* XXX We should really define a "clear_cache" pattern and use
5984 gen_clear_cache(). */
5985 a_tramp = XEXP (m_tramp, 0);
5986 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5987 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
5988 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5989 ptr_mode);
5992 static unsigned char
5993 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5995 switch (regclass)
5997 case CALLER_SAVE_REGS:
5998 case POINTER_REGS:
5999 case GENERAL_REGS:
6000 case ALL_REGS:
6001 case FP_REGS:
6002 case FP_LO_REGS:
6003 return
6004 aarch64_vector_mode_p (mode)
6005 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6006 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6007 case STACK_REG:
6008 return 1;
6010 case NO_REGS:
6011 return 0;
6013 default:
6014 break;
6016 gcc_unreachable ();
6019 static reg_class_t
6020 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6022 if (regclass == POINTER_REGS)
6023 return GENERAL_REGS;
6025 if (regclass == STACK_REG)
6027 if (REG_P(x)
6028 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6029 return regclass;
6031 return NO_REGS;
6034 /* Register eliminiation can result in a request for
6035 SP+constant->FP_REGS. We cannot support such operations which
6036 use SP as source and an FP_REG as destination, so reject out
6037 right now. */
6038 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6040 rtx lhs = XEXP (x, 0);
6042 /* Look through a possible SUBREG introduced by ILP32. */
6043 if (GET_CODE (lhs) == SUBREG)
6044 lhs = SUBREG_REG (lhs);
6046 gcc_assert (REG_P (lhs));
6047 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6048 POINTER_REGS));
6049 return NO_REGS;
6052 return regclass;
6055 void
6056 aarch64_asm_output_labelref (FILE* f, const char *name)
6058 asm_fprintf (f, "%U%s", name);
6061 static void
6062 aarch64_elf_asm_constructor (rtx symbol, int priority)
6064 if (priority == DEFAULT_INIT_PRIORITY)
6065 default_ctor_section_asm_out_constructor (symbol, priority);
6066 else
6068 section *s;
6069 /* While priority is known to be in range [0, 65535], so 18 bytes
6070 would be enough, the compiler might not know that. To avoid
6071 -Wformat-truncation false positive, use a larger size. */
6072 char buf[23];
6073 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6074 s = get_section (buf, SECTION_WRITE, NULL);
6075 switch_to_section (s);
6076 assemble_align (POINTER_SIZE);
6077 assemble_aligned_integer (POINTER_BYTES, symbol);
6081 static void
6082 aarch64_elf_asm_destructor (rtx symbol, int priority)
6084 if (priority == DEFAULT_INIT_PRIORITY)
6085 default_dtor_section_asm_out_destructor (symbol, priority);
6086 else
6088 section *s;
6089 /* While priority is known to be in range [0, 65535], so 18 bytes
6090 would be enough, the compiler might not know that. To avoid
6091 -Wformat-truncation false positive, use a larger size. */
6092 char buf[23];
6093 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6094 s = get_section (buf, SECTION_WRITE, NULL);
6095 switch_to_section (s);
6096 assemble_align (POINTER_SIZE);
6097 assemble_aligned_integer (POINTER_BYTES, symbol);
6101 const char*
6102 aarch64_output_casesi (rtx *operands)
6104 char buf[100];
6105 char label[100];
6106 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6107 int index;
6108 static const char *const patterns[4][2] =
6111 "ldrb\t%w3, [%0,%w1,uxtw]",
6112 "add\t%3, %4, %w3, sxtb #2"
6115 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6116 "add\t%3, %4, %w3, sxth #2"
6119 "ldr\t%w3, [%0,%w1,uxtw #2]",
6120 "add\t%3, %4, %w3, sxtw #2"
6122 /* We assume that DImode is only generated when not optimizing and
6123 that we don't really need 64-bit address offsets. That would
6124 imply an object file with 8GB of code in a single function! */
6126 "ldr\t%w3, [%0,%w1,uxtw #2]",
6127 "add\t%3, %4, %w3, sxtw #2"
6131 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6133 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6135 gcc_assert (index >= 0 && index <= 3);
6137 /* Need to implement table size reduction, by chaning the code below. */
6138 output_asm_insn (patterns[index][0], operands);
6139 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6140 snprintf (buf, sizeof (buf),
6141 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6142 output_asm_insn (buf, operands);
6143 output_asm_insn (patterns[index][1], operands);
6144 output_asm_insn ("br\t%3", operands);
6145 assemble_label (asm_out_file, label);
6146 return "";
6150 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6151 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6152 operator. */
6155 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6157 if (shift >= 0 && shift <= 3)
6159 int size;
6160 for (size = 8; size <= 32; size *= 2)
6162 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6163 if (mask == bits << shift)
6164 return size;
6167 return 0;
6170 /* Constant pools are per function only when PC relative
6171 literal loads are true or we are in the large memory
6172 model. */
6174 static inline bool
6175 aarch64_can_use_per_function_literal_pools_p (void)
6177 return (aarch64_pcrelative_literal_loads
6178 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6181 static bool
6182 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6184 /* Fixme:: In an ideal world this would work similar
6185 to the logic in aarch64_select_rtx_section but this
6186 breaks bootstrap in gcc go. For now we workaround
6187 this by returning false here. */
6188 return false;
6191 /* Select appropriate section for constants depending
6192 on where we place literal pools. */
6194 static section *
6195 aarch64_select_rtx_section (machine_mode mode,
6196 rtx x,
6197 unsigned HOST_WIDE_INT align)
6199 if (aarch64_can_use_per_function_literal_pools_p ())
6200 return function_section (current_function_decl);
6202 return default_elf_select_rtx_section (mode, x, align);
6205 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6206 void
6207 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6208 HOST_WIDE_INT offset)
6210 /* When using per-function literal pools, we must ensure that any code
6211 section is aligned to the minimal instruction length, lest we get
6212 errors from the assembler re "unaligned instructions". */
6213 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6214 ASM_OUTPUT_ALIGN (f, 2);
6217 /* Costs. */
6219 /* Helper function for rtx cost calculation. Strip a shift expression
6220 from X. Returns the inner operand if successful, or the original
6221 expression on failure. */
6222 static rtx
6223 aarch64_strip_shift (rtx x)
6225 rtx op = x;
6227 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6228 we can convert both to ROR during final output. */
6229 if ((GET_CODE (op) == ASHIFT
6230 || GET_CODE (op) == ASHIFTRT
6231 || GET_CODE (op) == LSHIFTRT
6232 || GET_CODE (op) == ROTATERT
6233 || GET_CODE (op) == ROTATE)
6234 && CONST_INT_P (XEXP (op, 1)))
6235 return XEXP (op, 0);
6237 if (GET_CODE (op) == MULT
6238 && CONST_INT_P (XEXP (op, 1))
6239 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6240 return XEXP (op, 0);
6242 return x;
6245 /* Helper function for rtx cost calculation. Strip an extend
6246 expression from X. Returns the inner operand if successful, or the
6247 original expression on failure. We deal with a number of possible
6248 canonicalization variations here. If STRIP_SHIFT is true, then
6249 we can strip off a shift also. */
6250 static rtx
6251 aarch64_strip_extend (rtx x, bool strip_shift)
6253 rtx op = x;
6255 /* Zero and sign extraction of a widened value. */
6256 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6257 && XEXP (op, 2) == const0_rtx
6258 && GET_CODE (XEXP (op, 0)) == MULT
6259 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6260 XEXP (op, 1)))
6261 return XEXP (XEXP (op, 0), 0);
6263 /* It can also be represented (for zero-extend) as an AND with an
6264 immediate. */
6265 if (GET_CODE (op) == AND
6266 && GET_CODE (XEXP (op, 0)) == MULT
6267 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6268 && CONST_INT_P (XEXP (op, 1))
6269 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6270 INTVAL (XEXP (op, 1))) != 0)
6271 return XEXP (XEXP (op, 0), 0);
6273 /* Now handle extended register, as this may also have an optional
6274 left shift by 1..4. */
6275 if (strip_shift
6276 && GET_CODE (op) == ASHIFT
6277 && CONST_INT_P (XEXP (op, 1))
6278 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6279 op = XEXP (op, 0);
6281 if (GET_CODE (op) == ZERO_EXTEND
6282 || GET_CODE (op) == SIGN_EXTEND)
6283 op = XEXP (op, 0);
6285 if (op != x)
6286 return op;
6288 return x;
6291 /* Return true iff CODE is a shift supported in combination
6292 with arithmetic instructions. */
6294 static bool
6295 aarch64_shift_p (enum rtx_code code)
6297 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6301 /* Return true iff X is a cheap shift without a sign extend. */
6303 static bool
6304 aarch64_cheap_mult_shift_p (rtx x)
6306 rtx op0, op1;
6308 op0 = XEXP (x, 0);
6309 op1 = XEXP (x, 1);
6311 if (!(aarch64_tune_params.extra_tuning_flags
6312 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6313 return false;
6315 if (GET_CODE (op0) == SIGN_EXTEND)
6316 return false;
6318 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6319 && UINTVAL (op1) <= 4)
6320 return true;
6322 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6323 return false;
6325 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6327 if (l2 > 0 && l2 <= 4)
6328 return true;
6330 return false;
6333 /* Helper function for rtx cost calculation. Calculate the cost of
6334 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6335 Return the calculated cost of the expression, recursing manually in to
6336 operands where needed. */
6338 static int
6339 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6341 rtx op0, op1;
6342 const struct cpu_cost_table *extra_cost
6343 = aarch64_tune_params.insn_extra_cost;
6344 int cost = 0;
6345 bool compound_p = (outer == PLUS || outer == MINUS);
6346 machine_mode mode = GET_MODE (x);
6348 gcc_checking_assert (code == MULT);
6350 op0 = XEXP (x, 0);
6351 op1 = XEXP (x, 1);
6353 if (VECTOR_MODE_P (mode))
6354 mode = GET_MODE_INNER (mode);
6356 /* Integer multiply/fma. */
6357 if (GET_MODE_CLASS (mode) == MODE_INT)
6359 /* The multiply will be canonicalized as a shift, cost it as such. */
6360 if (aarch64_shift_p (GET_CODE (x))
6361 || (CONST_INT_P (op1)
6362 && exact_log2 (INTVAL (op1)) > 0))
6364 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6365 || GET_CODE (op0) == SIGN_EXTEND;
6366 if (speed)
6368 if (compound_p)
6370 /* If the shift is considered cheap,
6371 then don't add any cost. */
6372 if (aarch64_cheap_mult_shift_p (x))
6374 else if (REG_P (op1))
6375 /* ARITH + shift-by-register. */
6376 cost += extra_cost->alu.arith_shift_reg;
6377 else if (is_extend)
6378 /* ARITH + extended register. We don't have a cost field
6379 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6380 cost += extra_cost->alu.extend_arith;
6381 else
6382 /* ARITH + shift-by-immediate. */
6383 cost += extra_cost->alu.arith_shift;
6385 else
6386 /* LSL (immediate). */
6387 cost += extra_cost->alu.shift;
6390 /* Strip extends as we will have costed them in the case above. */
6391 if (is_extend)
6392 op0 = aarch64_strip_extend (op0, true);
6394 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6396 return cost;
6399 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6400 compound and let the below cases handle it. After all, MNEG is a
6401 special-case alias of MSUB. */
6402 if (GET_CODE (op0) == NEG)
6404 op0 = XEXP (op0, 0);
6405 compound_p = true;
6408 /* Integer multiplies or FMAs have zero/sign extending variants. */
6409 if ((GET_CODE (op0) == ZERO_EXTEND
6410 && GET_CODE (op1) == ZERO_EXTEND)
6411 || (GET_CODE (op0) == SIGN_EXTEND
6412 && GET_CODE (op1) == SIGN_EXTEND))
6414 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6415 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6417 if (speed)
6419 if (compound_p)
6420 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6421 cost += extra_cost->mult[0].extend_add;
6422 else
6423 /* MUL/SMULL/UMULL. */
6424 cost += extra_cost->mult[0].extend;
6427 return cost;
6430 /* This is either an integer multiply or a MADD. In both cases
6431 we want to recurse and cost the operands. */
6432 cost += rtx_cost (op0, mode, MULT, 0, speed);
6433 cost += rtx_cost (op1, mode, MULT, 1, speed);
6435 if (speed)
6437 if (compound_p)
6438 /* MADD/MSUB. */
6439 cost += extra_cost->mult[mode == DImode].add;
6440 else
6441 /* MUL. */
6442 cost += extra_cost->mult[mode == DImode].simple;
6445 return cost;
6447 else
6449 if (speed)
6451 /* Floating-point FMA/FMUL can also support negations of the
6452 operands, unless the rounding mode is upward or downward in
6453 which case FNMUL is different than FMUL with operand negation. */
6454 bool neg0 = GET_CODE (op0) == NEG;
6455 bool neg1 = GET_CODE (op1) == NEG;
6456 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6458 if (neg0)
6459 op0 = XEXP (op0, 0);
6460 if (neg1)
6461 op1 = XEXP (op1, 0);
6464 if (compound_p)
6465 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6466 cost += extra_cost->fp[mode == DFmode].fma;
6467 else
6468 /* FMUL/FNMUL. */
6469 cost += extra_cost->fp[mode == DFmode].mult;
6472 cost += rtx_cost (op0, mode, MULT, 0, speed);
6473 cost += rtx_cost (op1, mode, MULT, 1, speed);
6474 return cost;
6478 static int
6479 aarch64_address_cost (rtx x,
6480 machine_mode mode,
6481 addr_space_t as ATTRIBUTE_UNUSED,
6482 bool speed)
6484 enum rtx_code c = GET_CODE (x);
6485 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6486 struct aarch64_address_info info;
6487 int cost = 0;
6488 info.shift = 0;
6490 if (!aarch64_classify_address (&info, x, mode, c, false))
6492 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6494 /* This is a CONST or SYMBOL ref which will be split
6495 in a different way depending on the code model in use.
6496 Cost it through the generic infrastructure. */
6497 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6498 /* Divide through by the cost of one instruction to
6499 bring it to the same units as the address costs. */
6500 cost_symbol_ref /= COSTS_N_INSNS (1);
6501 /* The cost is then the cost of preparing the address,
6502 followed by an immediate (possibly 0) offset. */
6503 return cost_symbol_ref + addr_cost->imm_offset;
6505 else
6507 /* This is most likely a jump table from a case
6508 statement. */
6509 return addr_cost->register_offset;
6513 switch (info.type)
6515 case ADDRESS_LO_SUM:
6516 case ADDRESS_SYMBOLIC:
6517 case ADDRESS_REG_IMM:
6518 cost += addr_cost->imm_offset;
6519 break;
6521 case ADDRESS_REG_WB:
6522 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6523 cost += addr_cost->pre_modify;
6524 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6525 cost += addr_cost->post_modify;
6526 else
6527 gcc_unreachable ();
6529 break;
6531 case ADDRESS_REG_REG:
6532 cost += addr_cost->register_offset;
6533 break;
6535 case ADDRESS_REG_SXTW:
6536 cost += addr_cost->register_sextend;
6537 break;
6539 case ADDRESS_REG_UXTW:
6540 cost += addr_cost->register_zextend;
6541 break;
6543 default:
6544 gcc_unreachable ();
6548 if (info.shift > 0)
6550 /* For the sake of calculating the cost of the shifted register
6551 component, we can treat same sized modes in the same way. */
6552 switch (GET_MODE_BITSIZE (mode))
6554 case 16:
6555 cost += addr_cost->addr_scale_costs.hi;
6556 break;
6558 case 32:
6559 cost += addr_cost->addr_scale_costs.si;
6560 break;
6562 case 64:
6563 cost += addr_cost->addr_scale_costs.di;
6564 break;
6566 /* We can't tell, or this is a 128-bit vector. */
6567 default:
6568 cost += addr_cost->addr_scale_costs.ti;
6569 break;
6573 return cost;
6576 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6577 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6578 to be taken. */
6581 aarch64_branch_cost (bool speed_p, bool predictable_p)
6583 /* When optimizing for speed, use the cost of unpredictable branches. */
6584 const struct cpu_branch_cost *branch_costs =
6585 aarch64_tune_params.branch_costs;
6587 if (!speed_p || predictable_p)
6588 return branch_costs->predictable;
6589 else
6590 return branch_costs->unpredictable;
6593 /* Return true if the RTX X in mode MODE is a zero or sign extract
6594 usable in an ADD or SUB (extended register) instruction. */
6595 static bool
6596 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6598 /* Catch add with a sign extract.
6599 This is add_<optab><mode>_multp2. */
6600 if (GET_CODE (x) == SIGN_EXTRACT
6601 || GET_CODE (x) == ZERO_EXTRACT)
6603 rtx op0 = XEXP (x, 0);
6604 rtx op1 = XEXP (x, 1);
6605 rtx op2 = XEXP (x, 2);
6607 if (GET_CODE (op0) == MULT
6608 && CONST_INT_P (op1)
6609 && op2 == const0_rtx
6610 && CONST_INT_P (XEXP (op0, 1))
6611 && aarch64_is_extend_from_extract (mode,
6612 XEXP (op0, 1),
6613 op1))
6615 return true;
6618 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6619 No shift. */
6620 else if (GET_CODE (x) == SIGN_EXTEND
6621 || GET_CODE (x) == ZERO_EXTEND)
6622 return REG_P (XEXP (x, 0));
6624 return false;
6627 static bool
6628 aarch64_frint_unspec_p (unsigned int u)
6630 switch (u)
6632 case UNSPEC_FRINTZ:
6633 case UNSPEC_FRINTP:
6634 case UNSPEC_FRINTM:
6635 case UNSPEC_FRINTA:
6636 case UNSPEC_FRINTN:
6637 case UNSPEC_FRINTX:
6638 case UNSPEC_FRINTI:
6639 return true;
6641 default:
6642 return false;
6646 /* Return true iff X is an rtx that will match an extr instruction
6647 i.e. as described in the *extr<mode>5_insn family of patterns.
6648 OP0 and OP1 will be set to the operands of the shifts involved
6649 on success and will be NULL_RTX otherwise. */
6651 static bool
6652 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6654 rtx op0, op1;
6655 machine_mode mode = GET_MODE (x);
6657 *res_op0 = NULL_RTX;
6658 *res_op1 = NULL_RTX;
6660 if (GET_CODE (x) != IOR)
6661 return false;
6663 op0 = XEXP (x, 0);
6664 op1 = XEXP (x, 1);
6666 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6667 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6669 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6670 if (GET_CODE (op1) == ASHIFT)
6671 std::swap (op0, op1);
6673 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6674 return false;
6676 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6677 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6679 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6680 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6682 *res_op0 = XEXP (op0, 0);
6683 *res_op1 = XEXP (op1, 0);
6684 return true;
6688 return false;
6691 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6692 storing it in *COST. Result is true if the total cost of the operation
6693 has now been calculated. */
6694 static bool
6695 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6697 rtx inner;
6698 rtx comparator;
6699 enum rtx_code cmpcode;
6701 if (COMPARISON_P (op0))
6703 inner = XEXP (op0, 0);
6704 comparator = XEXP (op0, 1);
6705 cmpcode = GET_CODE (op0);
6707 else
6709 inner = op0;
6710 comparator = const0_rtx;
6711 cmpcode = NE;
6714 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6716 /* Conditional branch. */
6717 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6718 return true;
6719 else
6721 if (cmpcode == NE || cmpcode == EQ)
6723 if (comparator == const0_rtx)
6725 /* TBZ/TBNZ/CBZ/CBNZ. */
6726 if (GET_CODE (inner) == ZERO_EXTRACT)
6727 /* TBZ/TBNZ. */
6728 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6729 ZERO_EXTRACT, 0, speed);
6730 else
6731 /* CBZ/CBNZ. */
6732 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6734 return true;
6737 else if (cmpcode == LT || cmpcode == GE)
6739 /* TBZ/TBNZ. */
6740 if (comparator == const0_rtx)
6741 return true;
6745 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6747 /* CCMP. */
6748 if (GET_CODE (op1) == COMPARE)
6750 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6751 if (XEXP (op1, 1) == const0_rtx)
6752 *cost += 1;
6753 if (speed)
6755 machine_mode mode = GET_MODE (XEXP (op1, 0));
6756 const struct cpu_cost_table *extra_cost
6757 = aarch64_tune_params.insn_extra_cost;
6759 if (GET_MODE_CLASS (mode) == MODE_INT)
6760 *cost += extra_cost->alu.arith;
6761 else
6762 *cost += extra_cost->fp[mode == DFmode].compare;
6764 return true;
6767 /* It's a conditional operation based on the status flags,
6768 so it must be some flavor of CSEL. */
6770 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6771 if (GET_CODE (op1) == NEG
6772 || GET_CODE (op1) == NOT
6773 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6774 op1 = XEXP (op1, 0);
6775 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6777 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6778 op1 = XEXP (op1, 0);
6779 op2 = XEXP (op2, 0);
6782 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6783 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6784 return true;
6787 /* We don't know what this is, cost all operands. */
6788 return false;
6791 /* Check whether X is a bitfield operation of the form shift + extend that
6792 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6793 operand to which the bitfield operation is applied. Otherwise return
6794 NULL_RTX. */
6796 static rtx
6797 aarch64_extend_bitfield_pattern_p (rtx x)
6799 rtx_code outer_code = GET_CODE (x);
6800 machine_mode outer_mode = GET_MODE (x);
6802 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6803 && outer_mode != SImode && outer_mode != DImode)
6804 return NULL_RTX;
6806 rtx inner = XEXP (x, 0);
6807 rtx_code inner_code = GET_CODE (inner);
6808 machine_mode inner_mode = GET_MODE (inner);
6809 rtx op = NULL_RTX;
6811 switch (inner_code)
6813 case ASHIFT:
6814 if (CONST_INT_P (XEXP (inner, 1))
6815 && (inner_mode == QImode || inner_mode == HImode))
6816 op = XEXP (inner, 0);
6817 break;
6818 case LSHIFTRT:
6819 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6820 && (inner_mode == QImode || inner_mode == HImode))
6821 op = XEXP (inner, 0);
6822 break;
6823 case ASHIFTRT:
6824 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6825 && (inner_mode == QImode || inner_mode == HImode))
6826 op = XEXP (inner, 0);
6827 break;
6828 default:
6829 break;
6832 return op;
6835 /* Return true if the mask and a shift amount from an RTX of the form
6836 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6837 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6839 bool
6840 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6842 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6843 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6844 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6845 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6848 /* Calculate the cost of calculating X, storing it in *COST. Result
6849 is true if the total cost of the operation has now been calculated. */
6850 static bool
6851 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6852 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6854 rtx op0, op1, op2;
6855 const struct cpu_cost_table *extra_cost
6856 = aarch64_tune_params.insn_extra_cost;
6857 int code = GET_CODE (x);
6858 scalar_int_mode int_mode;
6860 /* By default, assume that everything has equivalent cost to the
6861 cheapest instruction. Any additional costs are applied as a delta
6862 above this default. */
6863 *cost = COSTS_N_INSNS (1);
6865 switch (code)
6867 case SET:
6868 /* The cost depends entirely on the operands to SET. */
6869 *cost = 0;
6870 op0 = SET_DEST (x);
6871 op1 = SET_SRC (x);
6873 switch (GET_CODE (op0))
6875 case MEM:
6876 if (speed)
6878 rtx address = XEXP (op0, 0);
6879 if (VECTOR_MODE_P (mode))
6880 *cost += extra_cost->ldst.storev;
6881 else if (GET_MODE_CLASS (mode) == MODE_INT)
6882 *cost += extra_cost->ldst.store;
6883 else if (mode == SFmode)
6884 *cost += extra_cost->ldst.storef;
6885 else if (mode == DFmode)
6886 *cost += extra_cost->ldst.stored;
6888 *cost +=
6889 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6890 0, speed));
6893 *cost += rtx_cost (op1, mode, SET, 1, speed);
6894 return true;
6896 case SUBREG:
6897 if (! REG_P (SUBREG_REG (op0)))
6898 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6900 /* Fall through. */
6901 case REG:
6902 /* The cost is one per vector-register copied. */
6903 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6905 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6906 / GET_MODE_SIZE (V4SImode);
6907 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6909 /* const0_rtx is in general free, but we will use an
6910 instruction to set a register to 0. */
6911 else if (REG_P (op1) || op1 == const0_rtx)
6913 /* The cost is 1 per register copied. */
6914 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6915 / UNITS_PER_WORD;
6916 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6918 else
6919 /* Cost is just the cost of the RHS of the set. */
6920 *cost += rtx_cost (op1, mode, SET, 1, speed);
6921 return true;
6923 case ZERO_EXTRACT:
6924 case SIGN_EXTRACT:
6925 /* Bit-field insertion. Strip any redundant widening of
6926 the RHS to meet the width of the target. */
6927 if (GET_CODE (op1) == SUBREG)
6928 op1 = SUBREG_REG (op1);
6929 if ((GET_CODE (op1) == ZERO_EXTEND
6930 || GET_CODE (op1) == SIGN_EXTEND)
6931 && CONST_INT_P (XEXP (op0, 1))
6932 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6933 >= INTVAL (XEXP (op0, 1))))
6934 op1 = XEXP (op1, 0);
6936 if (CONST_INT_P (op1))
6938 /* MOV immediate is assumed to always be cheap. */
6939 *cost = COSTS_N_INSNS (1);
6941 else
6943 /* BFM. */
6944 if (speed)
6945 *cost += extra_cost->alu.bfi;
6946 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6949 return true;
6951 default:
6952 /* We can't make sense of this, assume default cost. */
6953 *cost = COSTS_N_INSNS (1);
6954 return false;
6956 return false;
6958 case CONST_INT:
6959 /* If an instruction can incorporate a constant within the
6960 instruction, the instruction's expression avoids calling
6961 rtx_cost() on the constant. If rtx_cost() is called on a
6962 constant, then it is usually because the constant must be
6963 moved into a register by one or more instructions.
6965 The exception is constant 0, which can be expressed
6966 as XZR/WZR and is therefore free. The exception to this is
6967 if we have (set (reg) (const0_rtx)) in which case we must cost
6968 the move. However, we can catch that when we cost the SET, so
6969 we don't need to consider that here. */
6970 if (x == const0_rtx)
6971 *cost = 0;
6972 else
6974 /* To an approximation, building any other constant is
6975 proportionally expensive to the number of instructions
6976 required to build that constant. This is true whether we
6977 are compiling for SPEED or otherwise. */
6978 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6979 (NULL_RTX, x, false, mode));
6981 return true;
6983 case CONST_DOUBLE:
6985 /* First determine number of instructions to do the move
6986 as an integer constant. */
6987 if (!aarch64_float_const_representable_p (x)
6988 && !aarch64_can_const_movi_rtx_p (x, mode)
6989 && aarch64_float_const_rtx_p (x))
6991 unsigned HOST_WIDE_INT ival;
6992 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6993 gcc_assert (succeed);
6995 machine_mode imode = (mode == HFmode
6996 ? SImode
6997 : int_mode_for_mode (mode).require ());
6998 int ncost = aarch64_internal_mov_immediate
6999 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7000 *cost += COSTS_N_INSNS (ncost);
7001 return true;
7004 if (speed)
7006 /* mov[df,sf]_aarch64. */
7007 if (aarch64_float_const_representable_p (x))
7008 /* FMOV (scalar immediate). */
7009 *cost += extra_cost->fp[mode == DFmode].fpconst;
7010 else if (!aarch64_float_const_zero_rtx_p (x))
7012 /* This will be a load from memory. */
7013 if (mode == DFmode)
7014 *cost += extra_cost->ldst.loadd;
7015 else
7016 *cost += extra_cost->ldst.loadf;
7018 else
7019 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7020 or MOV v0.s[0], wzr - neither of which are modeled by the
7021 cost tables. Just use the default cost. */
7026 return true;
7028 case MEM:
7029 if (speed)
7031 /* For loads we want the base cost of a load, plus an
7032 approximation for the additional cost of the addressing
7033 mode. */
7034 rtx address = XEXP (x, 0);
7035 if (VECTOR_MODE_P (mode))
7036 *cost += extra_cost->ldst.loadv;
7037 else if (GET_MODE_CLASS (mode) == MODE_INT)
7038 *cost += extra_cost->ldst.load;
7039 else if (mode == SFmode)
7040 *cost += extra_cost->ldst.loadf;
7041 else if (mode == DFmode)
7042 *cost += extra_cost->ldst.loadd;
7044 *cost +=
7045 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7046 0, speed));
7049 return true;
7051 case NEG:
7052 op0 = XEXP (x, 0);
7054 if (VECTOR_MODE_P (mode))
7056 if (speed)
7058 /* FNEG. */
7059 *cost += extra_cost->vect.alu;
7061 return false;
7064 if (GET_MODE_CLASS (mode) == MODE_INT)
7066 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7067 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7069 /* CSETM. */
7070 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7071 return true;
7074 /* Cost this as SUB wzr, X. */
7075 op0 = CONST0_RTX (mode);
7076 op1 = XEXP (x, 0);
7077 goto cost_minus;
7080 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7082 /* Support (neg(fma...)) as a single instruction only if
7083 sign of zeros is unimportant. This matches the decision
7084 making in aarch64.md. */
7085 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7087 /* FNMADD. */
7088 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7089 return true;
7091 if (GET_CODE (op0) == MULT)
7093 /* FNMUL. */
7094 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7095 return true;
7097 if (speed)
7098 /* FNEG. */
7099 *cost += extra_cost->fp[mode == DFmode].neg;
7100 return false;
7103 return false;
7105 case CLRSB:
7106 case CLZ:
7107 if (speed)
7109 if (VECTOR_MODE_P (mode))
7110 *cost += extra_cost->vect.alu;
7111 else
7112 *cost += extra_cost->alu.clz;
7115 return false;
7117 case COMPARE:
7118 op0 = XEXP (x, 0);
7119 op1 = XEXP (x, 1);
7121 if (op1 == const0_rtx
7122 && GET_CODE (op0) == AND)
7124 x = op0;
7125 mode = GET_MODE (op0);
7126 goto cost_logic;
7129 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7131 /* TODO: A write to the CC flags possibly costs extra, this
7132 needs encoding in the cost tables. */
7134 mode = GET_MODE (op0);
7135 /* ANDS. */
7136 if (GET_CODE (op0) == AND)
7138 x = op0;
7139 goto cost_logic;
7142 if (GET_CODE (op0) == PLUS)
7144 /* ADDS (and CMN alias). */
7145 x = op0;
7146 goto cost_plus;
7149 if (GET_CODE (op0) == MINUS)
7151 /* SUBS. */
7152 x = op0;
7153 goto cost_minus;
7156 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7157 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7158 && CONST_INT_P (XEXP (op0, 2)))
7160 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7161 Handle it here directly rather than going to cost_logic
7162 since we know the immediate generated for the TST is valid
7163 so we can avoid creating an intermediate rtx for it only
7164 for costing purposes. */
7165 if (speed)
7166 *cost += extra_cost->alu.logical;
7168 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7169 ZERO_EXTRACT, 0, speed);
7170 return true;
7173 if (GET_CODE (op1) == NEG)
7175 /* CMN. */
7176 if (speed)
7177 *cost += extra_cost->alu.arith;
7179 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7180 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7181 return true;
7184 /* CMP.
7186 Compare can freely swap the order of operands, and
7187 canonicalization puts the more complex operation first.
7188 But the integer MINUS logic expects the shift/extend
7189 operation in op1. */
7190 if (! (REG_P (op0)
7191 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7193 op0 = XEXP (x, 1);
7194 op1 = XEXP (x, 0);
7196 goto cost_minus;
7199 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7201 /* FCMP. */
7202 if (speed)
7203 *cost += extra_cost->fp[mode == DFmode].compare;
7205 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7207 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7208 /* FCMP supports constant 0.0 for no extra cost. */
7209 return true;
7211 return false;
7214 if (VECTOR_MODE_P (mode))
7216 /* Vector compare. */
7217 if (speed)
7218 *cost += extra_cost->vect.alu;
7220 if (aarch64_float_const_zero_rtx_p (op1))
7222 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7223 cost. */
7224 return true;
7226 return false;
7228 return false;
7230 case MINUS:
7232 op0 = XEXP (x, 0);
7233 op1 = XEXP (x, 1);
7235 cost_minus:
7236 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7238 /* Detect valid immediates. */
7239 if ((GET_MODE_CLASS (mode) == MODE_INT
7240 || (GET_MODE_CLASS (mode) == MODE_CC
7241 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7242 && CONST_INT_P (op1)
7243 && aarch64_uimm12_shift (INTVAL (op1)))
7245 if (speed)
7246 /* SUB(S) (immediate). */
7247 *cost += extra_cost->alu.arith;
7248 return true;
7251 /* Look for SUB (extended register). */
7252 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7254 if (speed)
7255 *cost += extra_cost->alu.extend_arith;
7257 op1 = aarch64_strip_extend (op1, true);
7258 *cost += rtx_cost (op1, VOIDmode,
7259 (enum rtx_code) GET_CODE (op1), 0, speed);
7260 return true;
7263 rtx new_op1 = aarch64_strip_extend (op1, false);
7265 /* Cost this as an FMA-alike operation. */
7266 if ((GET_CODE (new_op1) == MULT
7267 || aarch64_shift_p (GET_CODE (new_op1)))
7268 && code != COMPARE)
7270 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7271 (enum rtx_code) code,
7272 speed);
7273 return true;
7276 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7278 if (speed)
7280 if (VECTOR_MODE_P (mode))
7282 /* Vector SUB. */
7283 *cost += extra_cost->vect.alu;
7285 else if (GET_MODE_CLASS (mode) == MODE_INT)
7287 /* SUB(S). */
7288 *cost += extra_cost->alu.arith;
7290 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7292 /* FSUB. */
7293 *cost += extra_cost->fp[mode == DFmode].addsub;
7296 return true;
7299 case PLUS:
7301 rtx new_op0;
7303 op0 = XEXP (x, 0);
7304 op1 = XEXP (x, 1);
7306 cost_plus:
7307 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7308 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7310 /* CSINC. */
7311 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7312 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7313 return true;
7316 if (GET_MODE_CLASS (mode) == MODE_INT
7317 && CONST_INT_P (op1)
7318 && aarch64_uimm12_shift (INTVAL (op1)))
7320 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7322 if (speed)
7323 /* ADD (immediate). */
7324 *cost += extra_cost->alu.arith;
7325 return true;
7328 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7330 /* Look for ADD (extended register). */
7331 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7333 if (speed)
7334 *cost += extra_cost->alu.extend_arith;
7336 op0 = aarch64_strip_extend (op0, true);
7337 *cost += rtx_cost (op0, VOIDmode,
7338 (enum rtx_code) GET_CODE (op0), 0, speed);
7339 return true;
7342 /* Strip any extend, leave shifts behind as we will
7343 cost them through mult_cost. */
7344 new_op0 = aarch64_strip_extend (op0, false);
7346 if (GET_CODE (new_op0) == MULT
7347 || aarch64_shift_p (GET_CODE (new_op0)))
7349 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7350 speed);
7351 return true;
7354 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7356 if (speed)
7358 if (VECTOR_MODE_P (mode))
7360 /* Vector ADD. */
7361 *cost += extra_cost->vect.alu;
7363 else if (GET_MODE_CLASS (mode) == MODE_INT)
7365 /* ADD. */
7366 *cost += extra_cost->alu.arith;
7368 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7370 /* FADD. */
7371 *cost += extra_cost->fp[mode == DFmode].addsub;
7374 return true;
7377 case BSWAP:
7378 *cost = COSTS_N_INSNS (1);
7380 if (speed)
7382 if (VECTOR_MODE_P (mode))
7383 *cost += extra_cost->vect.alu;
7384 else
7385 *cost += extra_cost->alu.rev;
7387 return false;
7389 case IOR:
7390 if (aarch_rev16_p (x))
7392 *cost = COSTS_N_INSNS (1);
7394 if (speed)
7396 if (VECTOR_MODE_P (mode))
7397 *cost += extra_cost->vect.alu;
7398 else
7399 *cost += extra_cost->alu.rev;
7401 return true;
7404 if (aarch64_extr_rtx_p (x, &op0, &op1))
7406 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7407 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7408 if (speed)
7409 *cost += extra_cost->alu.shift;
7411 return true;
7413 /* Fall through. */
7414 case XOR:
7415 case AND:
7416 cost_logic:
7417 op0 = XEXP (x, 0);
7418 op1 = XEXP (x, 1);
7420 if (VECTOR_MODE_P (mode))
7422 if (speed)
7423 *cost += extra_cost->vect.alu;
7424 return true;
7427 if (code == AND
7428 && GET_CODE (op0) == MULT
7429 && CONST_INT_P (XEXP (op0, 1))
7430 && CONST_INT_P (op1)
7431 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7432 INTVAL (op1)) != 0)
7434 /* This is a UBFM/SBFM. */
7435 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7436 if (speed)
7437 *cost += extra_cost->alu.bfx;
7438 return true;
7441 if (is_int_mode (mode, &int_mode))
7443 if (CONST_INT_P (op1))
7445 /* We have a mask + shift version of a UBFIZ
7446 i.e. the *andim_ashift<mode>_bfiz pattern. */
7447 if (GET_CODE (op0) == ASHIFT
7448 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7449 XEXP (op0, 1)))
7451 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7452 (enum rtx_code) code, 0, speed);
7453 if (speed)
7454 *cost += extra_cost->alu.bfx;
7456 return true;
7458 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7460 /* We possibly get the immediate for free, this is not
7461 modelled. */
7462 *cost += rtx_cost (op0, int_mode,
7463 (enum rtx_code) code, 0, speed);
7464 if (speed)
7465 *cost += extra_cost->alu.logical;
7467 return true;
7470 else
7472 rtx new_op0 = op0;
7474 /* Handle ORN, EON, or BIC. */
7475 if (GET_CODE (op0) == NOT)
7476 op0 = XEXP (op0, 0);
7478 new_op0 = aarch64_strip_shift (op0);
7480 /* If we had a shift on op0 then this is a logical-shift-
7481 by-register/immediate operation. Otherwise, this is just
7482 a logical operation. */
7483 if (speed)
7485 if (new_op0 != op0)
7487 /* Shift by immediate. */
7488 if (CONST_INT_P (XEXP (op0, 1)))
7489 *cost += extra_cost->alu.log_shift;
7490 else
7491 *cost += extra_cost->alu.log_shift_reg;
7493 else
7494 *cost += extra_cost->alu.logical;
7497 /* In both cases we want to cost both operands. */
7498 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7499 0, speed);
7500 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7501 1, speed);
7503 return true;
7506 return false;
7508 case NOT:
7509 x = XEXP (x, 0);
7510 op0 = aarch64_strip_shift (x);
7512 if (VECTOR_MODE_P (mode))
7514 /* Vector NOT. */
7515 *cost += extra_cost->vect.alu;
7516 return false;
7519 /* MVN-shifted-reg. */
7520 if (op0 != x)
7522 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7524 if (speed)
7525 *cost += extra_cost->alu.log_shift;
7527 return true;
7529 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7530 Handle the second form here taking care that 'a' in the above can
7531 be a shift. */
7532 else if (GET_CODE (op0) == XOR)
7534 rtx newop0 = XEXP (op0, 0);
7535 rtx newop1 = XEXP (op0, 1);
7536 rtx op0_stripped = aarch64_strip_shift (newop0);
7538 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7539 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7541 if (speed)
7543 if (op0_stripped != newop0)
7544 *cost += extra_cost->alu.log_shift;
7545 else
7546 *cost += extra_cost->alu.logical;
7549 return true;
7551 /* MVN. */
7552 if (speed)
7553 *cost += extra_cost->alu.logical;
7555 return false;
7557 case ZERO_EXTEND:
7559 op0 = XEXP (x, 0);
7560 /* If a value is written in SI mode, then zero extended to DI
7561 mode, the operation will in general be free as a write to
7562 a 'w' register implicitly zeroes the upper bits of an 'x'
7563 register. However, if this is
7565 (set (reg) (zero_extend (reg)))
7567 we must cost the explicit register move. */
7568 if (mode == DImode
7569 && GET_MODE (op0) == SImode
7570 && outer == SET)
7572 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7574 /* If OP_COST is non-zero, then the cost of the zero extend
7575 is effectively the cost of the inner operation. Otherwise
7576 we have a MOV instruction and we take the cost from the MOV
7577 itself. This is true independently of whether we are
7578 optimizing for space or time. */
7579 if (op_cost)
7580 *cost = op_cost;
7582 return true;
7584 else if (MEM_P (op0))
7586 /* All loads can zero extend to any size for free. */
7587 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7588 return true;
7591 op0 = aarch64_extend_bitfield_pattern_p (x);
7592 if (op0)
7594 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7595 if (speed)
7596 *cost += extra_cost->alu.bfx;
7597 return true;
7600 if (speed)
7602 if (VECTOR_MODE_P (mode))
7604 /* UMOV. */
7605 *cost += extra_cost->vect.alu;
7607 else
7609 /* We generate an AND instead of UXTB/UXTH. */
7610 *cost += extra_cost->alu.logical;
7613 return false;
7615 case SIGN_EXTEND:
7616 if (MEM_P (XEXP (x, 0)))
7618 /* LDRSH. */
7619 if (speed)
7621 rtx address = XEXP (XEXP (x, 0), 0);
7622 *cost += extra_cost->ldst.load_sign_extend;
7624 *cost +=
7625 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7626 0, speed));
7628 return true;
7631 op0 = aarch64_extend_bitfield_pattern_p (x);
7632 if (op0)
7634 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7635 if (speed)
7636 *cost += extra_cost->alu.bfx;
7637 return true;
7640 if (speed)
7642 if (VECTOR_MODE_P (mode))
7643 *cost += extra_cost->vect.alu;
7644 else
7645 *cost += extra_cost->alu.extend;
7647 return false;
7649 case ASHIFT:
7650 op0 = XEXP (x, 0);
7651 op1 = XEXP (x, 1);
7653 if (CONST_INT_P (op1))
7655 if (speed)
7657 if (VECTOR_MODE_P (mode))
7659 /* Vector shift (immediate). */
7660 *cost += extra_cost->vect.alu;
7662 else
7664 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7665 aliases. */
7666 *cost += extra_cost->alu.shift;
7670 /* We can incorporate zero/sign extend for free. */
7671 if (GET_CODE (op0) == ZERO_EXTEND
7672 || GET_CODE (op0) == SIGN_EXTEND)
7673 op0 = XEXP (op0, 0);
7675 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7676 return true;
7678 else
7680 if (VECTOR_MODE_P (mode))
7682 if (speed)
7683 /* Vector shift (register). */
7684 *cost += extra_cost->vect.alu;
7686 else
7688 if (speed)
7689 /* LSLV. */
7690 *cost += extra_cost->alu.shift_reg;
7692 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7693 && CONST_INT_P (XEXP (op1, 1))
7694 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7696 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7697 /* We already demanded XEXP (op1, 0) to be REG_P, so
7698 don't recurse into it. */
7699 return true;
7702 return false; /* All arguments need to be in registers. */
7705 case ROTATE:
7706 case ROTATERT:
7707 case LSHIFTRT:
7708 case ASHIFTRT:
7709 op0 = XEXP (x, 0);
7710 op1 = XEXP (x, 1);
7712 if (CONST_INT_P (op1))
7714 /* ASR (immediate) and friends. */
7715 if (speed)
7717 if (VECTOR_MODE_P (mode))
7718 *cost += extra_cost->vect.alu;
7719 else
7720 *cost += extra_cost->alu.shift;
7723 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7724 return true;
7726 else
7728 if (VECTOR_MODE_P (mode))
7730 if (speed)
7731 /* Vector shift (register). */
7732 *cost += extra_cost->vect.alu;
7734 else
7736 if (speed)
7737 /* ASR (register) and friends. */
7738 *cost += extra_cost->alu.shift_reg;
7740 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7741 && CONST_INT_P (XEXP (op1, 1))
7742 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7744 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7745 /* We already demanded XEXP (op1, 0) to be REG_P, so
7746 don't recurse into it. */
7747 return true;
7750 return false; /* All arguments need to be in registers. */
7753 case SYMBOL_REF:
7755 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7756 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7758 /* LDR. */
7759 if (speed)
7760 *cost += extra_cost->ldst.load;
7762 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7763 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7765 /* ADRP, followed by ADD. */
7766 *cost += COSTS_N_INSNS (1);
7767 if (speed)
7768 *cost += 2 * extra_cost->alu.arith;
7770 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7771 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7773 /* ADR. */
7774 if (speed)
7775 *cost += extra_cost->alu.arith;
7778 if (flag_pic)
7780 /* One extra load instruction, after accessing the GOT. */
7781 *cost += COSTS_N_INSNS (1);
7782 if (speed)
7783 *cost += extra_cost->ldst.load;
7785 return true;
7787 case HIGH:
7788 case LO_SUM:
7789 /* ADRP/ADD (immediate). */
7790 if (speed)
7791 *cost += extra_cost->alu.arith;
7792 return true;
7794 case ZERO_EXTRACT:
7795 case SIGN_EXTRACT:
7796 /* UBFX/SBFX. */
7797 if (speed)
7799 if (VECTOR_MODE_P (mode))
7800 *cost += extra_cost->vect.alu;
7801 else
7802 *cost += extra_cost->alu.bfx;
7805 /* We can trust that the immediates used will be correct (there
7806 are no by-register forms), so we need only cost op0. */
7807 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7808 return true;
7810 case MULT:
7811 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7812 /* aarch64_rtx_mult_cost always handles recursion to its
7813 operands. */
7814 return true;
7816 case MOD:
7817 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7818 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7819 an unconditional negate. This case should only ever be reached through
7820 the set_smod_pow2_cheap check in expmed.c. */
7821 if (CONST_INT_P (XEXP (x, 1))
7822 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7823 && (mode == SImode || mode == DImode))
7825 /* We expand to 4 instructions. Reset the baseline. */
7826 *cost = COSTS_N_INSNS (4);
7828 if (speed)
7829 *cost += 2 * extra_cost->alu.logical
7830 + 2 * extra_cost->alu.arith;
7832 return true;
7835 /* Fall-through. */
7836 case UMOD:
7837 if (speed)
7839 /* Slighly prefer UMOD over SMOD. */
7840 if (VECTOR_MODE_P (mode))
7841 *cost += extra_cost->vect.alu;
7842 else if (GET_MODE_CLASS (mode) == MODE_INT)
7843 *cost += (extra_cost->mult[mode == DImode].add
7844 + extra_cost->mult[mode == DImode].idiv
7845 + (code == MOD ? 1 : 0));
7847 return false; /* All arguments need to be in registers. */
7849 case DIV:
7850 case UDIV:
7851 case SQRT:
7852 if (speed)
7854 if (VECTOR_MODE_P (mode))
7855 *cost += extra_cost->vect.alu;
7856 else if (GET_MODE_CLASS (mode) == MODE_INT)
7857 /* There is no integer SQRT, so only DIV and UDIV can get
7858 here. */
7859 *cost += (extra_cost->mult[mode == DImode].idiv
7860 /* Slighly prefer UDIV over SDIV. */
7861 + (code == DIV ? 1 : 0));
7862 else
7863 *cost += extra_cost->fp[mode == DFmode].div;
7865 return false; /* All arguments need to be in registers. */
7867 case IF_THEN_ELSE:
7868 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7869 XEXP (x, 2), cost, speed);
7871 case EQ:
7872 case NE:
7873 case GT:
7874 case GTU:
7875 case LT:
7876 case LTU:
7877 case GE:
7878 case GEU:
7879 case LE:
7880 case LEU:
7882 return false; /* All arguments must be in registers. */
7884 case FMA:
7885 op0 = XEXP (x, 0);
7886 op1 = XEXP (x, 1);
7887 op2 = XEXP (x, 2);
7889 if (speed)
7891 if (VECTOR_MODE_P (mode))
7892 *cost += extra_cost->vect.alu;
7893 else
7894 *cost += extra_cost->fp[mode == DFmode].fma;
7897 /* FMSUB, FNMADD, and FNMSUB are free. */
7898 if (GET_CODE (op0) == NEG)
7899 op0 = XEXP (op0, 0);
7901 if (GET_CODE (op2) == NEG)
7902 op2 = XEXP (op2, 0);
7904 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7905 and the by-element operand as operand 0. */
7906 if (GET_CODE (op1) == NEG)
7907 op1 = XEXP (op1, 0);
7909 /* Catch vector-by-element operations. The by-element operand can
7910 either be (vec_duplicate (vec_select (x))) or just
7911 (vec_select (x)), depending on whether we are multiplying by
7912 a vector or a scalar.
7914 Canonicalization is not very good in these cases, FMA4 will put the
7915 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7916 if (GET_CODE (op0) == VEC_DUPLICATE)
7917 op0 = XEXP (op0, 0);
7918 else if (GET_CODE (op1) == VEC_DUPLICATE)
7919 op1 = XEXP (op1, 0);
7921 if (GET_CODE (op0) == VEC_SELECT)
7922 op0 = XEXP (op0, 0);
7923 else if (GET_CODE (op1) == VEC_SELECT)
7924 op1 = XEXP (op1, 0);
7926 /* If the remaining parameters are not registers,
7927 get the cost to put them into registers. */
7928 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7929 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7930 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7931 return true;
7933 case FLOAT:
7934 case UNSIGNED_FLOAT:
7935 if (speed)
7936 *cost += extra_cost->fp[mode == DFmode].fromint;
7937 return false;
7939 case FLOAT_EXTEND:
7940 if (speed)
7942 if (VECTOR_MODE_P (mode))
7944 /*Vector truncate. */
7945 *cost += extra_cost->vect.alu;
7947 else
7948 *cost += extra_cost->fp[mode == DFmode].widen;
7950 return false;
7952 case FLOAT_TRUNCATE:
7953 if (speed)
7955 if (VECTOR_MODE_P (mode))
7957 /*Vector conversion. */
7958 *cost += extra_cost->vect.alu;
7960 else
7961 *cost += extra_cost->fp[mode == DFmode].narrow;
7963 return false;
7965 case FIX:
7966 case UNSIGNED_FIX:
7967 x = XEXP (x, 0);
7968 /* Strip the rounding part. They will all be implemented
7969 by the fcvt* family of instructions anyway. */
7970 if (GET_CODE (x) == UNSPEC)
7972 unsigned int uns_code = XINT (x, 1);
7974 if (uns_code == UNSPEC_FRINTA
7975 || uns_code == UNSPEC_FRINTM
7976 || uns_code == UNSPEC_FRINTN
7977 || uns_code == UNSPEC_FRINTP
7978 || uns_code == UNSPEC_FRINTZ)
7979 x = XVECEXP (x, 0, 0);
7982 if (speed)
7984 if (VECTOR_MODE_P (mode))
7985 *cost += extra_cost->vect.alu;
7986 else
7987 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7990 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7991 fixed-point fcvt. */
7992 if (GET_CODE (x) == MULT
7993 && ((VECTOR_MODE_P (mode)
7994 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7995 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7997 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7998 0, speed);
7999 return true;
8002 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8003 return true;
8005 case ABS:
8006 if (VECTOR_MODE_P (mode))
8008 /* ABS (vector). */
8009 if (speed)
8010 *cost += extra_cost->vect.alu;
8012 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8014 op0 = XEXP (x, 0);
8016 /* FABD, which is analogous to FADD. */
8017 if (GET_CODE (op0) == MINUS)
8019 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8020 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8021 if (speed)
8022 *cost += extra_cost->fp[mode == DFmode].addsub;
8024 return true;
8026 /* Simple FABS is analogous to FNEG. */
8027 if (speed)
8028 *cost += extra_cost->fp[mode == DFmode].neg;
8030 else
8032 /* Integer ABS will either be split to
8033 two arithmetic instructions, or will be an ABS
8034 (scalar), which we don't model. */
8035 *cost = COSTS_N_INSNS (2);
8036 if (speed)
8037 *cost += 2 * extra_cost->alu.arith;
8039 return false;
8041 case SMAX:
8042 case SMIN:
8043 if (speed)
8045 if (VECTOR_MODE_P (mode))
8046 *cost += extra_cost->vect.alu;
8047 else
8049 /* FMAXNM/FMINNM/FMAX/FMIN.
8050 TODO: This may not be accurate for all implementations, but
8051 we do not model this in the cost tables. */
8052 *cost += extra_cost->fp[mode == DFmode].addsub;
8055 return false;
8057 case UNSPEC:
8058 /* The floating point round to integer frint* instructions. */
8059 if (aarch64_frint_unspec_p (XINT (x, 1)))
8061 if (speed)
8062 *cost += extra_cost->fp[mode == DFmode].roundint;
8064 return false;
8067 if (XINT (x, 1) == UNSPEC_RBIT)
8069 if (speed)
8070 *cost += extra_cost->alu.rev;
8072 return false;
8074 break;
8076 case TRUNCATE:
8078 /* Decompose <su>muldi3_highpart. */
8079 if (/* (truncate:DI */
8080 mode == DImode
8081 /* (lshiftrt:TI */
8082 && GET_MODE (XEXP (x, 0)) == TImode
8083 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8084 /* (mult:TI */
8085 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8086 /* (ANY_EXTEND:TI (reg:DI))
8087 (ANY_EXTEND:TI (reg:DI))) */
8088 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8089 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8090 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8091 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8092 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8093 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8094 /* (const_int 64) */
8095 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8096 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8098 /* UMULH/SMULH. */
8099 if (speed)
8100 *cost += extra_cost->mult[mode == DImode].extend;
8101 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8102 mode, MULT, 0, speed);
8103 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8104 mode, MULT, 1, speed);
8105 return true;
8108 /* Fall through. */
8109 default:
8110 break;
8113 if (dump_file
8114 && flag_aarch64_verbose_cost)
8115 fprintf (dump_file,
8116 "\nFailed to cost RTX. Assuming default cost.\n");
8118 return true;
8121 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8122 calculated for X. This cost is stored in *COST. Returns true
8123 if the total cost of X was calculated. */
8124 static bool
8125 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8126 int param, int *cost, bool speed)
8128 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8130 if (dump_file
8131 && flag_aarch64_verbose_cost)
8133 print_rtl_single (dump_file, x);
8134 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8135 speed ? "Hot" : "Cold",
8136 *cost, result ? "final" : "partial");
8139 return result;
8142 static int
8143 aarch64_register_move_cost (machine_mode mode,
8144 reg_class_t from_i, reg_class_t to_i)
8146 enum reg_class from = (enum reg_class) from_i;
8147 enum reg_class to = (enum reg_class) to_i;
8148 const struct cpu_regmove_cost *regmove_cost
8149 = aarch64_tune_params.regmove_cost;
8151 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8152 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8153 to = GENERAL_REGS;
8155 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8156 from = GENERAL_REGS;
8158 /* Moving between GPR and stack cost is the same as GP2GP. */
8159 if ((from == GENERAL_REGS && to == STACK_REG)
8160 || (to == GENERAL_REGS && from == STACK_REG))
8161 return regmove_cost->GP2GP;
8163 /* To/From the stack register, we move via the gprs. */
8164 if (to == STACK_REG || from == STACK_REG)
8165 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8166 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8168 if (GET_MODE_SIZE (mode) == 16)
8170 /* 128-bit operations on general registers require 2 instructions. */
8171 if (from == GENERAL_REGS && to == GENERAL_REGS)
8172 return regmove_cost->GP2GP * 2;
8173 else if (from == GENERAL_REGS)
8174 return regmove_cost->GP2FP * 2;
8175 else if (to == GENERAL_REGS)
8176 return regmove_cost->FP2GP * 2;
8178 /* When AdvSIMD instructions are disabled it is not possible to move
8179 a 128-bit value directly between Q registers. This is handled in
8180 secondary reload. A general register is used as a scratch to move
8181 the upper DI value and the lower DI value is moved directly,
8182 hence the cost is the sum of three moves. */
8183 if (! TARGET_SIMD)
8184 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8186 return regmove_cost->FP2FP;
8189 if (from == GENERAL_REGS && to == GENERAL_REGS)
8190 return regmove_cost->GP2GP;
8191 else if (from == GENERAL_REGS)
8192 return regmove_cost->GP2FP;
8193 else if (to == GENERAL_REGS)
8194 return regmove_cost->FP2GP;
8196 return regmove_cost->FP2FP;
8199 static int
8200 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8201 reg_class_t rclass ATTRIBUTE_UNUSED,
8202 bool in ATTRIBUTE_UNUSED)
8204 return aarch64_tune_params.memmov_cost;
8207 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8208 to optimize 1.0/sqrt. */
8210 static bool
8211 use_rsqrt_p (machine_mode mode)
8213 return (!flag_trapping_math
8214 && flag_unsafe_math_optimizations
8215 && ((aarch64_tune_params.approx_modes->recip_sqrt
8216 & AARCH64_APPROX_MODE (mode))
8217 || flag_mrecip_low_precision_sqrt));
8220 /* Function to decide when to use the approximate reciprocal square root
8221 builtin. */
8223 static tree
8224 aarch64_builtin_reciprocal (tree fndecl)
8226 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8228 if (!use_rsqrt_p (mode))
8229 return NULL_TREE;
8230 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8233 typedef rtx (*rsqrte_type) (rtx, rtx);
8235 /* Select reciprocal square root initial estimate insn depending on machine
8236 mode. */
8238 static rsqrte_type
8239 get_rsqrte_type (machine_mode mode)
8241 switch (mode)
8243 case E_DFmode: return gen_aarch64_rsqrtedf;
8244 case E_SFmode: return gen_aarch64_rsqrtesf;
8245 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8246 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8247 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8248 default: gcc_unreachable ();
8252 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8254 /* Select reciprocal square root series step insn depending on machine mode. */
8256 static rsqrts_type
8257 get_rsqrts_type (machine_mode mode)
8259 switch (mode)
8261 case E_DFmode: return gen_aarch64_rsqrtsdf;
8262 case E_SFmode: return gen_aarch64_rsqrtssf;
8263 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8264 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8265 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8266 default: gcc_unreachable ();
8270 /* Emit instruction sequence to compute either the approximate square root
8271 or its approximate reciprocal, depending on the flag RECP, and return
8272 whether the sequence was emitted or not. */
8274 bool
8275 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8277 machine_mode mode = GET_MODE (dst);
8279 if (GET_MODE_INNER (mode) == HFmode)
8281 gcc_assert (!recp);
8282 return false;
8285 machine_mode mmsk
8286 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)).require (),
8287 GET_MODE_NUNITS (mode));
8288 if (!recp)
8290 if (!(flag_mlow_precision_sqrt
8291 || (aarch64_tune_params.approx_modes->sqrt
8292 & AARCH64_APPROX_MODE (mode))))
8293 return false;
8295 if (flag_finite_math_only
8296 || flag_trapping_math
8297 || !flag_unsafe_math_optimizations
8298 || optimize_function_for_size_p (cfun))
8299 return false;
8301 else
8302 /* Caller assumes we cannot fail. */
8303 gcc_assert (use_rsqrt_p (mode));
8306 rtx xmsk = gen_reg_rtx (mmsk);
8307 if (!recp)
8308 /* When calculating the approximate square root, compare the
8309 argument with 0.0 and create a mask. */
8310 emit_insn (gen_rtx_SET (xmsk,
8311 gen_rtx_NEG (mmsk,
8312 gen_rtx_EQ (mmsk, src,
8313 CONST0_RTX (mode)))));
8315 /* Estimate the approximate reciprocal square root. */
8316 rtx xdst = gen_reg_rtx (mode);
8317 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8319 /* Iterate over the series twice for SF and thrice for DF. */
8320 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8322 /* Optionally iterate over the series once less for faster performance
8323 while sacrificing the accuracy. */
8324 if ((recp && flag_mrecip_low_precision_sqrt)
8325 || (!recp && flag_mlow_precision_sqrt))
8326 iterations--;
8328 /* Iterate over the series to calculate the approximate reciprocal square
8329 root. */
8330 rtx x1 = gen_reg_rtx (mode);
8331 while (iterations--)
8333 rtx x2 = gen_reg_rtx (mode);
8334 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8336 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8338 if (iterations > 0)
8339 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8342 if (!recp)
8344 /* Qualify the approximate reciprocal square root when the argument is
8345 0.0 by squashing the intermediary result to 0.0. */
8346 rtx xtmp = gen_reg_rtx (mmsk);
8347 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8348 gen_rtx_SUBREG (mmsk, xdst, 0)));
8349 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8351 /* Calculate the approximate square root. */
8352 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8355 /* Finalize the approximation. */
8356 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8358 return true;
8361 typedef rtx (*recpe_type) (rtx, rtx);
8363 /* Select reciprocal initial estimate insn depending on machine mode. */
8365 static recpe_type
8366 get_recpe_type (machine_mode mode)
8368 switch (mode)
8370 case E_SFmode: return (gen_aarch64_frecpesf);
8371 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8372 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8373 case E_DFmode: return (gen_aarch64_frecpedf);
8374 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8375 default: gcc_unreachable ();
8379 typedef rtx (*recps_type) (rtx, rtx, rtx);
8381 /* Select reciprocal series step insn depending on machine mode. */
8383 static recps_type
8384 get_recps_type (machine_mode mode)
8386 switch (mode)
8388 case E_SFmode: return (gen_aarch64_frecpssf);
8389 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8390 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8391 case E_DFmode: return (gen_aarch64_frecpsdf);
8392 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8393 default: gcc_unreachable ();
8397 /* Emit the instruction sequence to compute the approximation for the division
8398 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8400 bool
8401 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8403 machine_mode mode = GET_MODE (quo);
8405 if (GET_MODE_INNER (mode) == HFmode)
8406 return false;
8408 bool use_approx_division_p = (flag_mlow_precision_div
8409 || (aarch64_tune_params.approx_modes->division
8410 & AARCH64_APPROX_MODE (mode)));
8412 if (!flag_finite_math_only
8413 || flag_trapping_math
8414 || !flag_unsafe_math_optimizations
8415 || optimize_function_for_size_p (cfun)
8416 || !use_approx_division_p)
8417 return false;
8419 /* Estimate the approximate reciprocal. */
8420 rtx xrcp = gen_reg_rtx (mode);
8421 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8423 /* Iterate over the series twice for SF and thrice for DF. */
8424 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8426 /* Optionally iterate over the series once less for faster performance,
8427 while sacrificing the accuracy. */
8428 if (flag_mlow_precision_div)
8429 iterations--;
8431 /* Iterate over the series to calculate the approximate reciprocal. */
8432 rtx xtmp = gen_reg_rtx (mode);
8433 while (iterations--)
8435 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8437 if (iterations > 0)
8438 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8441 if (num != CONST1_RTX (mode))
8443 /* As the approximate reciprocal of DEN is already calculated, only
8444 calculate the approximate division when NUM is not 1.0. */
8445 rtx xnum = force_reg (mode, num);
8446 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8449 /* Finalize the approximation. */
8450 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8451 return true;
8454 /* Return the number of instructions that can be issued per cycle. */
8455 static int
8456 aarch64_sched_issue_rate (void)
8458 return aarch64_tune_params.issue_rate;
8461 static int
8462 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8464 int issue_rate = aarch64_sched_issue_rate ();
8466 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8470 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8471 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8472 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8474 static int
8475 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8476 int ready_index)
8478 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8482 /* Vectorizer cost model target hooks. */
8484 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8485 static int
8486 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8487 tree vectype,
8488 int misalign ATTRIBUTE_UNUSED)
8490 unsigned elements;
8491 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8492 bool fp = false;
8494 if (vectype != NULL)
8495 fp = FLOAT_TYPE_P (vectype);
8497 switch (type_of_cost)
8499 case scalar_stmt:
8500 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8502 case scalar_load:
8503 return costs->scalar_load_cost;
8505 case scalar_store:
8506 return costs->scalar_store_cost;
8508 case vector_stmt:
8509 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8511 case vector_load:
8512 return costs->vec_align_load_cost;
8514 case vector_store:
8515 return costs->vec_store_cost;
8517 case vec_to_scalar:
8518 return costs->vec_to_scalar_cost;
8520 case scalar_to_vec:
8521 return costs->scalar_to_vec_cost;
8523 case unaligned_load:
8524 return costs->vec_unalign_load_cost;
8526 case unaligned_store:
8527 return costs->vec_unalign_store_cost;
8529 case cond_branch_taken:
8530 return costs->cond_taken_branch_cost;
8532 case cond_branch_not_taken:
8533 return costs->cond_not_taken_branch_cost;
8535 case vec_perm:
8536 return costs->vec_permute_cost;
8538 case vec_promote_demote:
8539 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8541 case vec_construct:
8542 elements = TYPE_VECTOR_SUBPARTS (vectype);
8543 return elements / 2 + 1;
8545 default:
8546 gcc_unreachable ();
8550 /* Implement targetm.vectorize.add_stmt_cost. */
8551 static unsigned
8552 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8553 struct _stmt_vec_info *stmt_info, int misalign,
8554 enum vect_cost_model_location where)
8556 unsigned *cost = (unsigned *) data;
8557 unsigned retval = 0;
8559 if (flag_vect_cost_model)
8561 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8562 int stmt_cost =
8563 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8565 /* Statements in an inner loop relative to the loop being
8566 vectorized are weighted more heavily. The value here is
8567 arbitrary and could potentially be improved with analysis. */
8568 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8569 count *= 50; /* FIXME */
8571 retval = (unsigned) (count * stmt_cost);
8572 cost[where] += retval;
8575 return retval;
8578 static void initialize_aarch64_code_model (struct gcc_options *);
8580 /* Parse the TO_PARSE string and put the architecture struct that it
8581 selects into RES and the architectural features into ISA_FLAGS.
8582 Return an aarch64_parse_opt_result describing the parse result.
8583 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8585 static enum aarch64_parse_opt_result
8586 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8587 unsigned long *isa_flags)
8589 char *ext;
8590 const struct processor *arch;
8591 char *str = (char *) alloca (strlen (to_parse) + 1);
8592 size_t len;
8594 strcpy (str, to_parse);
8596 ext = strchr (str, '+');
8598 if (ext != NULL)
8599 len = ext - str;
8600 else
8601 len = strlen (str);
8603 if (len == 0)
8604 return AARCH64_PARSE_MISSING_ARG;
8607 /* Loop through the list of supported ARCHes to find a match. */
8608 for (arch = all_architectures; arch->name != NULL; arch++)
8610 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8612 unsigned long isa_temp = arch->flags;
8614 if (ext != NULL)
8616 /* TO_PARSE string contains at least one extension. */
8617 enum aarch64_parse_opt_result ext_res
8618 = aarch64_parse_extension (ext, &isa_temp);
8620 if (ext_res != AARCH64_PARSE_OK)
8621 return ext_res;
8623 /* Extension parsing was successful. Confirm the result
8624 arch and ISA flags. */
8625 *res = arch;
8626 *isa_flags = isa_temp;
8627 return AARCH64_PARSE_OK;
8631 /* ARCH name not found in list. */
8632 return AARCH64_PARSE_INVALID_ARG;
8635 /* Parse the TO_PARSE string and put the result tuning in RES and the
8636 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8637 describing the parse result. If there is an error parsing, RES and
8638 ISA_FLAGS are left unchanged. */
8640 static enum aarch64_parse_opt_result
8641 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8642 unsigned long *isa_flags)
8644 char *ext;
8645 const struct processor *cpu;
8646 char *str = (char *) alloca (strlen (to_parse) + 1);
8647 size_t len;
8649 strcpy (str, to_parse);
8651 ext = strchr (str, '+');
8653 if (ext != NULL)
8654 len = ext - str;
8655 else
8656 len = strlen (str);
8658 if (len == 0)
8659 return AARCH64_PARSE_MISSING_ARG;
8662 /* Loop through the list of supported CPUs to find a match. */
8663 for (cpu = all_cores; cpu->name != NULL; cpu++)
8665 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8667 unsigned long isa_temp = cpu->flags;
8670 if (ext != NULL)
8672 /* TO_PARSE string contains at least one extension. */
8673 enum aarch64_parse_opt_result ext_res
8674 = aarch64_parse_extension (ext, &isa_temp);
8676 if (ext_res != AARCH64_PARSE_OK)
8677 return ext_res;
8679 /* Extension parsing was successfull. Confirm the result
8680 cpu and ISA flags. */
8681 *res = cpu;
8682 *isa_flags = isa_temp;
8683 return AARCH64_PARSE_OK;
8687 /* CPU name not found in list. */
8688 return AARCH64_PARSE_INVALID_ARG;
8691 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8692 Return an aarch64_parse_opt_result describing the parse result.
8693 If the parsing fails the RES does not change. */
8695 static enum aarch64_parse_opt_result
8696 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8698 const struct processor *cpu;
8699 char *str = (char *) alloca (strlen (to_parse) + 1);
8701 strcpy (str, to_parse);
8703 /* Loop through the list of supported CPUs to find a match. */
8704 for (cpu = all_cores; cpu->name != NULL; cpu++)
8706 if (strcmp (cpu->name, str) == 0)
8708 *res = cpu;
8709 return AARCH64_PARSE_OK;
8713 /* CPU name not found in list. */
8714 return AARCH64_PARSE_INVALID_ARG;
8717 /* Parse TOKEN, which has length LENGTH to see if it is an option
8718 described in FLAG. If it is, return the index bit for that fusion type.
8719 If not, error (printing OPTION_NAME) and return zero. */
8721 static unsigned int
8722 aarch64_parse_one_option_token (const char *token,
8723 size_t length,
8724 const struct aarch64_flag_desc *flag,
8725 const char *option_name)
8727 for (; flag->name != NULL; flag++)
8729 if (length == strlen (flag->name)
8730 && !strncmp (flag->name, token, length))
8731 return flag->flag;
8734 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8735 return 0;
8738 /* Parse OPTION which is a comma-separated list of flags to enable.
8739 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8740 default state we inherit from the CPU tuning structures. OPTION_NAME
8741 gives the top-level option we are parsing in the -moverride string,
8742 for use in error messages. */
8744 static unsigned int
8745 aarch64_parse_boolean_options (const char *option,
8746 const struct aarch64_flag_desc *flags,
8747 unsigned int initial_state,
8748 const char *option_name)
8750 const char separator = '.';
8751 const char* specs = option;
8752 const char* ntoken = option;
8753 unsigned int found_flags = initial_state;
8755 while ((ntoken = strchr (specs, separator)))
8757 size_t token_length = ntoken - specs;
8758 unsigned token_ops = aarch64_parse_one_option_token (specs,
8759 token_length,
8760 flags,
8761 option_name);
8762 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8763 in the token stream, reset the supported operations. So:
8765 adrp+add.cmp+branch.none.adrp+add
8767 would have the result of turning on only adrp+add fusion. */
8768 if (!token_ops)
8769 found_flags = 0;
8771 found_flags |= token_ops;
8772 specs = ++ntoken;
8775 /* We ended with a comma, print something. */
8776 if (!(*specs))
8778 error ("%s string ill-formed\n", option_name);
8779 return 0;
8782 /* We still have one more token to parse. */
8783 size_t token_length = strlen (specs);
8784 unsigned token_ops = aarch64_parse_one_option_token (specs,
8785 token_length,
8786 flags,
8787 option_name);
8788 if (!token_ops)
8789 found_flags = 0;
8791 found_flags |= token_ops;
8792 return found_flags;
8795 /* Support for overriding instruction fusion. */
8797 static void
8798 aarch64_parse_fuse_string (const char *fuse_string,
8799 struct tune_params *tune)
8801 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8802 aarch64_fusible_pairs,
8803 tune->fusible_ops,
8804 "fuse=");
8807 /* Support for overriding other tuning flags. */
8809 static void
8810 aarch64_parse_tune_string (const char *tune_string,
8811 struct tune_params *tune)
8813 tune->extra_tuning_flags
8814 = aarch64_parse_boolean_options (tune_string,
8815 aarch64_tuning_flags,
8816 tune->extra_tuning_flags,
8817 "tune=");
8820 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8821 we understand. If it is, extract the option string and handoff to
8822 the appropriate function. */
8824 void
8825 aarch64_parse_one_override_token (const char* token,
8826 size_t length,
8827 struct tune_params *tune)
8829 const struct aarch64_tuning_override_function *fn
8830 = aarch64_tuning_override_functions;
8832 const char *option_part = strchr (token, '=');
8833 if (!option_part)
8835 error ("tuning string missing in option (%s)", token);
8836 return;
8839 /* Get the length of the option name. */
8840 length = option_part - token;
8841 /* Skip the '=' to get to the option string. */
8842 option_part++;
8844 for (; fn->name != NULL; fn++)
8846 if (!strncmp (fn->name, token, length))
8848 fn->parse_override (option_part, tune);
8849 return;
8853 error ("unknown tuning option (%s)",token);
8854 return;
8857 /* A checking mechanism for the implementation of the tls size. */
8859 static void
8860 initialize_aarch64_tls_size (struct gcc_options *opts)
8862 if (aarch64_tls_size == 0)
8863 aarch64_tls_size = 24;
8865 switch (opts->x_aarch64_cmodel_var)
8867 case AARCH64_CMODEL_TINY:
8868 /* Both the default and maximum TLS size allowed under tiny is 1M which
8869 needs two instructions to address, so we clamp the size to 24. */
8870 if (aarch64_tls_size > 24)
8871 aarch64_tls_size = 24;
8872 break;
8873 case AARCH64_CMODEL_SMALL:
8874 /* The maximum TLS size allowed under small is 4G. */
8875 if (aarch64_tls_size > 32)
8876 aarch64_tls_size = 32;
8877 break;
8878 case AARCH64_CMODEL_LARGE:
8879 /* The maximum TLS size allowed under large is 16E.
8880 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8881 if (aarch64_tls_size > 48)
8882 aarch64_tls_size = 48;
8883 break;
8884 default:
8885 gcc_unreachable ();
8888 return;
8891 /* Parse STRING looking for options in the format:
8892 string :: option:string
8893 option :: name=substring
8894 name :: {a-z}
8895 substring :: defined by option. */
8897 static void
8898 aarch64_parse_override_string (const char* input_string,
8899 struct tune_params* tune)
8901 const char separator = ':';
8902 size_t string_length = strlen (input_string) + 1;
8903 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8904 char *string = string_root;
8905 strncpy (string, input_string, string_length);
8906 string[string_length - 1] = '\0';
8908 char* ntoken = string;
8910 while ((ntoken = strchr (string, separator)))
8912 size_t token_length = ntoken - string;
8913 /* Make this substring look like a string. */
8914 *ntoken = '\0';
8915 aarch64_parse_one_override_token (string, token_length, tune);
8916 string = ++ntoken;
8919 /* One last option to parse. */
8920 aarch64_parse_one_override_token (string, strlen (string), tune);
8921 free (string_root);
8925 static void
8926 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8928 /* The logic here is that if we are disabling all frame pointer generation
8929 then we do not need to disable leaf frame pointer generation as a
8930 separate operation. But if we are *only* disabling leaf frame pointer
8931 generation then we set flag_omit_frame_pointer to true, but in
8932 aarch64_frame_pointer_required we return false only for leaf functions.
8934 PR 70044: We have to be careful about being called multiple times for the
8935 same function. Once we have decided to set flag_omit_frame_pointer just
8936 so that we can omit leaf frame pointers, we must then not interpret a
8937 second call as meaning that all frame pointer generation should be
8938 omitted. We do this by setting flag_omit_frame_pointer to a special,
8939 non-zero value. */
8940 if (opts->x_flag_omit_frame_pointer == 2)
8941 opts->x_flag_omit_frame_pointer = 0;
8943 if (opts->x_flag_omit_frame_pointer)
8944 opts->x_flag_omit_leaf_frame_pointer = false;
8945 else if (opts->x_flag_omit_leaf_frame_pointer)
8946 opts->x_flag_omit_frame_pointer = 2;
8948 /* If not optimizing for size, set the default
8949 alignment to what the target wants. */
8950 if (!opts->x_optimize_size)
8952 if (opts->x_align_loops <= 0)
8953 opts->x_align_loops = aarch64_tune_params.loop_align;
8954 if (opts->x_align_jumps <= 0)
8955 opts->x_align_jumps = aarch64_tune_params.jump_align;
8956 if (opts->x_align_functions <= 0)
8957 opts->x_align_functions = aarch64_tune_params.function_align;
8960 /* We default to no pc-relative literal loads. */
8962 aarch64_pcrelative_literal_loads = false;
8964 /* If -mpc-relative-literal-loads is set on the command line, this
8965 implies that the user asked for PC relative literal loads. */
8966 if (opts->x_pcrelative_literal_loads == 1)
8967 aarch64_pcrelative_literal_loads = true;
8969 /* This is PR70113. When building the Linux kernel with
8970 CONFIG_ARM64_ERRATUM_843419, support for relocations
8971 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8972 removed from the kernel to avoid loading objects with possibly
8973 offending sequences. Without -mpc-relative-literal-loads we would
8974 generate such relocations, preventing the kernel build from
8975 succeeding. */
8976 if (opts->x_pcrelative_literal_loads == 2
8977 && TARGET_FIX_ERR_A53_843419)
8978 aarch64_pcrelative_literal_loads = true;
8980 /* In the tiny memory model it makes no sense to disallow PC relative
8981 literal pool loads. */
8982 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8983 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8984 aarch64_pcrelative_literal_loads = true;
8986 /* When enabling the lower precision Newton series for the square root, also
8987 enable it for the reciprocal square root, since the latter is an
8988 intermediary step for the former. */
8989 if (flag_mlow_precision_sqrt)
8990 flag_mrecip_low_precision_sqrt = true;
8993 /* 'Unpack' up the internal tuning structs and update the options
8994 in OPTS. The caller must have set up selected_tune and selected_arch
8995 as all the other target-specific codegen decisions are
8996 derived from them. */
8998 void
8999 aarch64_override_options_internal (struct gcc_options *opts)
9001 aarch64_tune_flags = selected_tune->flags;
9002 aarch64_tune = selected_tune->sched_core;
9003 /* Make a copy of the tuning parameters attached to the core, which
9004 we may later overwrite. */
9005 aarch64_tune_params = *(selected_tune->tune);
9006 aarch64_architecture_version = selected_arch->architecture_version;
9008 if (opts->x_aarch64_override_tune_string)
9009 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9010 &aarch64_tune_params);
9012 /* This target defaults to strict volatile bitfields. */
9013 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9014 opts->x_flag_strict_volatile_bitfields = 1;
9016 initialize_aarch64_code_model (opts);
9017 initialize_aarch64_tls_size (opts);
9019 int queue_depth = 0;
9020 switch (aarch64_tune_params.autoprefetcher_model)
9022 case tune_params::AUTOPREFETCHER_OFF:
9023 queue_depth = -1;
9024 break;
9025 case tune_params::AUTOPREFETCHER_WEAK:
9026 queue_depth = 0;
9027 break;
9028 case tune_params::AUTOPREFETCHER_STRONG:
9029 queue_depth = max_insn_queue_index + 1;
9030 break;
9031 default:
9032 gcc_unreachable ();
9035 /* We don't mind passing in global_options_set here as we don't use
9036 the *options_set structs anyway. */
9037 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9038 queue_depth,
9039 opts->x_param_values,
9040 global_options_set.x_param_values);
9042 /* Set up parameters to be used in prefetching algorithm. Do not
9043 override the defaults unless we are tuning for a core we have
9044 researched values for. */
9045 if (aarch64_tune_params.prefetch->num_slots > 0)
9046 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9047 aarch64_tune_params.prefetch->num_slots,
9048 opts->x_param_values,
9049 global_options_set.x_param_values);
9050 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9051 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9052 aarch64_tune_params.prefetch->l1_cache_size,
9053 opts->x_param_values,
9054 global_options_set.x_param_values);
9055 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9056 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9057 aarch64_tune_params.prefetch->l1_cache_line_size,
9058 opts->x_param_values,
9059 global_options_set.x_param_values);
9060 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9061 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9062 aarch64_tune_params.prefetch->l2_cache_size,
9063 opts->x_param_values,
9064 global_options_set.x_param_values);
9066 /* Enable sw prefetching at specified optimization level for
9067 CPUS that have prefetch. Lower optimization level threshold by 1
9068 when profiling is enabled. */
9069 if (opts->x_flag_prefetch_loop_arrays < 0
9070 && !opts->x_optimize_size
9071 && aarch64_tune_params.prefetch->default_opt_level >= 0
9072 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9073 opts->x_flag_prefetch_loop_arrays = 1;
9075 aarch64_override_options_after_change_1 (opts);
9078 /* Print a hint with a suggestion for a core or architecture name that
9079 most closely resembles what the user passed in STR. ARCH is true if
9080 the user is asking for an architecture name. ARCH is false if the user
9081 is asking for a core name. */
9083 static void
9084 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9086 auto_vec<const char *> candidates;
9087 const struct processor *entry = arch ? all_architectures : all_cores;
9088 for (; entry->name != NULL; entry++)
9089 candidates.safe_push (entry->name);
9090 char *s;
9091 const char *hint = candidates_list_and_hint (str, s, candidates);
9092 if (hint)
9093 inform (input_location, "valid arguments are: %s;"
9094 " did you mean %qs?", s, hint);
9095 XDELETEVEC (s);
9098 /* Print a hint with a suggestion for a core name that most closely resembles
9099 what the user passed in STR. */
9101 inline static void
9102 aarch64_print_hint_for_core (const char *str)
9104 aarch64_print_hint_for_core_or_arch (str, false);
9107 /* Print a hint with a suggestion for an architecture name that most closely
9108 resembles what the user passed in STR. */
9110 inline static void
9111 aarch64_print_hint_for_arch (const char *str)
9113 aarch64_print_hint_for_core_or_arch (str, true);
9116 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9117 specified in STR and throw errors if appropriate. Put the results if
9118 they are valid in RES and ISA_FLAGS. Return whether the option is
9119 valid. */
9121 static bool
9122 aarch64_validate_mcpu (const char *str, const struct processor **res,
9123 unsigned long *isa_flags)
9125 enum aarch64_parse_opt_result parse_res
9126 = aarch64_parse_cpu (str, res, isa_flags);
9128 if (parse_res == AARCH64_PARSE_OK)
9129 return true;
9131 switch (parse_res)
9133 case AARCH64_PARSE_MISSING_ARG:
9134 error ("missing cpu name in %<-mcpu=%s%>", str);
9135 break;
9136 case AARCH64_PARSE_INVALID_ARG:
9137 error ("unknown value %qs for -mcpu", str);
9138 aarch64_print_hint_for_core (str);
9139 break;
9140 case AARCH64_PARSE_INVALID_FEATURE:
9141 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9142 break;
9143 default:
9144 gcc_unreachable ();
9147 return false;
9150 /* Validate a command-line -march option. Parse the arch and extensions
9151 (if any) specified in STR and throw errors if appropriate. Put the
9152 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9153 option is valid. */
9155 static bool
9156 aarch64_validate_march (const char *str, const struct processor **res,
9157 unsigned long *isa_flags)
9159 enum aarch64_parse_opt_result parse_res
9160 = aarch64_parse_arch (str, res, isa_flags);
9162 if (parse_res == AARCH64_PARSE_OK)
9163 return true;
9165 switch (parse_res)
9167 case AARCH64_PARSE_MISSING_ARG:
9168 error ("missing arch name in %<-march=%s%>", str);
9169 break;
9170 case AARCH64_PARSE_INVALID_ARG:
9171 error ("unknown value %qs for -march", str);
9172 aarch64_print_hint_for_arch (str);
9173 break;
9174 case AARCH64_PARSE_INVALID_FEATURE:
9175 error ("invalid feature modifier in %<-march=%s%>", str);
9176 break;
9177 default:
9178 gcc_unreachable ();
9181 return false;
9184 /* Validate a command-line -mtune option. Parse the cpu
9185 specified in STR and throw errors if appropriate. Put the
9186 result, if it is valid, in RES. Return whether the option is
9187 valid. */
9189 static bool
9190 aarch64_validate_mtune (const char *str, const struct processor **res)
9192 enum aarch64_parse_opt_result parse_res
9193 = aarch64_parse_tune (str, res);
9195 if (parse_res == AARCH64_PARSE_OK)
9196 return true;
9198 switch (parse_res)
9200 case AARCH64_PARSE_MISSING_ARG:
9201 error ("missing cpu name in %<-mtune=%s%>", str);
9202 break;
9203 case AARCH64_PARSE_INVALID_ARG:
9204 error ("unknown value %qs for -mtune", str);
9205 aarch64_print_hint_for_core (str);
9206 break;
9207 default:
9208 gcc_unreachable ();
9210 return false;
9213 /* Return the CPU corresponding to the enum CPU.
9214 If it doesn't specify a cpu, return the default. */
9216 static const struct processor *
9217 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9219 if (cpu != aarch64_none)
9220 return &all_cores[cpu];
9222 /* The & 0x3f is to extract the bottom 6 bits that encode the
9223 default cpu as selected by the --with-cpu GCC configure option
9224 in config.gcc.
9225 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9226 flags mechanism should be reworked to make it more sane. */
9227 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9230 /* Return the architecture corresponding to the enum ARCH.
9231 If it doesn't specify a valid architecture, return the default. */
9233 static const struct processor *
9234 aarch64_get_arch (enum aarch64_arch arch)
9236 if (arch != aarch64_no_arch)
9237 return &all_architectures[arch];
9239 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9241 return &all_architectures[cpu->arch];
9244 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9245 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9246 tuning structs. In particular it must set selected_tune and
9247 aarch64_isa_flags that define the available ISA features and tuning
9248 decisions. It must also set selected_arch as this will be used to
9249 output the .arch asm tags for each function. */
9251 static void
9252 aarch64_override_options (void)
9254 unsigned long cpu_isa = 0;
9255 unsigned long arch_isa = 0;
9256 aarch64_isa_flags = 0;
9258 bool valid_cpu = true;
9259 bool valid_tune = true;
9260 bool valid_arch = true;
9262 selected_cpu = NULL;
9263 selected_arch = NULL;
9264 selected_tune = NULL;
9266 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9267 If either of -march or -mtune is given, they override their
9268 respective component of -mcpu. */
9269 if (aarch64_cpu_string)
9270 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9271 &cpu_isa);
9273 if (aarch64_arch_string)
9274 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9275 &arch_isa);
9277 if (aarch64_tune_string)
9278 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9280 /* If the user did not specify a processor, choose the default
9281 one for them. This will be the CPU set during configuration using
9282 --with-cpu, otherwise it is "generic". */
9283 if (!selected_cpu)
9285 if (selected_arch)
9287 selected_cpu = &all_cores[selected_arch->ident];
9288 aarch64_isa_flags = arch_isa;
9289 explicit_arch = selected_arch->arch;
9291 else
9293 /* Get default configure-time CPU. */
9294 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9295 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9298 if (selected_tune)
9299 explicit_tune_core = selected_tune->ident;
9301 /* If both -mcpu and -march are specified check that they are architecturally
9302 compatible, warn if they're not and prefer the -march ISA flags. */
9303 else if (selected_arch)
9305 if (selected_arch->arch != selected_cpu->arch)
9307 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9308 all_architectures[selected_cpu->arch].name,
9309 selected_arch->name);
9311 aarch64_isa_flags = arch_isa;
9312 explicit_arch = selected_arch->arch;
9313 explicit_tune_core = selected_tune ? selected_tune->ident
9314 : selected_cpu->ident;
9316 else
9318 /* -mcpu but no -march. */
9319 aarch64_isa_flags = cpu_isa;
9320 explicit_tune_core = selected_tune ? selected_tune->ident
9321 : selected_cpu->ident;
9322 gcc_assert (selected_cpu);
9323 selected_arch = &all_architectures[selected_cpu->arch];
9324 explicit_arch = selected_arch->arch;
9327 /* Set the arch as well as we will need it when outputing
9328 the .arch directive in assembly. */
9329 if (!selected_arch)
9331 gcc_assert (selected_cpu);
9332 selected_arch = &all_architectures[selected_cpu->arch];
9335 if (!selected_tune)
9336 selected_tune = selected_cpu;
9338 #ifndef HAVE_AS_MABI_OPTION
9339 /* The compiler may have been configured with 2.23.* binutils, which does
9340 not have support for ILP32. */
9341 if (TARGET_ILP32)
9342 error ("Assembler does not support -mabi=ilp32");
9343 #endif
9345 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9346 sorry ("Return address signing is only supported for -mabi=lp64");
9348 /* Make sure we properly set up the explicit options. */
9349 if ((aarch64_cpu_string && valid_cpu)
9350 || (aarch64_tune_string && valid_tune))
9351 gcc_assert (explicit_tune_core != aarch64_none);
9353 if ((aarch64_cpu_string && valid_cpu)
9354 || (aarch64_arch_string && valid_arch))
9355 gcc_assert (explicit_arch != aarch64_no_arch);
9357 aarch64_override_options_internal (&global_options);
9359 /* Save these options as the default ones in case we push and pop them later
9360 while processing functions with potential target attributes. */
9361 target_option_default_node = target_option_current_node
9362 = build_target_option_node (&global_options);
9365 /* Implement targetm.override_options_after_change. */
9367 static void
9368 aarch64_override_options_after_change (void)
9370 aarch64_override_options_after_change_1 (&global_options);
9373 static struct machine_function *
9374 aarch64_init_machine_status (void)
9376 struct machine_function *machine;
9377 machine = ggc_cleared_alloc<machine_function> ();
9378 return machine;
9381 void
9382 aarch64_init_expanders (void)
9384 init_machine_status = aarch64_init_machine_status;
9387 /* A checking mechanism for the implementation of the various code models. */
9388 static void
9389 initialize_aarch64_code_model (struct gcc_options *opts)
9391 if (opts->x_flag_pic)
9393 switch (opts->x_aarch64_cmodel_var)
9395 case AARCH64_CMODEL_TINY:
9396 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9397 break;
9398 case AARCH64_CMODEL_SMALL:
9399 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9400 aarch64_cmodel = (flag_pic == 2
9401 ? AARCH64_CMODEL_SMALL_PIC
9402 : AARCH64_CMODEL_SMALL_SPIC);
9403 #else
9404 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9405 #endif
9406 break;
9407 case AARCH64_CMODEL_LARGE:
9408 sorry ("code model %qs with -f%s", "large",
9409 opts->x_flag_pic > 1 ? "PIC" : "pic");
9410 break;
9411 default:
9412 gcc_unreachable ();
9415 else
9416 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9419 /* Implement TARGET_OPTION_SAVE. */
9421 static void
9422 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9424 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9427 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9428 using the information saved in PTR. */
9430 static void
9431 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9433 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9434 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9435 opts->x_explicit_arch = ptr->x_explicit_arch;
9436 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9437 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9439 aarch64_override_options_internal (opts);
9442 /* Implement TARGET_OPTION_PRINT. */
9444 static void
9445 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9447 const struct processor *cpu
9448 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9449 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9450 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9451 std::string extension
9452 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9454 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9455 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9456 arch->name, extension.c_str ());
9459 static GTY(()) tree aarch64_previous_fndecl;
9461 void
9462 aarch64_reset_previous_fndecl (void)
9464 aarch64_previous_fndecl = NULL;
9467 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9468 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9469 make sure optab availability predicates are recomputed when necessary. */
9471 void
9472 aarch64_save_restore_target_globals (tree new_tree)
9474 if (TREE_TARGET_GLOBALS (new_tree))
9475 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9476 else if (new_tree == target_option_default_node)
9477 restore_target_globals (&default_target_globals);
9478 else
9479 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9482 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9483 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9484 of the function, if such exists. This function may be called multiple
9485 times on a single function so use aarch64_previous_fndecl to avoid
9486 setting up identical state. */
9488 static void
9489 aarch64_set_current_function (tree fndecl)
9491 if (!fndecl || fndecl == aarch64_previous_fndecl)
9492 return;
9494 tree old_tree = (aarch64_previous_fndecl
9495 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9496 : NULL_TREE);
9498 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9500 /* If current function has no attributes but the previous one did,
9501 use the default node. */
9502 if (!new_tree && old_tree)
9503 new_tree = target_option_default_node;
9505 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9506 the default have been handled by aarch64_save_restore_target_globals from
9507 aarch64_pragma_target_parse. */
9508 if (old_tree == new_tree)
9509 return;
9511 aarch64_previous_fndecl = fndecl;
9513 /* First set the target options. */
9514 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9516 aarch64_save_restore_target_globals (new_tree);
9519 /* Enum describing the various ways we can handle attributes.
9520 In many cases we can reuse the generic option handling machinery. */
9522 enum aarch64_attr_opt_type
9524 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9525 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9526 aarch64_attr_enum, /* Attribute sets an enum variable. */
9527 aarch64_attr_custom /* Attribute requires a custom handling function. */
9530 /* All the information needed to handle a target attribute.
9531 NAME is the name of the attribute.
9532 ATTR_TYPE specifies the type of behavior of the attribute as described
9533 in the definition of enum aarch64_attr_opt_type.
9534 ALLOW_NEG is true if the attribute supports a "no-" form.
9535 HANDLER is the function that takes the attribute string and whether
9536 it is a pragma or attribute and handles the option. It is needed only
9537 when the ATTR_TYPE is aarch64_attr_custom.
9538 OPT_NUM is the enum specifying the option that the attribute modifies.
9539 This is needed for attributes that mirror the behavior of a command-line
9540 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9541 aarch64_attr_enum. */
9543 struct aarch64_attribute_info
9545 const char *name;
9546 enum aarch64_attr_opt_type attr_type;
9547 bool allow_neg;
9548 bool (*handler) (const char *, const char *);
9549 enum opt_code opt_num;
9552 /* Handle the ARCH_STR argument to the arch= target attribute.
9553 PRAGMA_OR_ATTR is used in potential error messages. */
9555 static bool
9556 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9558 const struct processor *tmp_arch = NULL;
9559 enum aarch64_parse_opt_result parse_res
9560 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9562 if (parse_res == AARCH64_PARSE_OK)
9564 gcc_assert (tmp_arch);
9565 selected_arch = tmp_arch;
9566 explicit_arch = selected_arch->arch;
9567 return true;
9570 switch (parse_res)
9572 case AARCH64_PARSE_MISSING_ARG:
9573 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9574 break;
9575 case AARCH64_PARSE_INVALID_ARG:
9576 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9577 aarch64_print_hint_for_arch (str);
9578 break;
9579 case AARCH64_PARSE_INVALID_FEATURE:
9580 error ("invalid feature modifier %qs for 'arch' target %s",
9581 str, pragma_or_attr);
9582 break;
9583 default:
9584 gcc_unreachable ();
9587 return false;
9590 /* Handle the argument CPU_STR to the cpu= target attribute.
9591 PRAGMA_OR_ATTR is used in potential error messages. */
9593 static bool
9594 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9596 const struct processor *tmp_cpu = NULL;
9597 enum aarch64_parse_opt_result parse_res
9598 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9600 if (parse_res == AARCH64_PARSE_OK)
9602 gcc_assert (tmp_cpu);
9603 selected_tune = tmp_cpu;
9604 explicit_tune_core = selected_tune->ident;
9606 selected_arch = &all_architectures[tmp_cpu->arch];
9607 explicit_arch = selected_arch->arch;
9608 return true;
9611 switch (parse_res)
9613 case AARCH64_PARSE_MISSING_ARG:
9614 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9615 break;
9616 case AARCH64_PARSE_INVALID_ARG:
9617 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9618 aarch64_print_hint_for_core (str);
9619 break;
9620 case AARCH64_PARSE_INVALID_FEATURE:
9621 error ("invalid feature modifier %qs for 'cpu' target %s",
9622 str, pragma_or_attr);
9623 break;
9624 default:
9625 gcc_unreachable ();
9628 return false;
9631 /* Handle the argument STR to the tune= target attribute.
9632 PRAGMA_OR_ATTR is used in potential error messages. */
9634 static bool
9635 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9637 const struct processor *tmp_tune = NULL;
9638 enum aarch64_parse_opt_result parse_res
9639 = aarch64_parse_tune (str, &tmp_tune);
9641 if (parse_res == AARCH64_PARSE_OK)
9643 gcc_assert (tmp_tune);
9644 selected_tune = tmp_tune;
9645 explicit_tune_core = selected_tune->ident;
9646 return true;
9649 switch (parse_res)
9651 case AARCH64_PARSE_INVALID_ARG:
9652 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9653 aarch64_print_hint_for_core (str);
9654 break;
9655 default:
9656 gcc_unreachable ();
9659 return false;
9662 /* Parse an architecture extensions target attribute string specified in STR.
9663 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9664 if successful. Update aarch64_isa_flags to reflect the ISA features
9665 modified.
9666 PRAGMA_OR_ATTR is used in potential error messages. */
9668 static bool
9669 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9671 enum aarch64_parse_opt_result parse_res;
9672 unsigned long isa_flags = aarch64_isa_flags;
9674 /* We allow "+nothing" in the beginning to clear out all architectural
9675 features if the user wants to handpick specific features. */
9676 if (strncmp ("+nothing", str, 8) == 0)
9678 isa_flags = 0;
9679 str += 8;
9682 parse_res = aarch64_parse_extension (str, &isa_flags);
9684 if (parse_res == AARCH64_PARSE_OK)
9686 aarch64_isa_flags = isa_flags;
9687 return true;
9690 switch (parse_res)
9692 case AARCH64_PARSE_MISSING_ARG:
9693 error ("missing feature modifier in target %s %qs",
9694 pragma_or_attr, str);
9695 break;
9697 case AARCH64_PARSE_INVALID_FEATURE:
9698 error ("invalid feature modifier in target %s %qs",
9699 pragma_or_attr, str);
9700 break;
9702 default:
9703 gcc_unreachable ();
9706 return false;
9709 /* The target attributes that we support. On top of these we also support just
9710 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9711 handled explicitly in aarch64_process_one_target_attr. */
9713 static const struct aarch64_attribute_info aarch64_attributes[] =
9715 { "general-regs-only", aarch64_attr_mask, false, NULL,
9716 OPT_mgeneral_regs_only },
9717 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9718 OPT_mfix_cortex_a53_835769 },
9719 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9720 OPT_mfix_cortex_a53_843419 },
9721 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9722 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9723 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9724 OPT_momit_leaf_frame_pointer },
9725 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9726 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9727 OPT_march_ },
9728 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9729 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9730 OPT_mtune_ },
9731 { "sign-return-address", aarch64_attr_enum, false, NULL,
9732 OPT_msign_return_address_ },
9733 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9736 /* Parse ARG_STR which contains the definition of one target attribute.
9737 Show appropriate errors if any or return true if the attribute is valid.
9738 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9739 we're processing a target attribute or pragma. */
9741 static bool
9742 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9744 bool invert = false;
9746 size_t len = strlen (arg_str);
9748 if (len == 0)
9750 error ("malformed target %s", pragma_or_attr);
9751 return false;
9754 char *str_to_check = (char *) alloca (len + 1);
9755 strcpy (str_to_check, arg_str);
9757 /* Skip leading whitespace. */
9758 while (*str_to_check == ' ' || *str_to_check == '\t')
9759 str_to_check++;
9761 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9762 It is easier to detect and handle it explicitly here rather than going
9763 through the machinery for the rest of the target attributes in this
9764 function. */
9765 if (*str_to_check == '+')
9766 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9768 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9770 invert = true;
9771 str_to_check += 3;
9773 char *arg = strchr (str_to_check, '=');
9775 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9776 and point ARG to "foo". */
9777 if (arg)
9779 *arg = '\0';
9780 arg++;
9782 const struct aarch64_attribute_info *p_attr;
9783 bool found = false;
9784 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9786 /* If the names don't match up, or the user has given an argument
9787 to an attribute that doesn't accept one, or didn't give an argument
9788 to an attribute that expects one, fail to match. */
9789 if (strcmp (str_to_check, p_attr->name) != 0)
9790 continue;
9792 found = true;
9793 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9794 || p_attr->attr_type == aarch64_attr_enum;
9796 if (attr_need_arg_p ^ (arg != NULL))
9798 error ("target %s %qs does not accept an argument",
9799 pragma_or_attr, str_to_check);
9800 return false;
9803 /* If the name matches but the attribute does not allow "no-" versions
9804 then we can't match. */
9805 if (invert && !p_attr->allow_neg)
9807 error ("target %s %qs does not allow a negated form",
9808 pragma_or_attr, str_to_check);
9809 return false;
9812 switch (p_attr->attr_type)
9814 /* Has a custom handler registered.
9815 For example, cpu=, arch=, tune=. */
9816 case aarch64_attr_custom:
9817 gcc_assert (p_attr->handler);
9818 if (!p_attr->handler (arg, pragma_or_attr))
9819 return false;
9820 break;
9822 /* Either set or unset a boolean option. */
9823 case aarch64_attr_bool:
9825 struct cl_decoded_option decoded;
9827 generate_option (p_attr->opt_num, NULL, !invert,
9828 CL_TARGET, &decoded);
9829 aarch64_handle_option (&global_options, &global_options_set,
9830 &decoded, input_location);
9831 break;
9833 /* Set or unset a bit in the target_flags. aarch64_handle_option
9834 should know what mask to apply given the option number. */
9835 case aarch64_attr_mask:
9837 struct cl_decoded_option decoded;
9838 /* We only need to specify the option number.
9839 aarch64_handle_option will know which mask to apply. */
9840 decoded.opt_index = p_attr->opt_num;
9841 decoded.value = !invert;
9842 aarch64_handle_option (&global_options, &global_options_set,
9843 &decoded, input_location);
9844 break;
9846 /* Use the option setting machinery to set an option to an enum. */
9847 case aarch64_attr_enum:
9849 gcc_assert (arg);
9850 bool valid;
9851 int value;
9852 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9853 &value, CL_TARGET);
9854 if (valid)
9856 set_option (&global_options, NULL, p_attr->opt_num, value,
9857 NULL, DK_UNSPECIFIED, input_location,
9858 global_dc);
9860 else
9862 error ("target %s %s=%s is not valid",
9863 pragma_or_attr, str_to_check, arg);
9865 break;
9867 default:
9868 gcc_unreachable ();
9872 /* If we reached here we either have found an attribute and validated
9873 it or didn't match any. If we matched an attribute but its arguments
9874 were malformed we will have returned false already. */
9875 return found;
9878 /* Count how many times the character C appears in
9879 NULL-terminated string STR. */
9881 static unsigned int
9882 num_occurences_in_str (char c, char *str)
9884 unsigned int res = 0;
9885 while (*str != '\0')
9887 if (*str == c)
9888 res++;
9890 str++;
9893 return res;
9896 /* Parse the tree in ARGS that contains the target attribute information
9897 and update the global target options space. PRAGMA_OR_ATTR is a string
9898 to be used in error messages, specifying whether this is processing
9899 a target attribute or a target pragma. */
9901 bool
9902 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9904 if (TREE_CODE (args) == TREE_LIST)
9908 tree head = TREE_VALUE (args);
9909 if (head)
9911 if (!aarch64_process_target_attr (head, pragma_or_attr))
9912 return false;
9914 args = TREE_CHAIN (args);
9915 } while (args);
9917 return true;
9920 if (TREE_CODE (args) != STRING_CST)
9922 error ("attribute %<target%> argument not a string");
9923 return false;
9926 size_t len = strlen (TREE_STRING_POINTER (args));
9927 char *str_to_check = (char *) alloca (len + 1);
9928 strcpy (str_to_check, TREE_STRING_POINTER (args));
9930 if (len == 0)
9932 error ("malformed target %s value", pragma_or_attr);
9933 return false;
9936 /* Used to catch empty spaces between commas i.e.
9937 attribute ((target ("attr1,,attr2"))). */
9938 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9940 /* Handle multiple target attributes separated by ','. */
9941 char *token = strtok (str_to_check, ",");
9943 unsigned int num_attrs = 0;
9944 while (token)
9946 num_attrs++;
9947 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9949 error ("target %s %qs is invalid", pragma_or_attr, token);
9950 return false;
9953 token = strtok (NULL, ",");
9956 if (num_attrs != num_commas + 1)
9958 error ("malformed target %s list %qs",
9959 pragma_or_attr, TREE_STRING_POINTER (args));
9960 return false;
9963 return true;
9966 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9967 process attribute ((target ("..."))). */
9969 static bool
9970 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9972 struct cl_target_option cur_target;
9973 bool ret;
9974 tree old_optimize;
9975 tree new_target, new_optimize;
9976 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9978 /* If what we're processing is the current pragma string then the
9979 target option node is already stored in target_option_current_node
9980 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9981 having to re-parse the string. This is especially useful to keep
9982 arm_neon.h compile times down since that header contains a lot
9983 of intrinsics enclosed in pragmas. */
9984 if (!existing_target && args == current_target_pragma)
9986 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9987 return true;
9989 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9991 old_optimize = build_optimization_node (&global_options);
9992 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9994 /* If the function changed the optimization levels as well as setting
9995 target options, start with the optimizations specified. */
9996 if (func_optimize && func_optimize != old_optimize)
9997 cl_optimization_restore (&global_options,
9998 TREE_OPTIMIZATION (func_optimize));
10000 /* Save the current target options to restore at the end. */
10001 cl_target_option_save (&cur_target, &global_options);
10003 /* If fndecl already has some target attributes applied to it, unpack
10004 them so that we add this attribute on top of them, rather than
10005 overwriting them. */
10006 if (existing_target)
10008 struct cl_target_option *existing_options
10009 = TREE_TARGET_OPTION (existing_target);
10011 if (existing_options)
10012 cl_target_option_restore (&global_options, existing_options);
10014 else
10015 cl_target_option_restore (&global_options,
10016 TREE_TARGET_OPTION (target_option_current_node));
10019 ret = aarch64_process_target_attr (args, "attribute");
10021 /* Set up any additional state. */
10022 if (ret)
10024 aarch64_override_options_internal (&global_options);
10025 /* Initialize SIMD builtins if we haven't already.
10026 Set current_target_pragma to NULL for the duration so that
10027 the builtin initialization code doesn't try to tag the functions
10028 being built with the attributes specified by any current pragma, thus
10029 going into an infinite recursion. */
10030 if (TARGET_SIMD)
10032 tree saved_current_target_pragma = current_target_pragma;
10033 current_target_pragma = NULL;
10034 aarch64_init_simd_builtins ();
10035 current_target_pragma = saved_current_target_pragma;
10037 new_target = build_target_option_node (&global_options);
10039 else
10040 new_target = NULL;
10042 new_optimize = build_optimization_node (&global_options);
10044 if (fndecl && ret)
10046 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10048 if (old_optimize != new_optimize)
10049 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10052 cl_target_option_restore (&global_options, &cur_target);
10054 if (old_optimize != new_optimize)
10055 cl_optimization_restore (&global_options,
10056 TREE_OPTIMIZATION (old_optimize));
10057 return ret;
10060 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10061 tri-bool options (yes, no, don't care) and the default value is
10062 DEF, determine whether to reject inlining. */
10064 static bool
10065 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10066 int dont_care, int def)
10068 /* If the callee doesn't care, always allow inlining. */
10069 if (callee == dont_care)
10070 return true;
10072 /* If the caller doesn't care, always allow inlining. */
10073 if (caller == dont_care)
10074 return true;
10076 /* Otherwise, allow inlining if either the callee and caller values
10077 agree, or if the callee is using the default value. */
10078 return (callee == caller || callee == def);
10081 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10082 to inline CALLEE into CALLER based on target-specific info.
10083 Make sure that the caller and callee have compatible architectural
10084 features. Then go through the other possible target attributes
10085 and see if they can block inlining. Try not to reject always_inline
10086 callees unless they are incompatible architecturally. */
10088 static bool
10089 aarch64_can_inline_p (tree caller, tree callee)
10091 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10092 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10094 /* If callee has no option attributes, then it is ok to inline. */
10095 if (!callee_tree)
10096 return true;
10098 struct cl_target_option *caller_opts
10099 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10100 : target_option_default_node);
10102 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10105 /* Callee's ISA flags should be a subset of the caller's. */
10106 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10107 != callee_opts->x_aarch64_isa_flags)
10108 return false;
10110 /* Allow non-strict aligned functions inlining into strict
10111 aligned ones. */
10112 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10113 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10114 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10115 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10116 return false;
10118 bool always_inline = lookup_attribute ("always_inline",
10119 DECL_ATTRIBUTES (callee));
10121 /* If the architectural features match up and the callee is always_inline
10122 then the other attributes don't matter. */
10123 if (always_inline)
10124 return true;
10126 if (caller_opts->x_aarch64_cmodel_var
10127 != callee_opts->x_aarch64_cmodel_var)
10128 return false;
10130 if (caller_opts->x_aarch64_tls_dialect
10131 != callee_opts->x_aarch64_tls_dialect)
10132 return false;
10134 /* Honour explicit requests to workaround errata. */
10135 if (!aarch64_tribools_ok_for_inlining_p (
10136 caller_opts->x_aarch64_fix_a53_err835769,
10137 callee_opts->x_aarch64_fix_a53_err835769,
10138 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10139 return false;
10141 if (!aarch64_tribools_ok_for_inlining_p (
10142 caller_opts->x_aarch64_fix_a53_err843419,
10143 callee_opts->x_aarch64_fix_a53_err843419,
10144 2, TARGET_FIX_ERR_A53_843419))
10145 return false;
10147 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10148 caller and calle and they don't match up, reject inlining. */
10149 if (!aarch64_tribools_ok_for_inlining_p (
10150 caller_opts->x_flag_omit_leaf_frame_pointer,
10151 callee_opts->x_flag_omit_leaf_frame_pointer,
10152 2, 1))
10153 return false;
10155 /* If the callee has specific tuning overrides, respect them. */
10156 if (callee_opts->x_aarch64_override_tune_string != NULL
10157 && caller_opts->x_aarch64_override_tune_string == NULL)
10158 return false;
10160 /* If the user specified tuning override strings for the
10161 caller and callee and they don't match up, reject inlining.
10162 We just do a string compare here, we don't analyze the meaning
10163 of the string, as it would be too costly for little gain. */
10164 if (callee_opts->x_aarch64_override_tune_string
10165 && caller_opts->x_aarch64_override_tune_string
10166 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10167 caller_opts->x_aarch64_override_tune_string) != 0))
10168 return false;
10170 return true;
10173 /* Return true if SYMBOL_REF X binds locally. */
10175 static bool
10176 aarch64_symbol_binds_local_p (const_rtx x)
10178 return (SYMBOL_REF_DECL (x)
10179 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10180 : SYMBOL_REF_LOCAL_P (x));
10183 /* Return true if SYMBOL_REF X is thread local */
10184 static bool
10185 aarch64_tls_symbol_p (rtx x)
10187 if (! TARGET_HAVE_TLS)
10188 return false;
10190 if (GET_CODE (x) != SYMBOL_REF)
10191 return false;
10193 return SYMBOL_REF_TLS_MODEL (x) != 0;
10196 /* Classify a TLS symbol into one of the TLS kinds. */
10197 enum aarch64_symbol_type
10198 aarch64_classify_tls_symbol (rtx x)
10200 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10202 switch (tls_kind)
10204 case TLS_MODEL_GLOBAL_DYNAMIC:
10205 case TLS_MODEL_LOCAL_DYNAMIC:
10206 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10208 case TLS_MODEL_INITIAL_EXEC:
10209 switch (aarch64_cmodel)
10211 case AARCH64_CMODEL_TINY:
10212 case AARCH64_CMODEL_TINY_PIC:
10213 return SYMBOL_TINY_TLSIE;
10214 default:
10215 return SYMBOL_SMALL_TLSIE;
10218 case TLS_MODEL_LOCAL_EXEC:
10219 if (aarch64_tls_size == 12)
10220 return SYMBOL_TLSLE12;
10221 else if (aarch64_tls_size == 24)
10222 return SYMBOL_TLSLE24;
10223 else if (aarch64_tls_size == 32)
10224 return SYMBOL_TLSLE32;
10225 else if (aarch64_tls_size == 48)
10226 return SYMBOL_TLSLE48;
10227 else
10228 gcc_unreachable ();
10230 case TLS_MODEL_EMULATED:
10231 case TLS_MODEL_NONE:
10232 return SYMBOL_FORCE_TO_MEM;
10234 default:
10235 gcc_unreachable ();
10239 /* Return the method that should be used to access SYMBOL_REF or
10240 LABEL_REF X. */
10242 enum aarch64_symbol_type
10243 aarch64_classify_symbol (rtx x, rtx offset)
10245 if (GET_CODE (x) == LABEL_REF)
10247 switch (aarch64_cmodel)
10249 case AARCH64_CMODEL_LARGE:
10250 return SYMBOL_FORCE_TO_MEM;
10252 case AARCH64_CMODEL_TINY_PIC:
10253 case AARCH64_CMODEL_TINY:
10254 return SYMBOL_TINY_ABSOLUTE;
10256 case AARCH64_CMODEL_SMALL_SPIC:
10257 case AARCH64_CMODEL_SMALL_PIC:
10258 case AARCH64_CMODEL_SMALL:
10259 return SYMBOL_SMALL_ABSOLUTE;
10261 default:
10262 gcc_unreachable ();
10266 if (GET_CODE (x) == SYMBOL_REF)
10268 if (aarch64_tls_symbol_p (x))
10269 return aarch64_classify_tls_symbol (x);
10271 switch (aarch64_cmodel)
10273 case AARCH64_CMODEL_TINY:
10274 /* When we retrieve symbol + offset address, we have to make sure
10275 the offset does not cause overflow of the final address. But
10276 we have no way of knowing the address of symbol at compile time
10277 so we can't accurately say if the distance between the PC and
10278 symbol + offset is outside the addressible range of +/-1M in the
10279 TINY code model. So we rely on images not being greater than
10280 1M and cap the offset at 1M and anything beyond 1M will have to
10281 be loaded using an alternative mechanism. Furthermore if the
10282 symbol is a weak reference to something that isn't known to
10283 resolve to a symbol in this module, then force to memory. */
10284 if ((SYMBOL_REF_WEAK (x)
10285 && !aarch64_symbol_binds_local_p (x))
10286 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10287 return SYMBOL_FORCE_TO_MEM;
10288 return SYMBOL_TINY_ABSOLUTE;
10290 case AARCH64_CMODEL_SMALL:
10291 /* Same reasoning as the tiny code model, but the offset cap here is
10292 4G. */
10293 if ((SYMBOL_REF_WEAK (x)
10294 && !aarch64_symbol_binds_local_p (x))
10295 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10296 HOST_WIDE_INT_C (4294967264)))
10297 return SYMBOL_FORCE_TO_MEM;
10298 return SYMBOL_SMALL_ABSOLUTE;
10300 case AARCH64_CMODEL_TINY_PIC:
10301 if (!aarch64_symbol_binds_local_p (x))
10302 return SYMBOL_TINY_GOT;
10303 return SYMBOL_TINY_ABSOLUTE;
10305 case AARCH64_CMODEL_SMALL_SPIC:
10306 case AARCH64_CMODEL_SMALL_PIC:
10307 if (!aarch64_symbol_binds_local_p (x))
10308 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10309 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10310 return SYMBOL_SMALL_ABSOLUTE;
10312 case AARCH64_CMODEL_LARGE:
10313 /* This is alright even in PIC code as the constant
10314 pool reference is always PC relative and within
10315 the same translation unit. */
10316 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10317 return SYMBOL_SMALL_ABSOLUTE;
10318 else
10319 return SYMBOL_FORCE_TO_MEM;
10321 default:
10322 gcc_unreachable ();
10326 /* By default push everything into the constant pool. */
10327 return SYMBOL_FORCE_TO_MEM;
10330 bool
10331 aarch64_constant_address_p (rtx x)
10333 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10336 bool
10337 aarch64_legitimate_pic_operand_p (rtx x)
10339 if (GET_CODE (x) == SYMBOL_REF
10340 || (GET_CODE (x) == CONST
10341 && GET_CODE (XEXP (x, 0)) == PLUS
10342 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10343 return false;
10345 return true;
10348 /* Return true if X holds either a quarter-precision or
10349 floating-point +0.0 constant. */
10350 static bool
10351 aarch64_valid_floating_const (rtx x)
10353 if (!CONST_DOUBLE_P (x))
10354 return false;
10356 /* This call determines which constants can be used in mov<mode>
10357 as integer moves instead of constant loads. */
10358 if (aarch64_float_const_rtx_p (x))
10359 return true;
10361 return aarch64_float_const_representable_p (x);
10364 static bool
10365 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10367 /* Do not allow vector struct mode constants. We could support
10368 0 and -1 easily, but they need support in aarch64-simd.md. */
10369 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10370 return false;
10372 /* For these cases we never want to use a literal load.
10373 As such we have to prevent the compiler from forcing these
10374 to memory. */
10375 if ((GET_CODE (x) == CONST_VECTOR
10376 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10377 || CONST_INT_P (x)
10378 || aarch64_valid_floating_const (x)
10379 || aarch64_can_const_movi_rtx_p (x, mode)
10380 || aarch64_float_const_rtx_p (x))
10381 return !targetm.cannot_force_const_mem (mode, x);
10383 if (GET_CODE (x) == HIGH
10384 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10385 return true;
10387 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10388 so spilling them is better than rematerialization. */
10389 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10390 return true;
10392 return aarch64_constant_address_p (x);
10396 aarch64_load_tp (rtx target)
10398 if (!target
10399 || GET_MODE (target) != Pmode
10400 || !register_operand (target, Pmode))
10401 target = gen_reg_rtx (Pmode);
10403 /* Can return in any reg. */
10404 emit_insn (gen_aarch64_load_tp_hard (target));
10405 return target;
10408 /* On AAPCS systems, this is the "struct __va_list". */
10409 static GTY(()) tree va_list_type;
10411 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10412 Return the type to use as __builtin_va_list.
10414 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10416 struct __va_list
10418 void *__stack;
10419 void *__gr_top;
10420 void *__vr_top;
10421 int __gr_offs;
10422 int __vr_offs;
10423 }; */
10425 static tree
10426 aarch64_build_builtin_va_list (void)
10428 tree va_list_name;
10429 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10431 /* Create the type. */
10432 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10433 /* Give it the required name. */
10434 va_list_name = build_decl (BUILTINS_LOCATION,
10435 TYPE_DECL,
10436 get_identifier ("__va_list"),
10437 va_list_type);
10438 DECL_ARTIFICIAL (va_list_name) = 1;
10439 TYPE_NAME (va_list_type) = va_list_name;
10440 TYPE_STUB_DECL (va_list_type) = va_list_name;
10442 /* Create the fields. */
10443 f_stack = build_decl (BUILTINS_LOCATION,
10444 FIELD_DECL, get_identifier ("__stack"),
10445 ptr_type_node);
10446 f_grtop = build_decl (BUILTINS_LOCATION,
10447 FIELD_DECL, get_identifier ("__gr_top"),
10448 ptr_type_node);
10449 f_vrtop = build_decl (BUILTINS_LOCATION,
10450 FIELD_DECL, get_identifier ("__vr_top"),
10451 ptr_type_node);
10452 f_groff = build_decl (BUILTINS_LOCATION,
10453 FIELD_DECL, get_identifier ("__gr_offs"),
10454 integer_type_node);
10455 f_vroff = build_decl (BUILTINS_LOCATION,
10456 FIELD_DECL, get_identifier ("__vr_offs"),
10457 integer_type_node);
10459 /* Tell tree-stdarg pass about our internal offset fields.
10460 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10461 purpose to identify whether the code is updating va_list internal
10462 offset fields through irregular way. */
10463 va_list_gpr_counter_field = f_groff;
10464 va_list_fpr_counter_field = f_vroff;
10466 DECL_ARTIFICIAL (f_stack) = 1;
10467 DECL_ARTIFICIAL (f_grtop) = 1;
10468 DECL_ARTIFICIAL (f_vrtop) = 1;
10469 DECL_ARTIFICIAL (f_groff) = 1;
10470 DECL_ARTIFICIAL (f_vroff) = 1;
10472 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10473 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10474 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10475 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10476 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10478 TYPE_FIELDS (va_list_type) = f_stack;
10479 DECL_CHAIN (f_stack) = f_grtop;
10480 DECL_CHAIN (f_grtop) = f_vrtop;
10481 DECL_CHAIN (f_vrtop) = f_groff;
10482 DECL_CHAIN (f_groff) = f_vroff;
10484 /* Compute its layout. */
10485 layout_type (va_list_type);
10487 return va_list_type;
10490 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10491 static void
10492 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10494 const CUMULATIVE_ARGS *cum;
10495 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10496 tree stack, grtop, vrtop, groff, vroff;
10497 tree t;
10498 int gr_save_area_size = cfun->va_list_gpr_size;
10499 int vr_save_area_size = cfun->va_list_fpr_size;
10500 int vr_offset;
10502 cum = &crtl->args.info;
10503 if (cfun->va_list_gpr_size)
10504 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10505 cfun->va_list_gpr_size);
10506 if (cfun->va_list_fpr_size)
10507 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10508 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10510 if (!TARGET_FLOAT)
10512 gcc_assert (cum->aapcs_nvrn == 0);
10513 vr_save_area_size = 0;
10516 f_stack = TYPE_FIELDS (va_list_type_node);
10517 f_grtop = DECL_CHAIN (f_stack);
10518 f_vrtop = DECL_CHAIN (f_grtop);
10519 f_groff = DECL_CHAIN (f_vrtop);
10520 f_vroff = DECL_CHAIN (f_groff);
10522 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10523 NULL_TREE);
10524 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10525 NULL_TREE);
10526 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10527 NULL_TREE);
10528 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10529 NULL_TREE);
10530 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10531 NULL_TREE);
10533 /* Emit code to initialize STACK, which points to the next varargs stack
10534 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10535 by named arguments. STACK is 8-byte aligned. */
10536 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10537 if (cum->aapcs_stack_size > 0)
10538 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10539 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10540 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10542 /* Emit code to initialize GRTOP, the top of the GR save area.
10543 virtual_incoming_args_rtx should have been 16 byte aligned. */
10544 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10545 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10546 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10548 /* Emit code to initialize VRTOP, the top of the VR save area.
10549 This address is gr_save_area_bytes below GRTOP, rounded
10550 down to the next 16-byte boundary. */
10551 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10552 vr_offset = ROUND_UP (gr_save_area_size,
10553 STACK_BOUNDARY / BITS_PER_UNIT);
10555 if (vr_offset)
10556 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10557 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10558 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10560 /* Emit code to initialize GROFF, the offset from GRTOP of the
10561 next GPR argument. */
10562 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10563 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10564 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10566 /* Likewise emit code to initialize VROFF, the offset from FTOP
10567 of the next VR argument. */
10568 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10569 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10570 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10573 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10575 static tree
10576 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10577 gimple_seq *post_p ATTRIBUTE_UNUSED)
10579 tree addr;
10580 bool indirect_p;
10581 bool is_ha; /* is HFA or HVA. */
10582 bool dw_align; /* double-word align. */
10583 machine_mode ag_mode = VOIDmode;
10584 int nregs;
10585 machine_mode mode;
10587 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10588 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10589 HOST_WIDE_INT size, rsize, adjust, align;
10590 tree t, u, cond1, cond2;
10592 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10593 if (indirect_p)
10594 type = build_pointer_type (type);
10596 mode = TYPE_MODE (type);
10598 f_stack = TYPE_FIELDS (va_list_type_node);
10599 f_grtop = DECL_CHAIN (f_stack);
10600 f_vrtop = DECL_CHAIN (f_grtop);
10601 f_groff = DECL_CHAIN (f_vrtop);
10602 f_vroff = DECL_CHAIN (f_groff);
10604 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10605 f_stack, NULL_TREE);
10606 size = int_size_in_bytes (type);
10607 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10609 dw_align = false;
10610 adjust = 0;
10611 if (aarch64_vfp_is_call_or_return_candidate (mode,
10612 type,
10613 &ag_mode,
10614 &nregs,
10615 &is_ha))
10617 /* TYPE passed in fp/simd registers. */
10618 if (!TARGET_FLOAT)
10619 aarch64_err_no_fpadvsimd (mode, "varargs");
10621 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10622 unshare_expr (valist), f_vrtop, NULL_TREE);
10623 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10624 unshare_expr (valist), f_vroff, NULL_TREE);
10626 rsize = nregs * UNITS_PER_VREG;
10628 if (is_ha)
10630 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10631 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10633 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10634 && size < UNITS_PER_VREG)
10636 adjust = UNITS_PER_VREG - size;
10639 else
10641 /* TYPE passed in general registers. */
10642 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10643 unshare_expr (valist), f_grtop, NULL_TREE);
10644 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10645 unshare_expr (valist), f_groff, NULL_TREE);
10646 rsize = ROUND_UP (size, UNITS_PER_WORD);
10647 nregs = rsize / UNITS_PER_WORD;
10649 if (align > 8)
10650 dw_align = true;
10652 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10653 && size < UNITS_PER_WORD)
10655 adjust = UNITS_PER_WORD - size;
10659 /* Get a local temporary for the field value. */
10660 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10662 /* Emit code to branch if off >= 0. */
10663 t = build2 (GE_EXPR, boolean_type_node, off,
10664 build_int_cst (TREE_TYPE (off), 0));
10665 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10667 if (dw_align)
10669 /* Emit: offs = (offs + 15) & -16. */
10670 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10671 build_int_cst (TREE_TYPE (off), 15));
10672 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10673 build_int_cst (TREE_TYPE (off), -16));
10674 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10676 else
10677 roundup = NULL;
10679 /* Update ap.__[g|v]r_offs */
10680 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10681 build_int_cst (TREE_TYPE (off), rsize));
10682 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10684 /* String up. */
10685 if (roundup)
10686 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10688 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10689 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10690 build_int_cst (TREE_TYPE (f_off), 0));
10691 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10693 /* String up: make sure the assignment happens before the use. */
10694 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10695 COND_EXPR_ELSE (cond1) = t;
10697 /* Prepare the trees handling the argument that is passed on the stack;
10698 the top level node will store in ON_STACK. */
10699 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10700 if (align > 8)
10702 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10703 t = fold_convert (intDI_type_node, arg);
10704 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10705 build_int_cst (TREE_TYPE (t), 15));
10706 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10707 build_int_cst (TREE_TYPE (t), -16));
10708 t = fold_convert (TREE_TYPE (arg), t);
10709 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10711 else
10712 roundup = NULL;
10713 /* Advance ap.__stack */
10714 t = fold_convert (intDI_type_node, arg);
10715 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10716 build_int_cst (TREE_TYPE (t), size + 7));
10717 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10718 build_int_cst (TREE_TYPE (t), -8));
10719 t = fold_convert (TREE_TYPE (arg), t);
10720 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10721 /* String up roundup and advance. */
10722 if (roundup)
10723 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10724 /* String up with arg */
10725 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10726 /* Big-endianness related address adjustment. */
10727 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10728 && size < UNITS_PER_WORD)
10730 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10731 size_int (UNITS_PER_WORD - size));
10732 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10735 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10736 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10738 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10739 t = off;
10740 if (adjust)
10741 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10742 build_int_cst (TREE_TYPE (off), adjust));
10744 t = fold_convert (sizetype, t);
10745 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10747 if (is_ha)
10749 /* type ha; // treat as "struct {ftype field[n];}"
10750 ... [computing offs]
10751 for (i = 0; i <nregs; ++i, offs += 16)
10752 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10753 return ha; */
10754 int i;
10755 tree tmp_ha, field_t, field_ptr_t;
10757 /* Declare a local variable. */
10758 tmp_ha = create_tmp_var_raw (type, "ha");
10759 gimple_add_tmp_var (tmp_ha);
10761 /* Establish the base type. */
10762 switch (ag_mode)
10764 case E_SFmode:
10765 field_t = float_type_node;
10766 field_ptr_t = float_ptr_type_node;
10767 break;
10768 case E_DFmode:
10769 field_t = double_type_node;
10770 field_ptr_t = double_ptr_type_node;
10771 break;
10772 case E_TFmode:
10773 field_t = long_double_type_node;
10774 field_ptr_t = long_double_ptr_type_node;
10775 break;
10776 case E_HFmode:
10777 field_t = aarch64_fp16_type_node;
10778 field_ptr_t = aarch64_fp16_ptr_type_node;
10779 break;
10780 case E_V2SImode:
10781 case E_V4SImode:
10783 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10784 field_t = build_vector_type_for_mode (innertype, ag_mode);
10785 field_ptr_t = build_pointer_type (field_t);
10787 break;
10788 default:
10789 gcc_assert (0);
10792 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10793 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10794 addr = t;
10795 t = fold_convert (field_ptr_t, addr);
10796 t = build2 (MODIFY_EXPR, field_t,
10797 build1 (INDIRECT_REF, field_t, tmp_ha),
10798 build1 (INDIRECT_REF, field_t, t));
10800 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10801 for (i = 1; i < nregs; ++i)
10803 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10804 u = fold_convert (field_ptr_t, addr);
10805 u = build2 (MODIFY_EXPR, field_t,
10806 build2 (MEM_REF, field_t, tmp_ha,
10807 build_int_cst (field_ptr_t,
10808 (i *
10809 int_size_in_bytes (field_t)))),
10810 build1 (INDIRECT_REF, field_t, u));
10811 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10814 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10815 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10818 COND_EXPR_ELSE (cond2) = t;
10819 addr = fold_convert (build_pointer_type (type), cond1);
10820 addr = build_va_arg_indirect_ref (addr);
10822 if (indirect_p)
10823 addr = build_va_arg_indirect_ref (addr);
10825 return addr;
10828 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10830 static void
10831 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10832 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10833 int no_rtl)
10835 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10836 CUMULATIVE_ARGS local_cum;
10837 int gr_saved = cfun->va_list_gpr_size;
10838 int vr_saved = cfun->va_list_fpr_size;
10840 /* The caller has advanced CUM up to, but not beyond, the last named
10841 argument. Advance a local copy of CUM past the last "real" named
10842 argument, to find out how many registers are left over. */
10843 local_cum = *cum;
10844 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10846 /* Found out how many registers we need to save.
10847 Honor tree-stdvar analysis results. */
10848 if (cfun->va_list_gpr_size)
10849 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10850 cfun->va_list_gpr_size / UNITS_PER_WORD);
10851 if (cfun->va_list_fpr_size)
10852 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10853 cfun->va_list_fpr_size / UNITS_PER_VREG);
10855 if (!TARGET_FLOAT)
10857 gcc_assert (local_cum.aapcs_nvrn == 0);
10858 vr_saved = 0;
10861 if (!no_rtl)
10863 if (gr_saved > 0)
10865 rtx ptr, mem;
10867 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10868 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10869 - gr_saved * UNITS_PER_WORD);
10870 mem = gen_frame_mem (BLKmode, ptr);
10871 set_mem_alias_set (mem, get_varargs_alias_set ());
10873 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10874 mem, gr_saved);
10876 if (vr_saved > 0)
10878 /* We can't use move_block_from_reg, because it will use
10879 the wrong mode, storing D regs only. */
10880 machine_mode mode = TImode;
10881 int off, i, vr_start;
10883 /* Set OFF to the offset from virtual_incoming_args_rtx of
10884 the first vector register. The VR save area lies below
10885 the GR one, and is aligned to 16 bytes. */
10886 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10887 STACK_BOUNDARY / BITS_PER_UNIT);
10888 off -= vr_saved * UNITS_PER_VREG;
10890 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10891 for (i = 0; i < vr_saved; ++i)
10893 rtx ptr, mem;
10895 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10896 mem = gen_frame_mem (mode, ptr);
10897 set_mem_alias_set (mem, get_varargs_alias_set ());
10898 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10899 off += UNITS_PER_VREG;
10904 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10905 any complication of having crtl->args.pretend_args_size changed. */
10906 cfun->machine->frame.saved_varargs_size
10907 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10908 STACK_BOUNDARY / BITS_PER_UNIT)
10909 + vr_saved * UNITS_PER_VREG);
10912 static void
10913 aarch64_conditional_register_usage (void)
10915 int i;
10916 if (!TARGET_FLOAT)
10918 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10920 fixed_regs[i] = 1;
10921 call_used_regs[i] = 1;
10926 /* Walk down the type tree of TYPE counting consecutive base elements.
10927 If *MODEP is VOIDmode, then set it to the first valid floating point
10928 type. If a non-floating point type is found, or if a floating point
10929 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10930 otherwise return the count in the sub-tree. */
10931 static int
10932 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10934 machine_mode mode;
10935 HOST_WIDE_INT size;
10937 switch (TREE_CODE (type))
10939 case REAL_TYPE:
10940 mode = TYPE_MODE (type);
10941 if (mode != DFmode && mode != SFmode
10942 && mode != TFmode && mode != HFmode)
10943 return -1;
10945 if (*modep == VOIDmode)
10946 *modep = mode;
10948 if (*modep == mode)
10949 return 1;
10951 break;
10953 case COMPLEX_TYPE:
10954 mode = TYPE_MODE (TREE_TYPE (type));
10955 if (mode != DFmode && mode != SFmode
10956 && mode != TFmode && mode != HFmode)
10957 return -1;
10959 if (*modep == VOIDmode)
10960 *modep = mode;
10962 if (*modep == mode)
10963 return 2;
10965 break;
10967 case VECTOR_TYPE:
10968 /* Use V2SImode and V4SImode as representatives of all 64-bit
10969 and 128-bit vector types. */
10970 size = int_size_in_bytes (type);
10971 switch (size)
10973 case 8:
10974 mode = V2SImode;
10975 break;
10976 case 16:
10977 mode = V4SImode;
10978 break;
10979 default:
10980 return -1;
10983 if (*modep == VOIDmode)
10984 *modep = mode;
10986 /* Vector modes are considered to be opaque: two vectors are
10987 equivalent for the purposes of being homogeneous aggregates
10988 if they are the same size. */
10989 if (*modep == mode)
10990 return 1;
10992 break;
10994 case ARRAY_TYPE:
10996 int count;
10997 tree index = TYPE_DOMAIN (type);
10999 /* Can't handle incomplete types nor sizes that are not
11000 fixed. */
11001 if (!COMPLETE_TYPE_P (type)
11002 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11003 return -1;
11005 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11006 if (count == -1
11007 || !index
11008 || !TYPE_MAX_VALUE (index)
11009 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11010 || !TYPE_MIN_VALUE (index)
11011 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11012 || count < 0)
11013 return -1;
11015 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11016 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11018 /* There must be no padding. */
11019 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11020 return -1;
11022 return count;
11025 case RECORD_TYPE:
11027 int count = 0;
11028 int sub_count;
11029 tree field;
11031 /* Can't handle incomplete types nor sizes that are not
11032 fixed. */
11033 if (!COMPLETE_TYPE_P (type)
11034 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11035 return -1;
11037 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11039 if (TREE_CODE (field) != FIELD_DECL)
11040 continue;
11042 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11043 if (sub_count < 0)
11044 return -1;
11045 count += sub_count;
11048 /* There must be no padding. */
11049 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11050 return -1;
11052 return count;
11055 case UNION_TYPE:
11056 case QUAL_UNION_TYPE:
11058 /* These aren't very interesting except in a degenerate case. */
11059 int count = 0;
11060 int sub_count;
11061 tree field;
11063 /* Can't handle incomplete types nor sizes that are not
11064 fixed. */
11065 if (!COMPLETE_TYPE_P (type)
11066 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11067 return -1;
11069 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11071 if (TREE_CODE (field) != FIELD_DECL)
11072 continue;
11074 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11075 if (sub_count < 0)
11076 return -1;
11077 count = count > sub_count ? count : sub_count;
11080 /* There must be no padding. */
11081 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11082 return -1;
11084 return count;
11087 default:
11088 break;
11091 return -1;
11094 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11095 type as described in AAPCS64 \S 4.1.2.
11097 See the comment above aarch64_composite_type_p for the notes on MODE. */
11099 static bool
11100 aarch64_short_vector_p (const_tree type,
11101 machine_mode mode)
11103 HOST_WIDE_INT size = -1;
11105 if (type && TREE_CODE (type) == VECTOR_TYPE)
11106 size = int_size_in_bytes (type);
11107 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11108 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11109 size = GET_MODE_SIZE (mode);
11111 return (size == 8 || size == 16);
11114 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11115 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11116 array types. The C99 floating-point complex types are also considered
11117 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11118 types, which are GCC extensions and out of the scope of AAPCS64, are
11119 treated as composite types here as well.
11121 Note that MODE itself is not sufficient in determining whether a type
11122 is such a composite type or not. This is because
11123 stor-layout.c:compute_record_mode may have already changed the MODE
11124 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11125 structure with only one field may have its MODE set to the mode of the
11126 field. Also an integer mode whose size matches the size of the
11127 RECORD_TYPE type may be used to substitute the original mode
11128 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11129 solely relied on. */
11131 static bool
11132 aarch64_composite_type_p (const_tree type,
11133 machine_mode mode)
11135 if (aarch64_short_vector_p (type, mode))
11136 return false;
11138 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11139 return true;
11141 if (mode == BLKmode
11142 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11143 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11144 return true;
11146 return false;
11149 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11150 shall be passed or returned in simd/fp register(s) (providing these
11151 parameter passing registers are available).
11153 Upon successful return, *COUNT returns the number of needed registers,
11154 *BASE_MODE returns the mode of the individual register and when IS_HAF
11155 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11156 floating-point aggregate or a homogeneous short-vector aggregate. */
11158 static bool
11159 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11160 const_tree type,
11161 machine_mode *base_mode,
11162 int *count,
11163 bool *is_ha)
11165 machine_mode new_mode = VOIDmode;
11166 bool composite_p = aarch64_composite_type_p (type, mode);
11168 if (is_ha != NULL) *is_ha = false;
11170 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11171 || aarch64_short_vector_p (type, mode))
11173 *count = 1;
11174 new_mode = mode;
11176 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11178 if (is_ha != NULL) *is_ha = true;
11179 *count = 2;
11180 new_mode = GET_MODE_INNER (mode);
11182 else if (type && composite_p)
11184 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11186 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11188 if (is_ha != NULL) *is_ha = true;
11189 *count = ag_count;
11191 else
11192 return false;
11194 else
11195 return false;
11197 *base_mode = new_mode;
11198 return true;
11201 /* Implement TARGET_STRUCT_VALUE_RTX. */
11203 static rtx
11204 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11205 int incoming ATTRIBUTE_UNUSED)
11207 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11210 /* Implements target hook vector_mode_supported_p. */
11211 static bool
11212 aarch64_vector_mode_supported_p (machine_mode mode)
11214 if (TARGET_SIMD
11215 && (mode == V4SImode || mode == V8HImode
11216 || mode == V16QImode || mode == V2DImode
11217 || mode == V2SImode || mode == V4HImode
11218 || mode == V8QImode || mode == V2SFmode
11219 || mode == V4SFmode || mode == V2DFmode
11220 || mode == V4HFmode || mode == V8HFmode
11221 || mode == V1DFmode))
11222 return true;
11224 return false;
11227 /* Return appropriate SIMD container
11228 for MODE within a vector of WIDTH bits. */
11229 static machine_mode
11230 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11232 gcc_assert (width == 64 || width == 128);
11233 if (TARGET_SIMD)
11235 if (width == 128)
11236 switch (mode)
11238 case E_DFmode:
11239 return V2DFmode;
11240 case E_SFmode:
11241 return V4SFmode;
11242 case E_HFmode:
11243 return V8HFmode;
11244 case E_SImode:
11245 return V4SImode;
11246 case E_HImode:
11247 return V8HImode;
11248 case E_QImode:
11249 return V16QImode;
11250 case E_DImode:
11251 return V2DImode;
11252 default:
11253 break;
11255 else
11256 switch (mode)
11258 case E_SFmode:
11259 return V2SFmode;
11260 case E_HFmode:
11261 return V4HFmode;
11262 case E_SImode:
11263 return V2SImode;
11264 case E_HImode:
11265 return V4HImode;
11266 case E_QImode:
11267 return V8QImode;
11268 default:
11269 break;
11272 return word_mode;
11275 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11276 static machine_mode
11277 aarch64_preferred_simd_mode (scalar_mode mode)
11279 return aarch64_simd_container_mode (mode, 128);
11282 /* Return the bitmask of possible vector sizes for the vectorizer
11283 to iterate over. */
11284 static unsigned int
11285 aarch64_autovectorize_vector_sizes (void)
11287 return (16 | 8);
11290 /* Implement TARGET_MANGLE_TYPE. */
11292 static const char *
11293 aarch64_mangle_type (const_tree type)
11295 /* The AArch64 ABI documents say that "__va_list" has to be
11296 managled as if it is in the "std" namespace. */
11297 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11298 return "St9__va_list";
11300 /* Half-precision float. */
11301 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11302 return "Dh";
11304 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11305 builtin types. */
11306 if (TYPE_NAME (type) != NULL)
11307 return aarch64_mangle_builtin_type (type);
11309 /* Use the default mangling. */
11310 return NULL;
11313 /* Find the first rtx_insn before insn that will generate an assembly
11314 instruction. */
11316 static rtx_insn *
11317 aarch64_prev_real_insn (rtx_insn *insn)
11319 if (!insn)
11320 return NULL;
11324 insn = prev_real_insn (insn);
11326 while (insn && recog_memoized (insn) < 0);
11328 return insn;
11331 static bool
11332 is_madd_op (enum attr_type t1)
11334 unsigned int i;
11335 /* A number of these may be AArch32 only. */
11336 enum attr_type mlatypes[] = {
11337 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11338 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11339 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11342 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11344 if (t1 == mlatypes[i])
11345 return true;
11348 return false;
11351 /* Check if there is a register dependency between a load and the insn
11352 for which we hold recog_data. */
11354 static bool
11355 dep_between_memop_and_curr (rtx memop)
11357 rtx load_reg;
11358 int opno;
11360 gcc_assert (GET_CODE (memop) == SET);
11362 if (!REG_P (SET_DEST (memop)))
11363 return false;
11365 load_reg = SET_DEST (memop);
11366 for (opno = 1; opno < recog_data.n_operands; opno++)
11368 rtx operand = recog_data.operand[opno];
11369 if (REG_P (operand)
11370 && reg_overlap_mentioned_p (load_reg, operand))
11371 return true;
11374 return false;
11378 /* When working around the Cortex-A53 erratum 835769,
11379 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11380 instruction and has a preceding memory instruction such that a NOP
11381 should be inserted between them. */
11383 bool
11384 aarch64_madd_needs_nop (rtx_insn* insn)
11386 enum attr_type attr_type;
11387 rtx_insn *prev;
11388 rtx body;
11390 if (!TARGET_FIX_ERR_A53_835769)
11391 return false;
11393 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11394 return false;
11396 attr_type = get_attr_type (insn);
11397 if (!is_madd_op (attr_type))
11398 return false;
11400 prev = aarch64_prev_real_insn (insn);
11401 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11402 Restore recog state to INSN to avoid state corruption. */
11403 extract_constrain_insn_cached (insn);
11405 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11406 return false;
11408 body = single_set (prev);
11410 /* If the previous insn is a memory op and there is no dependency between
11411 it and the DImode madd, emit a NOP between them. If body is NULL then we
11412 have a complex memory operation, probably a load/store pair.
11413 Be conservative for now and emit a NOP. */
11414 if (GET_MODE (recog_data.operand[0]) == DImode
11415 && (!body || !dep_between_memop_and_curr (body)))
11416 return true;
11418 return false;
11423 /* Implement FINAL_PRESCAN_INSN. */
11425 void
11426 aarch64_final_prescan_insn (rtx_insn *insn)
11428 if (aarch64_madd_needs_nop (insn))
11429 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11433 /* Return the equivalent letter for size. */
11434 static char
11435 sizetochar (int size)
11437 switch (size)
11439 case 64: return 'd';
11440 case 32: return 's';
11441 case 16: return 'h';
11442 case 8 : return 'b';
11443 default: gcc_unreachable ();
11447 /* Return true iff x is a uniform vector of floating-point
11448 constants, and the constant can be represented in
11449 quarter-precision form. Note, as aarch64_float_const_representable
11450 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11451 static bool
11452 aarch64_vect_float_const_representable_p (rtx x)
11454 rtx elt;
11455 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11456 && const_vec_duplicate_p (x, &elt)
11457 && aarch64_float_const_representable_p (elt));
11460 /* Return true for valid and false for invalid. */
11461 bool
11462 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11463 struct simd_immediate_info *info)
11465 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11466 matches = 1; \
11467 for (i = 0; i < idx; i += (STRIDE)) \
11468 if (!(TEST)) \
11469 matches = 0; \
11470 if (matches) \
11472 immtype = (CLASS); \
11473 elsize = (ELSIZE); \
11474 eshift = (SHIFT); \
11475 emvn = (NEG); \
11476 break; \
11479 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11480 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11481 unsigned char bytes[16];
11482 int immtype = -1, matches;
11483 unsigned int invmask = inverse ? 0xff : 0;
11484 int eshift, emvn;
11486 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11488 if (! (aarch64_simd_imm_zero_p (op, mode)
11489 || aarch64_vect_float_const_representable_p (op)))
11490 return false;
11492 if (info)
11494 rtx elt = CONST_VECTOR_ELT (op, 0);
11495 scalar_float_mode elt_mode
11496 = as_a <scalar_float_mode> (GET_MODE (elt));
11498 info->value = elt;
11499 info->element_width = GET_MODE_BITSIZE (elt_mode);
11500 info->mvn = false;
11501 info->shift = 0;
11504 return true;
11507 /* Splat vector constant out into a byte vector. */
11508 for (i = 0; i < n_elts; i++)
11510 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11511 it must be laid out in the vector register in reverse order. */
11512 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11513 unsigned HOST_WIDE_INT elpart;
11515 gcc_assert (CONST_INT_P (el));
11516 elpart = INTVAL (el);
11518 for (unsigned int byte = 0; byte < innersize; byte++)
11520 bytes[idx++] = (elpart & 0xff) ^ invmask;
11521 elpart >>= BITS_PER_UNIT;
11526 /* Sanity check. */
11527 gcc_assert (idx == GET_MODE_SIZE (mode));
11531 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11532 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11534 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11535 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11537 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11538 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11540 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11541 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11543 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11545 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11547 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11548 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11550 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11551 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11553 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11554 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11556 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11557 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11559 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11561 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11563 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11564 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11566 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11567 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11569 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11570 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11572 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11573 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11575 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11577 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11578 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11580 while (0);
11582 if (immtype == -1)
11583 return false;
11585 if (info)
11587 info->element_width = elsize;
11588 info->mvn = emvn != 0;
11589 info->shift = eshift;
11591 unsigned HOST_WIDE_INT imm = 0;
11593 if (immtype >= 12 && immtype <= 15)
11594 info->msl = true;
11596 /* Un-invert bytes of recognized vector, if necessary. */
11597 if (invmask != 0)
11598 for (i = 0; i < idx; i++)
11599 bytes[i] ^= invmask;
11601 if (immtype == 17)
11603 /* FIXME: Broken on 32-bit H_W_I hosts. */
11604 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11606 for (i = 0; i < 8; i++)
11607 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11608 << (i * BITS_PER_UNIT);
11611 info->value = GEN_INT (imm);
11613 else
11615 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11616 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11618 /* Construct 'abcdefgh' because the assembler cannot handle
11619 generic constants. */
11620 if (info->mvn)
11621 imm = ~imm;
11622 imm = (imm >> info->shift) & 0xff;
11623 info->value = GEN_INT (imm);
11627 return true;
11628 #undef CHECK
11631 /* Check of immediate shift constants are within range. */
11632 bool
11633 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11635 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11636 if (left)
11637 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11638 else
11639 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11642 /* Return true if X is a uniform vector where all elements
11643 are either the floating-point constant 0.0 or the
11644 integer constant 0. */
11645 bool
11646 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11648 return x == CONST0_RTX (mode);
11652 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11653 operation of width WIDTH at bit position POS. */
11656 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11658 gcc_assert (CONST_INT_P (width));
11659 gcc_assert (CONST_INT_P (pos));
11661 unsigned HOST_WIDE_INT mask
11662 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11663 return GEN_INT (mask << UINTVAL (pos));
11666 bool
11667 aarch64_mov_operand_p (rtx x, machine_mode mode)
11669 if (GET_CODE (x) == HIGH
11670 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11671 return true;
11673 if (CONST_INT_P (x))
11674 return true;
11676 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11677 return true;
11679 return aarch64_classify_symbolic_expression (x)
11680 == SYMBOL_TINY_ABSOLUTE;
11683 /* Return a const_int vector of VAL. */
11685 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11687 int nunits = GET_MODE_NUNITS (mode);
11688 rtvec v = rtvec_alloc (nunits);
11689 int i;
11691 rtx cache = GEN_INT (val);
11693 for (i=0; i < nunits; i++)
11694 RTVEC_ELT (v, i) = cache;
11696 return gen_rtx_CONST_VECTOR (mode, v);
11699 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11701 bool
11702 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11704 machine_mode vmode;
11706 gcc_assert (!VECTOR_MODE_P (mode));
11707 vmode = aarch64_preferred_simd_mode (as_a <scalar_mode> (mode));
11708 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11709 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11712 /* Construct and return a PARALLEL RTX vector with elements numbering the
11713 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11714 the vector - from the perspective of the architecture. This does not
11715 line up with GCC's perspective on lane numbers, so we end up with
11716 different masks depending on our target endian-ness. The diagram
11717 below may help. We must draw the distinction when building masks
11718 which select one half of the vector. An instruction selecting
11719 architectural low-lanes for a big-endian target, must be described using
11720 a mask selecting GCC high-lanes.
11722 Big-Endian Little-Endian
11724 GCC 0 1 2 3 3 2 1 0
11725 | x | x | x | x | | x | x | x | x |
11726 Architecture 3 2 1 0 3 2 1 0
11728 Low Mask: { 2, 3 } { 0, 1 }
11729 High Mask: { 0, 1 } { 2, 3 }
11733 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11735 int nunits = GET_MODE_NUNITS (mode);
11736 rtvec v = rtvec_alloc (nunits / 2);
11737 int high_base = nunits / 2;
11738 int low_base = 0;
11739 int base;
11740 rtx t1;
11741 int i;
11743 if (BYTES_BIG_ENDIAN)
11744 base = high ? low_base : high_base;
11745 else
11746 base = high ? high_base : low_base;
11748 for (i = 0; i < nunits / 2; i++)
11749 RTVEC_ELT (v, i) = GEN_INT (base + i);
11751 t1 = gen_rtx_PARALLEL (mode, v);
11752 return t1;
11755 /* Check OP for validity as a PARALLEL RTX vector with elements
11756 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11757 from the perspective of the architecture. See the diagram above
11758 aarch64_simd_vect_par_cnst_half for more details. */
11760 bool
11761 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11762 bool high)
11764 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11765 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11766 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11767 int i = 0;
11769 if (!VECTOR_MODE_P (mode))
11770 return false;
11772 if (count_op != count_ideal)
11773 return false;
11775 for (i = 0; i < count_ideal; i++)
11777 rtx elt_op = XVECEXP (op, 0, i);
11778 rtx elt_ideal = XVECEXP (ideal, 0, i);
11780 if (!CONST_INT_P (elt_op)
11781 || INTVAL (elt_ideal) != INTVAL (elt_op))
11782 return false;
11784 return true;
11787 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11788 HIGH (exclusive). */
11789 void
11790 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11791 const_tree exp)
11793 HOST_WIDE_INT lane;
11794 gcc_assert (CONST_INT_P (operand));
11795 lane = INTVAL (operand);
11797 if (lane < low || lane >= high)
11799 if (exp)
11800 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11801 else
11802 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11806 /* Return TRUE if OP is a valid vector addressing mode. */
11807 bool
11808 aarch64_simd_mem_operand_p (rtx op)
11810 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11811 || REG_P (XEXP (op, 0)));
11814 /* Emit a register copy from operand to operand, taking care not to
11815 early-clobber source registers in the process.
11817 COUNT is the number of components into which the copy needs to be
11818 decomposed. */
11819 void
11820 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11821 unsigned int count)
11823 unsigned int i;
11824 int rdest = REGNO (operands[0]);
11825 int rsrc = REGNO (operands[1]);
11827 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11828 || rdest < rsrc)
11829 for (i = 0; i < count; i++)
11830 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11831 gen_rtx_REG (mode, rsrc + i));
11832 else
11833 for (i = 0; i < count; i++)
11834 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11835 gen_rtx_REG (mode, rsrc + count - i - 1));
11838 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11839 one of VSTRUCT modes: OI, CI, or XI. */
11841 aarch64_simd_attr_length_rglist (machine_mode mode)
11843 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11846 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11847 alignment of a vector to 128 bits. */
11848 static HOST_WIDE_INT
11849 aarch64_simd_vector_alignment (const_tree type)
11851 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11852 return MIN (align, 128);
11855 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11856 static bool
11857 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11859 if (is_packed)
11860 return false;
11862 /* We guarantee alignment for vectors up to 128-bits. */
11863 if (tree_int_cst_compare (TYPE_SIZE (type),
11864 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11865 return false;
11867 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11868 return true;
11871 /* Return true if the vector misalignment factor is supported by the
11872 target. */
11873 static bool
11874 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11875 const_tree type, int misalignment,
11876 bool is_packed)
11878 if (TARGET_SIMD && STRICT_ALIGNMENT)
11880 /* Return if movmisalign pattern is not supported for this mode. */
11881 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11882 return false;
11884 if (misalignment == -1)
11886 /* Misalignment factor is unknown at compile time but we know
11887 it's word aligned. */
11888 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11890 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11892 if (element_size != 64)
11893 return true;
11895 return false;
11898 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11899 is_packed);
11902 /* If VALS is a vector constant that can be loaded into a register
11903 using DUP, generate instructions to do so and return an RTX to
11904 assign to the register. Otherwise return NULL_RTX. */
11905 static rtx
11906 aarch64_simd_dup_constant (rtx vals)
11908 machine_mode mode = GET_MODE (vals);
11909 machine_mode inner_mode = GET_MODE_INNER (mode);
11910 rtx x;
11912 if (!const_vec_duplicate_p (vals, &x))
11913 return NULL_RTX;
11915 /* We can load this constant by using DUP and a constant in a
11916 single ARM register. This will be cheaper than a vector
11917 load. */
11918 x = copy_to_mode_reg (inner_mode, x);
11919 return gen_rtx_VEC_DUPLICATE (mode, x);
11923 /* Generate code to load VALS, which is a PARALLEL containing only
11924 constants (for vec_init) or CONST_VECTOR, efficiently into a
11925 register. Returns an RTX to copy into the register, or NULL_RTX
11926 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11927 static rtx
11928 aarch64_simd_make_constant (rtx vals)
11930 machine_mode mode = GET_MODE (vals);
11931 rtx const_dup;
11932 rtx const_vec = NULL_RTX;
11933 int n_elts = GET_MODE_NUNITS (mode);
11934 int n_const = 0;
11935 int i;
11937 if (GET_CODE (vals) == CONST_VECTOR)
11938 const_vec = vals;
11939 else if (GET_CODE (vals) == PARALLEL)
11941 /* A CONST_VECTOR must contain only CONST_INTs and
11942 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11943 Only store valid constants in a CONST_VECTOR. */
11944 for (i = 0; i < n_elts; ++i)
11946 rtx x = XVECEXP (vals, 0, i);
11947 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11948 n_const++;
11950 if (n_const == n_elts)
11951 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11953 else
11954 gcc_unreachable ();
11956 if (const_vec != NULL_RTX
11957 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11958 /* Load using MOVI/MVNI. */
11959 return const_vec;
11960 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11961 /* Loaded using DUP. */
11962 return const_dup;
11963 else if (const_vec != NULL_RTX)
11964 /* Load from constant pool. We can not take advantage of single-cycle
11965 LD1 because we need a PC-relative addressing mode. */
11966 return const_vec;
11967 else
11968 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11969 We can not construct an initializer. */
11970 return NULL_RTX;
11973 /* Expand a vector initialisation sequence, such that TARGET is
11974 initialised to contain VALS. */
11976 void
11977 aarch64_expand_vector_init (rtx target, rtx vals)
11979 machine_mode mode = GET_MODE (target);
11980 machine_mode inner_mode = GET_MODE_INNER (mode);
11981 /* The number of vector elements. */
11982 int n_elts = GET_MODE_NUNITS (mode);
11983 /* The number of vector elements which are not constant. */
11984 int n_var = 0;
11985 rtx any_const = NULL_RTX;
11986 /* The first element of vals. */
11987 rtx v0 = XVECEXP (vals, 0, 0);
11988 bool all_same = true;
11990 /* Count the number of variable elements to initialise. */
11991 for (int i = 0; i < n_elts; ++i)
11993 rtx x = XVECEXP (vals, 0, i);
11994 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11995 ++n_var;
11996 else
11997 any_const = x;
11999 all_same &= rtx_equal_p (x, v0);
12002 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12003 how best to handle this. */
12004 if (n_var == 0)
12006 rtx constant = aarch64_simd_make_constant (vals);
12007 if (constant != NULL_RTX)
12009 emit_move_insn (target, constant);
12010 return;
12014 /* Splat a single non-constant element if we can. */
12015 if (all_same)
12017 rtx x = copy_to_mode_reg (inner_mode, v0);
12018 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12019 return;
12022 enum insn_code icode = optab_handler (vec_set_optab, mode);
12023 gcc_assert (icode != CODE_FOR_nothing);
12025 /* If there are only variable elements, try to optimize
12026 the insertion using dup for the most common element
12027 followed by insertions. */
12029 /* The algorithm will fill matches[*][0] with the earliest matching element,
12030 and matches[X][1] with the count of duplicate elements (if X is the
12031 earliest element which has duplicates). */
12033 if (n_var == n_elts && n_elts <= 16)
12035 int matches[16][2] = {0};
12036 for (int i = 0; i < n_elts; i++)
12038 for (int j = 0; j <= i; j++)
12040 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12042 matches[i][0] = j;
12043 matches[j][1]++;
12044 break;
12048 int maxelement = 0;
12049 int maxv = 0;
12050 for (int i = 0; i < n_elts; i++)
12051 if (matches[i][1] > maxv)
12053 maxelement = i;
12054 maxv = matches[i][1];
12057 /* Create a duplicate of the most common element. */
12058 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12059 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12061 /* Insert the rest. */
12062 for (int i = 0; i < n_elts; i++)
12064 rtx x = XVECEXP (vals, 0, i);
12065 if (matches[i][0] == maxelement)
12066 continue;
12067 x = copy_to_mode_reg (inner_mode, x);
12068 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12070 return;
12073 /* Initialise a vector which is part-variable. We want to first try
12074 to build those lanes which are constant in the most efficient way we
12075 can. */
12076 if (n_var != n_elts)
12078 rtx copy = copy_rtx (vals);
12080 /* Load constant part of vector. We really don't care what goes into the
12081 parts we will overwrite, but we're more likely to be able to load the
12082 constant efficiently if it has fewer, larger, repeating parts
12083 (see aarch64_simd_valid_immediate). */
12084 for (int i = 0; i < n_elts; i++)
12086 rtx x = XVECEXP (vals, 0, i);
12087 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12088 continue;
12089 rtx subst = any_const;
12090 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12092 /* Look in the copied vector, as more elements are const. */
12093 rtx test = XVECEXP (copy, 0, i ^ bit);
12094 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12096 subst = test;
12097 break;
12100 XVECEXP (copy, 0, i) = subst;
12102 aarch64_expand_vector_init (target, copy);
12105 /* Insert the variable lanes directly. */
12106 for (int i = 0; i < n_elts; i++)
12108 rtx x = XVECEXP (vals, 0, i);
12109 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12110 continue;
12111 x = copy_to_mode_reg (inner_mode, x);
12112 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12116 static unsigned HOST_WIDE_INT
12117 aarch64_shift_truncation_mask (machine_mode mode)
12119 return
12120 (!SHIFT_COUNT_TRUNCATED
12121 || aarch64_vector_mode_supported_p (mode)
12122 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12125 /* Select a format to encode pointers in exception handling data. */
12127 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12129 int type;
12130 switch (aarch64_cmodel)
12132 case AARCH64_CMODEL_TINY:
12133 case AARCH64_CMODEL_TINY_PIC:
12134 case AARCH64_CMODEL_SMALL:
12135 case AARCH64_CMODEL_SMALL_PIC:
12136 case AARCH64_CMODEL_SMALL_SPIC:
12137 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12138 for everything. */
12139 type = DW_EH_PE_sdata4;
12140 break;
12141 default:
12142 /* No assumptions here. 8-byte relocs required. */
12143 type = DW_EH_PE_sdata8;
12144 break;
12146 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12149 /* The last .arch and .tune assembly strings that we printed. */
12150 static std::string aarch64_last_printed_arch_string;
12151 static std::string aarch64_last_printed_tune_string;
12153 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12154 by the function fndecl. */
12156 void
12157 aarch64_declare_function_name (FILE *stream, const char* name,
12158 tree fndecl)
12160 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12162 struct cl_target_option *targ_options;
12163 if (target_parts)
12164 targ_options = TREE_TARGET_OPTION (target_parts);
12165 else
12166 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12167 gcc_assert (targ_options);
12169 const struct processor *this_arch
12170 = aarch64_get_arch (targ_options->x_explicit_arch);
12172 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12173 std::string extension
12174 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12175 this_arch->flags);
12176 /* Only update the assembler .arch string if it is distinct from the last
12177 such string we printed. */
12178 std::string to_print = this_arch->name + extension;
12179 if (to_print != aarch64_last_printed_arch_string)
12181 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12182 aarch64_last_printed_arch_string = to_print;
12185 /* Print the cpu name we're tuning for in the comments, might be
12186 useful to readers of the generated asm. Do it only when it changes
12187 from function to function and verbose assembly is requested. */
12188 const struct processor *this_tune
12189 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12191 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12193 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12194 this_tune->name);
12195 aarch64_last_printed_tune_string = this_tune->name;
12198 /* Don't forget the type directive for ELF. */
12199 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12200 ASM_OUTPUT_LABEL (stream, name);
12203 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12205 static void
12206 aarch64_start_file (void)
12208 struct cl_target_option *default_options
12209 = TREE_TARGET_OPTION (target_option_default_node);
12211 const struct processor *default_arch
12212 = aarch64_get_arch (default_options->x_explicit_arch);
12213 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12214 std::string extension
12215 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12216 default_arch->flags);
12218 aarch64_last_printed_arch_string = default_arch->name + extension;
12219 aarch64_last_printed_tune_string = "";
12220 asm_fprintf (asm_out_file, "\t.arch %s\n",
12221 aarch64_last_printed_arch_string.c_str ());
12223 default_file_start ();
12226 /* Emit load exclusive. */
12228 static void
12229 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12230 rtx mem, rtx model_rtx)
12232 rtx (*gen) (rtx, rtx, rtx);
12234 switch (mode)
12236 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12237 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12238 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12239 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12240 default:
12241 gcc_unreachable ();
12244 emit_insn (gen (rval, mem, model_rtx));
12247 /* Emit store exclusive. */
12249 static void
12250 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12251 rtx rval, rtx mem, rtx model_rtx)
12253 rtx (*gen) (rtx, rtx, rtx, rtx);
12255 switch (mode)
12257 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12258 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12259 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12260 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12261 default:
12262 gcc_unreachable ();
12265 emit_insn (gen (bval, rval, mem, model_rtx));
12268 /* Mark the previous jump instruction as unlikely. */
12270 static void
12271 aarch64_emit_unlikely_jump (rtx insn)
12273 rtx_insn *jump = emit_jump_insn (insn);
12274 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12277 /* Expand a compare and swap pattern. */
12279 void
12280 aarch64_expand_compare_and_swap (rtx operands[])
12282 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12283 machine_mode mode, cmp_mode;
12284 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12285 int idx;
12286 gen_cas_fn gen;
12287 const gen_cas_fn split_cas[] =
12289 gen_aarch64_compare_and_swapqi,
12290 gen_aarch64_compare_and_swaphi,
12291 gen_aarch64_compare_and_swapsi,
12292 gen_aarch64_compare_and_swapdi
12294 const gen_cas_fn atomic_cas[] =
12296 gen_aarch64_compare_and_swapqi_lse,
12297 gen_aarch64_compare_and_swaphi_lse,
12298 gen_aarch64_compare_and_swapsi_lse,
12299 gen_aarch64_compare_and_swapdi_lse
12302 bval = operands[0];
12303 rval = operands[1];
12304 mem = operands[2];
12305 oldval = operands[3];
12306 newval = operands[4];
12307 is_weak = operands[5];
12308 mod_s = operands[6];
12309 mod_f = operands[7];
12310 mode = GET_MODE (mem);
12311 cmp_mode = mode;
12313 /* Normally the succ memory model must be stronger than fail, but in the
12314 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12315 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12317 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12318 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12319 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12321 switch (mode)
12323 case E_QImode:
12324 case E_HImode:
12325 /* For short modes, we're going to perform the comparison in SImode,
12326 so do the zero-extension now. */
12327 cmp_mode = SImode;
12328 rval = gen_reg_rtx (SImode);
12329 oldval = convert_modes (SImode, mode, oldval, true);
12330 /* Fall through. */
12332 case E_SImode:
12333 case E_DImode:
12334 /* Force the value into a register if needed. */
12335 if (!aarch64_plus_operand (oldval, mode))
12336 oldval = force_reg (cmp_mode, oldval);
12337 break;
12339 default:
12340 gcc_unreachable ();
12343 switch (mode)
12345 case E_QImode: idx = 0; break;
12346 case E_HImode: idx = 1; break;
12347 case E_SImode: idx = 2; break;
12348 case E_DImode: idx = 3; break;
12349 default:
12350 gcc_unreachable ();
12352 if (TARGET_LSE)
12353 gen = atomic_cas[idx];
12354 else
12355 gen = split_cas[idx];
12357 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12359 if (mode == QImode || mode == HImode)
12360 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12362 x = gen_rtx_REG (CCmode, CC_REGNUM);
12363 x = gen_rtx_EQ (SImode, x, const0_rtx);
12364 emit_insn (gen_rtx_SET (bval, x));
12367 /* Test whether the target supports using a atomic load-operate instruction.
12368 CODE is the operation and AFTER is TRUE if the data in memory after the
12369 operation should be returned and FALSE if the data before the operation
12370 should be returned. Returns FALSE if the operation isn't supported by the
12371 architecture. */
12373 bool
12374 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12376 if (!TARGET_LSE)
12377 return false;
12379 switch (code)
12381 case SET:
12382 case AND:
12383 case IOR:
12384 case XOR:
12385 case MINUS:
12386 case PLUS:
12387 return true;
12388 default:
12389 return false;
12393 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12394 sequence implementing an atomic operation. */
12396 static void
12397 aarch64_emit_post_barrier (enum memmodel model)
12399 const enum memmodel base_model = memmodel_base (model);
12401 if (is_mm_sync (model)
12402 && (base_model == MEMMODEL_ACQUIRE
12403 || base_model == MEMMODEL_ACQ_REL
12404 || base_model == MEMMODEL_SEQ_CST))
12406 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12410 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12411 for the data in memory. EXPECTED is the value expected to be in memory.
12412 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12413 is the memory ordering to use. */
12415 void
12416 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12417 rtx expected, rtx desired,
12418 rtx model)
12420 rtx (*gen) (rtx, rtx, rtx, rtx);
12421 machine_mode mode;
12423 mode = GET_MODE (mem);
12425 switch (mode)
12427 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12428 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12429 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12430 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12431 default:
12432 gcc_unreachable ();
12435 /* Move the expected value into the CAS destination register. */
12436 emit_insn (gen_rtx_SET (rval, expected));
12438 /* Emit the CAS. */
12439 emit_insn (gen (rval, mem, desired, model));
12441 /* Compare the expected value with the value loaded by the CAS, to establish
12442 whether the swap was made. */
12443 aarch64_gen_compare_reg (EQ, rval, expected);
12446 /* Split a compare and swap pattern. */
12448 void
12449 aarch64_split_compare_and_swap (rtx operands[])
12451 rtx rval, mem, oldval, newval, scratch;
12452 machine_mode mode;
12453 bool is_weak;
12454 rtx_code_label *label1, *label2;
12455 rtx x, cond;
12456 enum memmodel model;
12457 rtx model_rtx;
12459 rval = operands[0];
12460 mem = operands[1];
12461 oldval = operands[2];
12462 newval = operands[3];
12463 is_weak = (operands[4] != const0_rtx);
12464 model_rtx = operands[5];
12465 scratch = operands[7];
12466 mode = GET_MODE (mem);
12467 model = memmodel_from_int (INTVAL (model_rtx));
12469 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12470 loop:
12471 .label1:
12472 LD[A]XR rval, [mem]
12473 CBNZ rval, .label2
12474 ST[L]XR scratch, newval, [mem]
12475 CBNZ scratch, .label1
12476 .label2:
12477 CMP rval, 0. */
12478 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12480 label1 = NULL;
12481 if (!is_weak)
12483 label1 = gen_label_rtx ();
12484 emit_label (label1);
12486 label2 = gen_label_rtx ();
12488 /* The initial load can be relaxed for a __sync operation since a final
12489 barrier will be emitted to stop code hoisting. */
12490 if (is_mm_sync (model))
12491 aarch64_emit_load_exclusive (mode, rval, mem,
12492 GEN_INT (MEMMODEL_RELAXED));
12493 else
12494 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12496 if (strong_zero_p)
12498 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12499 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12500 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12501 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12503 else
12505 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12506 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12507 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12508 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12509 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12512 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12514 if (!is_weak)
12516 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12517 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12518 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12519 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12521 else
12523 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12524 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12525 emit_insn (gen_rtx_SET (cond, x));
12528 emit_label (label2);
12529 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12530 to set the condition flags. If this is not used it will be removed by
12531 later passes. */
12532 if (strong_zero_p)
12534 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12535 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12536 emit_insn (gen_rtx_SET (cond, x));
12538 /* Emit any final barrier needed for a __sync operation. */
12539 if (is_mm_sync (model))
12540 aarch64_emit_post_barrier (model);
12543 /* Emit a BIC instruction. */
12545 static void
12546 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12548 rtx shift_rtx = GEN_INT (shift);
12549 rtx (*gen) (rtx, rtx, rtx, rtx);
12551 switch (mode)
12553 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12554 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12555 default:
12556 gcc_unreachable ();
12559 emit_insn (gen (dst, s2, shift_rtx, s1));
12562 /* Emit an atomic swap. */
12564 static void
12565 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12566 rtx mem, rtx model)
12568 rtx (*gen) (rtx, rtx, rtx, rtx);
12570 switch (mode)
12572 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12573 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12574 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12575 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12576 default:
12577 gcc_unreachable ();
12580 emit_insn (gen (dst, mem, value, model));
12583 /* Operations supported by aarch64_emit_atomic_load_op. */
12585 enum aarch64_atomic_load_op_code
12587 AARCH64_LDOP_PLUS, /* A + B */
12588 AARCH64_LDOP_XOR, /* A ^ B */
12589 AARCH64_LDOP_OR, /* A | B */
12590 AARCH64_LDOP_BIC /* A & ~B */
12593 /* Emit an atomic load-operate. */
12595 static void
12596 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12597 machine_mode mode, rtx dst, rtx src,
12598 rtx mem, rtx model)
12600 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12601 const aarch64_atomic_load_op_fn plus[] =
12603 gen_aarch64_atomic_loadaddqi,
12604 gen_aarch64_atomic_loadaddhi,
12605 gen_aarch64_atomic_loadaddsi,
12606 gen_aarch64_atomic_loadadddi
12608 const aarch64_atomic_load_op_fn eor[] =
12610 gen_aarch64_atomic_loadeorqi,
12611 gen_aarch64_atomic_loadeorhi,
12612 gen_aarch64_atomic_loadeorsi,
12613 gen_aarch64_atomic_loadeordi
12615 const aarch64_atomic_load_op_fn ior[] =
12617 gen_aarch64_atomic_loadsetqi,
12618 gen_aarch64_atomic_loadsethi,
12619 gen_aarch64_atomic_loadsetsi,
12620 gen_aarch64_atomic_loadsetdi
12622 const aarch64_atomic_load_op_fn bic[] =
12624 gen_aarch64_atomic_loadclrqi,
12625 gen_aarch64_atomic_loadclrhi,
12626 gen_aarch64_atomic_loadclrsi,
12627 gen_aarch64_atomic_loadclrdi
12629 aarch64_atomic_load_op_fn gen;
12630 int idx = 0;
12632 switch (mode)
12634 case E_QImode: idx = 0; break;
12635 case E_HImode: idx = 1; break;
12636 case E_SImode: idx = 2; break;
12637 case E_DImode: idx = 3; break;
12638 default:
12639 gcc_unreachable ();
12642 switch (code)
12644 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12645 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12646 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12647 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12648 default:
12649 gcc_unreachable ();
12652 emit_insn (gen (dst, mem, src, model));
12655 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12656 location to store the data read from memory. OUT_RESULT is the location to
12657 store the result of the operation. MEM is the memory location to read and
12658 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12659 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12660 be NULL. */
12662 void
12663 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12664 rtx mem, rtx value, rtx model_rtx)
12666 machine_mode mode = GET_MODE (mem);
12667 machine_mode wmode = (mode == DImode ? DImode : SImode);
12668 const bool short_mode = (mode < SImode);
12669 aarch64_atomic_load_op_code ldop_code;
12670 rtx src;
12671 rtx x;
12673 if (out_data)
12674 out_data = gen_lowpart (mode, out_data);
12676 if (out_result)
12677 out_result = gen_lowpart (mode, out_result);
12679 /* Make sure the value is in a register, putting it into a destination
12680 register if it needs to be manipulated. */
12681 if (!register_operand (value, mode)
12682 || code == AND || code == MINUS)
12684 src = out_result ? out_result : out_data;
12685 emit_move_insn (src, gen_lowpart (mode, value));
12687 else
12688 src = value;
12689 gcc_assert (register_operand (src, mode));
12691 /* Preprocess the data for the operation as necessary. If the operation is
12692 a SET then emit a swap instruction and finish. */
12693 switch (code)
12695 case SET:
12696 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12697 return;
12699 case MINUS:
12700 /* Negate the value and treat it as a PLUS. */
12702 rtx neg_src;
12704 /* Resize the value if necessary. */
12705 if (short_mode)
12706 src = gen_lowpart (wmode, src);
12708 neg_src = gen_rtx_NEG (wmode, src);
12709 emit_insn (gen_rtx_SET (src, neg_src));
12711 if (short_mode)
12712 src = gen_lowpart (mode, src);
12714 /* Fall-through. */
12715 case PLUS:
12716 ldop_code = AARCH64_LDOP_PLUS;
12717 break;
12719 case IOR:
12720 ldop_code = AARCH64_LDOP_OR;
12721 break;
12723 case XOR:
12724 ldop_code = AARCH64_LDOP_XOR;
12725 break;
12727 case AND:
12729 rtx not_src;
12731 /* Resize the value if necessary. */
12732 if (short_mode)
12733 src = gen_lowpart (wmode, src);
12735 not_src = gen_rtx_NOT (wmode, src);
12736 emit_insn (gen_rtx_SET (src, not_src));
12738 if (short_mode)
12739 src = gen_lowpart (mode, src);
12741 ldop_code = AARCH64_LDOP_BIC;
12742 break;
12744 default:
12745 /* The operation can't be done with atomic instructions. */
12746 gcc_unreachable ();
12749 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12751 /* If necessary, calculate the data in memory after the update by redoing the
12752 operation from values in registers. */
12753 if (!out_result)
12754 return;
12756 if (short_mode)
12758 src = gen_lowpart (wmode, src);
12759 out_data = gen_lowpart (wmode, out_data);
12760 out_result = gen_lowpart (wmode, out_result);
12763 x = NULL_RTX;
12765 switch (code)
12767 case MINUS:
12768 case PLUS:
12769 x = gen_rtx_PLUS (wmode, out_data, src);
12770 break;
12771 case IOR:
12772 x = gen_rtx_IOR (wmode, out_data, src);
12773 break;
12774 case XOR:
12775 x = gen_rtx_XOR (wmode, out_data, src);
12776 break;
12777 case AND:
12778 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12779 return;
12780 default:
12781 gcc_unreachable ();
12784 emit_set_insn (out_result, x);
12786 return;
12789 /* Split an atomic operation. */
12791 void
12792 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12793 rtx value, rtx model_rtx, rtx cond)
12795 machine_mode mode = GET_MODE (mem);
12796 machine_mode wmode = (mode == DImode ? DImode : SImode);
12797 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12798 const bool is_sync = is_mm_sync (model);
12799 rtx_code_label *label;
12800 rtx x;
12802 /* Split the atomic operation into a sequence. */
12803 label = gen_label_rtx ();
12804 emit_label (label);
12806 if (new_out)
12807 new_out = gen_lowpart (wmode, new_out);
12808 if (old_out)
12809 old_out = gen_lowpart (wmode, old_out);
12810 else
12811 old_out = new_out;
12812 value = simplify_gen_subreg (wmode, value, mode, 0);
12814 /* The initial load can be relaxed for a __sync operation since a final
12815 barrier will be emitted to stop code hoisting. */
12816 if (is_sync)
12817 aarch64_emit_load_exclusive (mode, old_out, mem,
12818 GEN_INT (MEMMODEL_RELAXED));
12819 else
12820 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12822 switch (code)
12824 case SET:
12825 new_out = value;
12826 break;
12828 case NOT:
12829 x = gen_rtx_AND (wmode, old_out, value);
12830 emit_insn (gen_rtx_SET (new_out, x));
12831 x = gen_rtx_NOT (wmode, new_out);
12832 emit_insn (gen_rtx_SET (new_out, x));
12833 break;
12835 case MINUS:
12836 if (CONST_INT_P (value))
12838 value = GEN_INT (-INTVAL (value));
12839 code = PLUS;
12841 /* Fall through. */
12843 default:
12844 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12845 emit_insn (gen_rtx_SET (new_out, x));
12846 break;
12849 aarch64_emit_store_exclusive (mode, cond, mem,
12850 gen_lowpart (mode, new_out), model_rtx);
12852 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12853 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12854 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12855 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12857 /* Emit any final barrier needed for a __sync operation. */
12858 if (is_sync)
12859 aarch64_emit_post_barrier (model);
12862 static void
12863 aarch64_init_libfuncs (void)
12865 /* Half-precision float operations. The compiler handles all operations
12866 with NULL libfuncs by converting to SFmode. */
12868 /* Conversions. */
12869 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12870 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12872 /* Arithmetic. */
12873 set_optab_libfunc (add_optab, HFmode, NULL);
12874 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12875 set_optab_libfunc (smul_optab, HFmode, NULL);
12876 set_optab_libfunc (neg_optab, HFmode, NULL);
12877 set_optab_libfunc (sub_optab, HFmode, NULL);
12879 /* Comparisons. */
12880 set_optab_libfunc (eq_optab, HFmode, NULL);
12881 set_optab_libfunc (ne_optab, HFmode, NULL);
12882 set_optab_libfunc (lt_optab, HFmode, NULL);
12883 set_optab_libfunc (le_optab, HFmode, NULL);
12884 set_optab_libfunc (ge_optab, HFmode, NULL);
12885 set_optab_libfunc (gt_optab, HFmode, NULL);
12886 set_optab_libfunc (unord_optab, HFmode, NULL);
12889 /* Target hook for c_mode_for_suffix. */
12890 static machine_mode
12891 aarch64_c_mode_for_suffix (char suffix)
12893 if (suffix == 'q')
12894 return TFmode;
12896 return VOIDmode;
12899 /* We can only represent floating point constants which will fit in
12900 "quarter-precision" values. These values are characterised by
12901 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12904 (-1)^s * (n/16) * 2^r
12906 Where:
12907 's' is the sign bit.
12908 'n' is an integer in the range 16 <= n <= 31.
12909 'r' is an integer in the range -3 <= r <= 4. */
12911 /* Return true iff X can be represented by a quarter-precision
12912 floating point immediate operand X. Note, we cannot represent 0.0. */
12913 bool
12914 aarch64_float_const_representable_p (rtx x)
12916 /* This represents our current view of how many bits
12917 make up the mantissa. */
12918 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12919 int exponent;
12920 unsigned HOST_WIDE_INT mantissa, mask;
12921 REAL_VALUE_TYPE r, m;
12922 bool fail;
12924 if (!CONST_DOUBLE_P (x))
12925 return false;
12927 /* We don't support HFmode constants yet. */
12928 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12929 return false;
12931 r = *CONST_DOUBLE_REAL_VALUE (x);
12933 /* We cannot represent infinities, NaNs or +/-zero. We won't
12934 know if we have +zero until we analyse the mantissa, but we
12935 can reject the other invalid values. */
12936 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12937 || REAL_VALUE_MINUS_ZERO (r))
12938 return false;
12940 /* Extract exponent. */
12941 r = real_value_abs (&r);
12942 exponent = REAL_EXP (&r);
12944 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12945 highest (sign) bit, with a fixed binary point at bit point_pos.
12946 m1 holds the low part of the mantissa, m2 the high part.
12947 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12948 bits for the mantissa, this can fail (low bits will be lost). */
12949 real_ldexp (&m, &r, point_pos - exponent);
12950 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12952 /* If the low part of the mantissa has bits set we cannot represent
12953 the value. */
12954 if (w.ulow () != 0)
12955 return false;
12956 /* We have rejected the lower HOST_WIDE_INT, so update our
12957 understanding of how many bits lie in the mantissa and
12958 look only at the high HOST_WIDE_INT. */
12959 mantissa = w.elt (1);
12960 point_pos -= HOST_BITS_PER_WIDE_INT;
12962 /* We can only represent values with a mantissa of the form 1.xxxx. */
12963 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12964 if ((mantissa & mask) != 0)
12965 return false;
12967 /* Having filtered unrepresentable values, we may now remove all
12968 but the highest 5 bits. */
12969 mantissa >>= point_pos - 5;
12971 /* We cannot represent the value 0.0, so reject it. This is handled
12972 elsewhere. */
12973 if (mantissa == 0)
12974 return false;
12976 /* Then, as bit 4 is always set, we can mask it off, leaving
12977 the mantissa in the range [0, 15]. */
12978 mantissa &= ~(1 << 4);
12979 gcc_assert (mantissa <= 15);
12981 /* GCC internally does not use IEEE754-like encoding (where normalized
12982 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12983 Our mantissa values are shifted 4 places to the left relative to
12984 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12985 by 5 places to correct for GCC's representation. */
12986 exponent = 5 - exponent;
12988 return (exponent >= 0 && exponent <= 7);
12991 char*
12992 aarch64_output_simd_mov_immediate (rtx const_vector,
12993 machine_mode mode,
12994 unsigned width)
12996 bool is_valid;
12997 static char templ[40];
12998 const char *mnemonic;
12999 const char *shift_op;
13000 unsigned int lane_count = 0;
13001 char element_char;
13003 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13005 /* This will return true to show const_vector is legal for use as either
13006 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
13007 also update INFO to show how the immediate should be generated. */
13008 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13009 gcc_assert (is_valid);
13011 element_char = sizetochar (info.element_width);
13012 lane_count = width / info.element_width;
13014 mode = GET_MODE_INNER (mode);
13015 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13017 gcc_assert (info.shift == 0 && ! info.mvn);
13018 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13019 move immediate path. */
13020 if (aarch64_float_const_zero_rtx_p (info.value))
13021 info.value = GEN_INT (0);
13022 else
13024 const unsigned int buf_size = 20;
13025 char float_buf[buf_size] = {'\0'};
13026 real_to_decimal_for_mode (float_buf,
13027 CONST_DOUBLE_REAL_VALUE (info.value),
13028 buf_size, buf_size, 1, mode);
13030 if (lane_count == 1)
13031 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13032 else
13033 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13034 lane_count, element_char, float_buf);
13035 return templ;
13039 mnemonic = info.mvn ? "mvni" : "movi";
13040 shift_op = info.msl ? "msl" : "lsl";
13042 gcc_assert (CONST_INT_P (info.value));
13043 if (lane_count == 1)
13044 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13045 mnemonic, UINTVAL (info.value));
13046 else if (info.shift)
13047 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13048 ", %s %d", mnemonic, lane_count, element_char,
13049 UINTVAL (info.value), shift_op, info.shift);
13050 else
13051 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13052 mnemonic, lane_count, element_char, UINTVAL (info.value));
13053 return templ;
13056 char*
13057 aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
13060 /* If a floating point number was passed and we desire to use it in an
13061 integer mode do the conversion to integer. */
13062 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13064 unsigned HOST_WIDE_INT ival;
13065 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13066 gcc_unreachable ();
13067 immediate = gen_int_mode (ival, mode);
13070 machine_mode vmode;
13071 /* use a 64 bit mode for everything except for DI/DF mode, where we use
13072 a 128 bit vector mode. */
13073 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13075 gcc_assert (!VECTOR_MODE_P (mode));
13076 vmode = aarch64_simd_container_mode (mode, width);
13077 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13078 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13081 /* Split operands into moves from op[1] + op[2] into op[0]. */
13083 void
13084 aarch64_split_combinev16qi (rtx operands[3])
13086 unsigned int dest = REGNO (operands[0]);
13087 unsigned int src1 = REGNO (operands[1]);
13088 unsigned int src2 = REGNO (operands[2]);
13089 machine_mode halfmode = GET_MODE (operands[1]);
13090 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13091 rtx destlo, desthi;
13093 gcc_assert (halfmode == V16QImode);
13095 if (src1 == dest && src2 == dest + halfregs)
13097 /* No-op move. Can't split to nothing; emit something. */
13098 emit_note (NOTE_INSN_DELETED);
13099 return;
13102 /* Preserve register attributes for variable tracking. */
13103 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13104 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13105 GET_MODE_SIZE (halfmode));
13107 /* Special case of reversed high/low parts. */
13108 if (reg_overlap_mentioned_p (operands[2], destlo)
13109 && reg_overlap_mentioned_p (operands[1], desthi))
13111 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13112 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13113 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13115 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13117 /* Try to avoid unnecessary moves if part of the result
13118 is in the right place already. */
13119 if (src1 != dest)
13120 emit_move_insn (destlo, operands[1]);
13121 if (src2 != dest + halfregs)
13122 emit_move_insn (desthi, operands[2]);
13124 else
13126 if (src2 != dest + halfregs)
13127 emit_move_insn (desthi, operands[2]);
13128 if (src1 != dest)
13129 emit_move_insn (destlo, operands[1]);
13133 /* vec_perm support. */
13135 #define MAX_VECT_LEN 16
13137 struct expand_vec_perm_d
13139 rtx target, op0, op1;
13140 unsigned char perm[MAX_VECT_LEN];
13141 machine_mode vmode;
13142 unsigned char nelt;
13143 bool one_vector_p;
13144 bool testing_p;
13147 /* Generate a variable permutation. */
13149 static void
13150 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13152 machine_mode vmode = GET_MODE (target);
13153 bool one_vector_p = rtx_equal_p (op0, op1);
13155 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13156 gcc_checking_assert (GET_MODE (op0) == vmode);
13157 gcc_checking_assert (GET_MODE (op1) == vmode);
13158 gcc_checking_assert (GET_MODE (sel) == vmode);
13159 gcc_checking_assert (TARGET_SIMD);
13161 if (one_vector_p)
13163 if (vmode == V8QImode)
13165 /* Expand the argument to a V16QI mode by duplicating it. */
13166 rtx pair = gen_reg_rtx (V16QImode);
13167 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13168 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13170 else
13172 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13175 else
13177 rtx pair;
13179 if (vmode == V8QImode)
13181 pair = gen_reg_rtx (V16QImode);
13182 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13183 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13185 else
13187 pair = gen_reg_rtx (OImode);
13188 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13189 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13194 void
13195 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13197 machine_mode vmode = GET_MODE (target);
13198 unsigned int nelt = GET_MODE_NUNITS (vmode);
13199 bool one_vector_p = rtx_equal_p (op0, op1);
13200 rtx mask;
13202 /* The TBL instruction does not use a modulo index, so we must take care
13203 of that ourselves. */
13204 mask = aarch64_simd_gen_const_vector_dup (vmode,
13205 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13206 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13208 /* For big-endian, we also need to reverse the index within the vector
13209 (but not which vector). */
13210 if (BYTES_BIG_ENDIAN)
13212 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13213 if (!one_vector_p)
13214 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13215 sel = expand_simple_binop (vmode, XOR, sel, mask,
13216 NULL, 0, OPTAB_LIB_WIDEN);
13218 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13221 /* Recognize patterns suitable for the TRN instructions. */
13222 static bool
13223 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13225 unsigned int i, odd, mask, nelt = d->nelt;
13226 rtx out, in0, in1, x;
13227 rtx (*gen) (rtx, rtx, rtx);
13228 machine_mode vmode = d->vmode;
13230 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13231 return false;
13233 /* Note that these are little-endian tests.
13234 We correct for big-endian later. */
13235 if (d->perm[0] == 0)
13236 odd = 0;
13237 else if (d->perm[0] == 1)
13238 odd = 1;
13239 else
13240 return false;
13241 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13243 for (i = 0; i < nelt; i += 2)
13245 if (d->perm[i] != i + odd)
13246 return false;
13247 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13248 return false;
13251 /* Success! */
13252 if (d->testing_p)
13253 return true;
13255 in0 = d->op0;
13256 in1 = d->op1;
13257 if (BYTES_BIG_ENDIAN)
13259 x = in0, in0 = in1, in1 = x;
13260 odd = !odd;
13262 out = d->target;
13264 if (odd)
13266 switch (vmode)
13268 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13269 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13270 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13271 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13272 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13273 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13274 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13275 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13276 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13277 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13278 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13279 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13280 default:
13281 return false;
13284 else
13286 switch (vmode)
13288 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13289 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13290 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13291 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13292 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13293 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13294 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13295 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13296 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13297 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13298 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13299 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13300 default:
13301 return false;
13305 emit_insn (gen (out, in0, in1));
13306 return true;
13309 /* Recognize patterns suitable for the UZP instructions. */
13310 static bool
13311 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13313 unsigned int i, odd, mask, nelt = d->nelt;
13314 rtx out, in0, in1, x;
13315 rtx (*gen) (rtx, rtx, rtx);
13316 machine_mode vmode = d->vmode;
13318 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13319 return false;
13321 /* Note that these are little-endian tests.
13322 We correct for big-endian later. */
13323 if (d->perm[0] == 0)
13324 odd = 0;
13325 else if (d->perm[0] == 1)
13326 odd = 1;
13327 else
13328 return false;
13329 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13331 for (i = 0; i < nelt; i++)
13333 unsigned elt = (i * 2 + odd) & mask;
13334 if (d->perm[i] != elt)
13335 return false;
13338 /* Success! */
13339 if (d->testing_p)
13340 return true;
13342 in0 = d->op0;
13343 in1 = d->op1;
13344 if (BYTES_BIG_ENDIAN)
13346 x = in0, in0 = in1, in1 = x;
13347 odd = !odd;
13349 out = d->target;
13351 if (odd)
13353 switch (vmode)
13355 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13356 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13357 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13358 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13359 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13360 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13361 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13362 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13363 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13364 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13365 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13366 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13367 default:
13368 return false;
13371 else
13373 switch (vmode)
13375 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13376 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13377 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13378 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13379 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13380 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13381 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13382 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13383 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13384 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13385 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13386 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13387 default:
13388 return false;
13392 emit_insn (gen (out, in0, in1));
13393 return true;
13396 /* Recognize patterns suitable for the ZIP instructions. */
13397 static bool
13398 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13400 unsigned int i, high, mask, nelt = d->nelt;
13401 rtx out, in0, in1, x;
13402 rtx (*gen) (rtx, rtx, rtx);
13403 machine_mode vmode = d->vmode;
13405 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13406 return false;
13408 /* Note that these are little-endian tests.
13409 We correct for big-endian later. */
13410 high = nelt / 2;
13411 if (d->perm[0] == high)
13412 /* Do Nothing. */
13414 else if (d->perm[0] == 0)
13415 high = 0;
13416 else
13417 return false;
13418 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13420 for (i = 0; i < nelt / 2; i++)
13422 unsigned elt = (i + high) & mask;
13423 if (d->perm[i * 2] != elt)
13424 return false;
13425 elt = (elt + nelt) & mask;
13426 if (d->perm[i * 2 + 1] != elt)
13427 return false;
13430 /* Success! */
13431 if (d->testing_p)
13432 return true;
13434 in0 = d->op0;
13435 in1 = d->op1;
13436 if (BYTES_BIG_ENDIAN)
13438 x = in0, in0 = in1, in1 = x;
13439 high = !high;
13441 out = d->target;
13443 if (high)
13445 switch (vmode)
13447 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13448 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13449 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13450 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13451 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13452 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13453 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13454 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13455 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13456 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13457 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13458 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13459 default:
13460 return false;
13463 else
13465 switch (vmode)
13467 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13468 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13469 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13470 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13471 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13472 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13473 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13474 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13475 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13476 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13477 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13478 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13479 default:
13480 return false;
13484 emit_insn (gen (out, in0, in1));
13485 return true;
13488 /* Recognize patterns for the EXT insn. */
13490 static bool
13491 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13493 unsigned int i, nelt = d->nelt;
13494 rtx (*gen) (rtx, rtx, rtx, rtx);
13495 rtx offset;
13497 unsigned int location = d->perm[0]; /* Always < nelt. */
13499 /* Check if the extracted indices are increasing by one. */
13500 for (i = 1; i < nelt; i++)
13502 unsigned int required = location + i;
13503 if (d->one_vector_p)
13505 /* We'll pass the same vector in twice, so allow indices to wrap. */
13506 required &= (nelt - 1);
13508 if (d->perm[i] != required)
13509 return false;
13512 switch (d->vmode)
13514 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13515 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13516 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13517 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13518 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13519 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13520 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13521 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13522 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13523 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13524 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13525 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13526 default:
13527 return false;
13530 /* Success! */
13531 if (d->testing_p)
13532 return true;
13534 /* The case where (location == 0) is a no-op for both big- and little-endian,
13535 and is removed by the mid-end at optimization levels -O1 and higher. */
13537 if (BYTES_BIG_ENDIAN && (location != 0))
13539 /* After setup, we want the high elements of the first vector (stored
13540 at the LSB end of the register), and the low elements of the second
13541 vector (stored at the MSB end of the register). So swap. */
13542 std::swap (d->op0, d->op1);
13543 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13544 location = nelt - location;
13547 offset = GEN_INT (location);
13548 emit_insn (gen (d->target, d->op0, d->op1, offset));
13549 return true;
13552 /* Recognize patterns for the REV insns. */
13554 static bool
13555 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13557 unsigned int i, j, diff, nelt = d->nelt;
13558 rtx (*gen) (rtx, rtx);
13560 if (!d->one_vector_p)
13561 return false;
13563 diff = d->perm[0];
13564 switch (diff)
13566 case 7:
13567 switch (d->vmode)
13569 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13570 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13571 default:
13572 return false;
13574 break;
13575 case 3:
13576 switch (d->vmode)
13578 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13579 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13580 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13581 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13582 default:
13583 return false;
13585 break;
13586 case 1:
13587 switch (d->vmode)
13589 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13590 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13591 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13592 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13593 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13594 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13595 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13596 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13597 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13598 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13599 default:
13600 return false;
13602 break;
13603 default:
13604 return false;
13607 for (i = 0; i < nelt ; i += diff + 1)
13608 for (j = 0; j <= diff; j += 1)
13610 /* This is guaranteed to be true as the value of diff
13611 is 7, 3, 1 and we should have enough elements in the
13612 queue to generate this. Getting a vector mask with a
13613 value of diff other than these values implies that
13614 something is wrong by the time we get here. */
13615 gcc_assert (i + j < nelt);
13616 if (d->perm[i + j] != i + diff - j)
13617 return false;
13620 /* Success! */
13621 if (d->testing_p)
13622 return true;
13624 emit_insn (gen (d->target, d->op0));
13625 return true;
13628 static bool
13629 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13631 rtx (*gen) (rtx, rtx, rtx);
13632 rtx out = d->target;
13633 rtx in0;
13634 machine_mode vmode = d->vmode;
13635 unsigned int i, elt, nelt = d->nelt;
13636 rtx lane;
13638 elt = d->perm[0];
13639 for (i = 1; i < nelt; i++)
13641 if (elt != d->perm[i])
13642 return false;
13645 /* The generic preparation in aarch64_expand_vec_perm_const_1
13646 swaps the operand order and the permute indices if it finds
13647 d->perm[0] to be in the second operand. Thus, we can always
13648 use d->op0 and need not do any extra arithmetic to get the
13649 correct lane number. */
13650 in0 = d->op0;
13651 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13653 switch (vmode)
13655 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13656 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13657 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13658 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13659 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13660 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13661 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13662 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13663 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13664 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13665 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13666 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13667 default:
13668 return false;
13671 emit_insn (gen (out, in0, lane));
13672 return true;
13675 static bool
13676 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13678 rtx rperm[MAX_VECT_LEN], sel;
13679 machine_mode vmode = d->vmode;
13680 unsigned int i, nelt = d->nelt;
13682 if (d->testing_p)
13683 return true;
13685 /* Generic code will try constant permutation twice. Once with the
13686 original mode and again with the elements lowered to QImode.
13687 So wait and don't do the selector expansion ourselves. */
13688 if (vmode != V8QImode && vmode != V16QImode)
13689 return false;
13691 for (i = 0; i < nelt; ++i)
13693 int nunits = GET_MODE_NUNITS (vmode);
13695 /* If big-endian and two vectors we end up with a weird mixed-endian
13696 mode on NEON. Reverse the index within each word but not the word
13697 itself. */
13698 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13699 : d->perm[i]);
13701 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13702 sel = force_reg (vmode, sel);
13704 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13705 return true;
13708 static bool
13709 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13711 /* The pattern matching functions above are written to look for a small
13712 number to begin the sequence (0, 1, N/2). If we begin with an index
13713 from the second operand, we can swap the operands. */
13714 if (d->perm[0] >= d->nelt)
13716 unsigned i, nelt = d->nelt;
13718 gcc_assert (nelt == (nelt & -nelt));
13719 for (i = 0; i < nelt; ++i)
13720 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13722 std::swap (d->op0, d->op1);
13725 if (TARGET_SIMD)
13727 if (aarch64_evpc_rev (d))
13728 return true;
13729 else if (aarch64_evpc_ext (d))
13730 return true;
13731 else if (aarch64_evpc_dup (d))
13732 return true;
13733 else if (aarch64_evpc_zip (d))
13734 return true;
13735 else if (aarch64_evpc_uzp (d))
13736 return true;
13737 else if (aarch64_evpc_trn (d))
13738 return true;
13739 return aarch64_evpc_tbl (d);
13741 return false;
13744 /* Expand a vec_perm_const pattern. */
13746 bool
13747 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13749 struct expand_vec_perm_d d;
13750 int i, nelt, which;
13752 d.target = target;
13753 d.op0 = op0;
13754 d.op1 = op1;
13756 d.vmode = GET_MODE (target);
13757 gcc_assert (VECTOR_MODE_P (d.vmode));
13758 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13759 d.testing_p = false;
13761 for (i = which = 0; i < nelt; ++i)
13763 rtx e = XVECEXP (sel, 0, i);
13764 int ei = INTVAL (e) & (2 * nelt - 1);
13765 which |= (ei < nelt ? 1 : 2);
13766 d.perm[i] = ei;
13769 switch (which)
13771 default:
13772 gcc_unreachable ();
13774 case 3:
13775 d.one_vector_p = false;
13776 if (!rtx_equal_p (op0, op1))
13777 break;
13779 /* The elements of PERM do not suggest that only the first operand
13780 is used, but both operands are identical. Allow easier matching
13781 of the permutation by folding the permutation into the single
13782 input vector. */
13783 /* Fall Through. */
13784 case 2:
13785 for (i = 0; i < nelt; ++i)
13786 d.perm[i] &= nelt - 1;
13787 d.op0 = op1;
13788 d.one_vector_p = true;
13789 break;
13791 case 1:
13792 d.op1 = op0;
13793 d.one_vector_p = true;
13794 break;
13797 return aarch64_expand_vec_perm_const_1 (&d);
13800 static bool
13801 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13802 const unsigned char *sel)
13804 struct expand_vec_perm_d d;
13805 unsigned int i, nelt, which;
13806 bool ret;
13808 d.vmode = vmode;
13809 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13810 d.testing_p = true;
13811 memcpy (d.perm, sel, nelt);
13813 /* Calculate whether all elements are in one vector. */
13814 for (i = which = 0; i < nelt; ++i)
13816 unsigned char e = d.perm[i];
13817 gcc_assert (e < 2 * nelt);
13818 which |= (e < nelt ? 1 : 2);
13821 /* If all elements are from the second vector, reindex as if from the
13822 first vector. */
13823 if (which == 2)
13824 for (i = 0; i < nelt; ++i)
13825 d.perm[i] -= nelt;
13827 /* Check whether the mask can be applied to a single vector. */
13828 d.one_vector_p = (which != 3);
13830 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13831 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13832 if (!d.one_vector_p)
13833 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13835 start_sequence ();
13836 ret = aarch64_expand_vec_perm_const_1 (&d);
13837 end_sequence ();
13839 return ret;
13843 aarch64_reverse_mask (machine_mode mode)
13845 /* We have to reverse each vector because we dont have
13846 a permuted load that can reverse-load according to ABI rules. */
13847 rtx mask;
13848 rtvec v = rtvec_alloc (16);
13849 int i, j;
13850 int nunits = GET_MODE_NUNITS (mode);
13851 int usize = GET_MODE_UNIT_SIZE (mode);
13853 gcc_assert (BYTES_BIG_ENDIAN);
13854 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13856 for (i = 0; i < nunits; i++)
13857 for (j = 0; j < usize; j++)
13858 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13859 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13860 return force_reg (V16QImode, mask);
13863 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13864 true. However due to issues with register allocation it is preferable
13865 to avoid tieing integer scalar and FP scalar modes. Executing integer
13866 operations in general registers is better than treating them as scalar
13867 vector operations. This reduces latency and avoids redundant int<->FP
13868 moves. So tie modes if they are either the same class, or vector modes
13869 with other vector modes, vector structs or any scalar mode. */
13871 static bool
13872 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13874 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13875 return true;
13877 /* We specifically want to allow elements of "structure" modes to
13878 be tieable to the structure. This more general condition allows
13879 other rarer situations too. */
13880 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13881 return true;
13883 /* Also allow any scalar modes with vectors. */
13884 if (aarch64_vector_mode_supported_p (mode1)
13885 || aarch64_vector_mode_supported_p (mode2))
13886 return true;
13888 return false;
13891 /* Return a new RTX holding the result of moving POINTER forward by
13892 AMOUNT bytes. */
13894 static rtx
13895 aarch64_move_pointer (rtx pointer, int amount)
13897 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13899 return adjust_automodify_address (pointer, GET_MODE (pointer),
13900 next, amount);
13903 /* Return a new RTX holding the result of moving POINTER forward by the
13904 size of the mode it points to. */
13906 static rtx
13907 aarch64_progress_pointer (rtx pointer)
13909 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13911 return aarch64_move_pointer (pointer, amount);
13914 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13915 MODE bytes. */
13917 static void
13918 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13919 machine_mode mode)
13921 rtx reg = gen_reg_rtx (mode);
13923 /* "Cast" the pointers to the correct mode. */
13924 *src = adjust_address (*src, mode, 0);
13925 *dst = adjust_address (*dst, mode, 0);
13926 /* Emit the memcpy. */
13927 emit_move_insn (reg, *src);
13928 emit_move_insn (*dst, reg);
13929 /* Move the pointers forward. */
13930 *src = aarch64_progress_pointer (*src);
13931 *dst = aarch64_progress_pointer (*dst);
13934 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13935 we succeed, otherwise return false. */
13937 bool
13938 aarch64_expand_movmem (rtx *operands)
13940 unsigned int n;
13941 rtx dst = operands[0];
13942 rtx src = operands[1];
13943 rtx base;
13944 bool speed_p = !optimize_function_for_size_p (cfun);
13946 /* When optimizing for size, give a better estimate of the length of a
13947 memcpy call, but use the default otherwise. */
13948 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13950 /* We can't do anything smart if the amount to copy is not constant. */
13951 if (!CONST_INT_P (operands[2]))
13952 return false;
13954 n = UINTVAL (operands[2]);
13956 /* Try to keep the number of instructions low. For cases below 16 bytes we
13957 need to make at most two moves. For cases above 16 bytes it will be one
13958 move for each 16 byte chunk, then at most two additional moves. */
13959 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13960 return false;
13962 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13963 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13965 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13966 src = adjust_automodify_address (src, VOIDmode, base, 0);
13968 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13969 1-byte chunk. */
13970 if (n < 4)
13972 if (n >= 2)
13974 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13975 n -= 2;
13978 if (n == 1)
13979 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13981 return true;
13984 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13985 4-byte chunk, partially overlapping with the previously copied chunk. */
13986 if (n < 8)
13988 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13989 n -= 4;
13990 if (n > 0)
13992 int move = n - 4;
13994 src = aarch64_move_pointer (src, move);
13995 dst = aarch64_move_pointer (dst, move);
13996 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13998 return true;
14001 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14002 them, then (if applicable) an 8-byte chunk. */
14003 while (n >= 8)
14005 if (n / 16)
14007 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14008 n -= 16;
14010 else
14012 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14013 n -= 8;
14017 /* Finish the final bytes of the copy. We can always do this in one
14018 instruction. We either copy the exact amount we need, or partially
14019 overlap with the previous chunk we copied and copy 8-bytes. */
14020 if (n == 0)
14021 return true;
14022 else if (n == 1)
14023 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14024 else if (n == 2)
14025 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14026 else if (n == 4)
14027 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14028 else
14030 if (n == 3)
14032 src = aarch64_move_pointer (src, -1);
14033 dst = aarch64_move_pointer (dst, -1);
14034 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14036 else
14038 int move = n - 8;
14040 src = aarch64_move_pointer (src, move);
14041 dst = aarch64_move_pointer (dst, move);
14042 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14046 return true;
14049 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14050 SImode stores. Handle the case when the constant has identical
14051 bottom and top halves. This is beneficial when the two stores can be
14052 merged into an STP and we avoid synthesising potentially expensive
14053 immediates twice. Return true if such a split is possible. */
14055 bool
14056 aarch64_split_dimode_const_store (rtx dst, rtx src)
14058 rtx lo = gen_lowpart (SImode, src);
14059 rtx hi = gen_highpart_mode (SImode, DImode, src);
14061 bool size_p = optimize_function_for_size_p (cfun);
14063 if (!rtx_equal_p (lo, hi))
14064 return false;
14066 unsigned int orig_cost
14067 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14068 unsigned int lo_cost
14069 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14071 /* We want to transform:
14072 MOV x1, 49370
14073 MOVK x1, 0x140, lsl 16
14074 MOVK x1, 0xc0da, lsl 32
14075 MOVK x1, 0x140, lsl 48
14076 STR x1, [x0]
14077 into:
14078 MOV w1, 49370
14079 MOVK w1, 0x140, lsl 16
14080 STP w1, w1, [x0]
14081 So we want to perform this only when we save two instructions
14082 or more. When optimizing for size, however, accept any code size
14083 savings we can. */
14084 if (size_p && orig_cost <= lo_cost)
14085 return false;
14087 if (!size_p
14088 && (orig_cost <= lo_cost + 1))
14089 return false;
14091 rtx mem_lo = adjust_address (dst, SImode, 0);
14092 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14093 return false;
14095 rtx tmp_reg = gen_reg_rtx (SImode);
14096 aarch64_expand_mov_immediate (tmp_reg, lo);
14097 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14098 /* Don't emit an explicit store pair as this may not be always profitable.
14099 Let the sched-fusion logic decide whether to merge them. */
14100 emit_move_insn (mem_lo, tmp_reg);
14101 emit_move_insn (mem_hi, tmp_reg);
14103 return true;
14106 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14108 static unsigned HOST_WIDE_INT
14109 aarch64_asan_shadow_offset (void)
14111 return (HOST_WIDE_INT_1 << 36);
14114 static bool
14115 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14116 unsigned int align,
14117 enum by_pieces_operation op,
14118 bool speed_p)
14120 /* STORE_BY_PIECES can be used when copying a constant string, but
14121 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14122 For now we always fail this and let the move_by_pieces code copy
14123 the string from read-only memory. */
14124 if (op == STORE_BY_PIECES)
14125 return false;
14127 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14130 static rtx
14131 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14132 int code, tree treeop0, tree treeop1)
14134 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14135 rtx op0, op1;
14136 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14137 insn_code icode;
14138 struct expand_operand ops[4];
14140 start_sequence ();
14141 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14143 op_mode = GET_MODE (op0);
14144 if (op_mode == VOIDmode)
14145 op_mode = GET_MODE (op1);
14147 switch (op_mode)
14149 case E_QImode:
14150 case E_HImode:
14151 case E_SImode:
14152 cmp_mode = SImode;
14153 icode = CODE_FOR_cmpsi;
14154 break;
14156 case E_DImode:
14157 cmp_mode = DImode;
14158 icode = CODE_FOR_cmpdi;
14159 break;
14161 case E_SFmode:
14162 cmp_mode = SFmode;
14163 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14164 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14165 break;
14167 case E_DFmode:
14168 cmp_mode = DFmode;
14169 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14170 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14171 break;
14173 default:
14174 end_sequence ();
14175 return NULL_RTX;
14178 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14179 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14180 if (!op0 || !op1)
14182 end_sequence ();
14183 return NULL_RTX;
14185 *prep_seq = get_insns ();
14186 end_sequence ();
14188 create_fixed_operand (&ops[0], op0);
14189 create_fixed_operand (&ops[1], op1);
14191 start_sequence ();
14192 if (!maybe_expand_insn (icode, 2, ops))
14194 end_sequence ();
14195 return NULL_RTX;
14197 *gen_seq = get_insns ();
14198 end_sequence ();
14200 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14201 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14204 static rtx
14205 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14206 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14208 rtx op0, op1, target;
14209 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14210 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14211 insn_code icode;
14212 struct expand_operand ops[6];
14213 int aarch64_cond;
14215 push_to_sequence (*prep_seq);
14216 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14218 op_mode = GET_MODE (op0);
14219 if (op_mode == VOIDmode)
14220 op_mode = GET_MODE (op1);
14222 switch (op_mode)
14224 case E_QImode:
14225 case E_HImode:
14226 case E_SImode:
14227 cmp_mode = SImode;
14228 icode = CODE_FOR_ccmpsi;
14229 break;
14231 case E_DImode:
14232 cmp_mode = DImode;
14233 icode = CODE_FOR_ccmpdi;
14234 break;
14236 case E_SFmode:
14237 cmp_mode = SFmode;
14238 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14239 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14240 break;
14242 case E_DFmode:
14243 cmp_mode = DFmode;
14244 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14245 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14246 break;
14248 default:
14249 end_sequence ();
14250 return NULL_RTX;
14253 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14254 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14255 if (!op0 || !op1)
14257 end_sequence ();
14258 return NULL_RTX;
14260 *prep_seq = get_insns ();
14261 end_sequence ();
14263 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14264 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14266 if (bit_code != AND)
14268 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14269 GET_MODE (XEXP (prev, 0))),
14270 VOIDmode, XEXP (prev, 0), const0_rtx);
14271 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14274 create_fixed_operand (&ops[0], XEXP (prev, 0));
14275 create_fixed_operand (&ops[1], target);
14276 create_fixed_operand (&ops[2], op0);
14277 create_fixed_operand (&ops[3], op1);
14278 create_fixed_operand (&ops[4], prev);
14279 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14281 push_to_sequence (*gen_seq);
14282 if (!maybe_expand_insn (icode, 6, ops))
14284 end_sequence ();
14285 return NULL_RTX;
14288 *gen_seq = get_insns ();
14289 end_sequence ();
14291 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14294 #undef TARGET_GEN_CCMP_FIRST
14295 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14297 #undef TARGET_GEN_CCMP_NEXT
14298 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14300 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14301 instruction fusion of some sort. */
14303 static bool
14304 aarch64_macro_fusion_p (void)
14306 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14310 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14311 should be kept together during scheduling. */
14313 static bool
14314 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14316 rtx set_dest;
14317 rtx prev_set = single_set (prev);
14318 rtx curr_set = single_set (curr);
14319 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14320 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14322 if (!aarch64_macro_fusion_p ())
14323 return false;
14325 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14327 /* We are trying to match:
14328 prev (mov) == (set (reg r0) (const_int imm16))
14329 curr (movk) == (set (zero_extract (reg r0)
14330 (const_int 16)
14331 (const_int 16))
14332 (const_int imm16_1)) */
14334 set_dest = SET_DEST (curr_set);
14336 if (GET_CODE (set_dest) == ZERO_EXTRACT
14337 && CONST_INT_P (SET_SRC (curr_set))
14338 && CONST_INT_P (SET_SRC (prev_set))
14339 && CONST_INT_P (XEXP (set_dest, 2))
14340 && INTVAL (XEXP (set_dest, 2)) == 16
14341 && REG_P (XEXP (set_dest, 0))
14342 && REG_P (SET_DEST (prev_set))
14343 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14345 return true;
14349 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14352 /* We're trying to match:
14353 prev (adrp) == (set (reg r1)
14354 (high (symbol_ref ("SYM"))))
14355 curr (add) == (set (reg r0)
14356 (lo_sum (reg r1)
14357 (symbol_ref ("SYM"))))
14358 Note that r0 need not necessarily be the same as r1, especially
14359 during pre-regalloc scheduling. */
14361 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14362 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14364 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14365 && REG_P (XEXP (SET_SRC (curr_set), 0))
14366 && REGNO (XEXP (SET_SRC (curr_set), 0))
14367 == REGNO (SET_DEST (prev_set))
14368 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14369 XEXP (SET_SRC (curr_set), 1)))
14370 return true;
14374 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14377 /* We're trying to match:
14378 prev (movk) == (set (zero_extract (reg r0)
14379 (const_int 16)
14380 (const_int 32))
14381 (const_int imm16_1))
14382 curr (movk) == (set (zero_extract (reg r0)
14383 (const_int 16)
14384 (const_int 48))
14385 (const_int imm16_2)) */
14387 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14388 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14389 && REG_P (XEXP (SET_DEST (prev_set), 0))
14390 && REG_P (XEXP (SET_DEST (curr_set), 0))
14391 && REGNO (XEXP (SET_DEST (prev_set), 0))
14392 == REGNO (XEXP (SET_DEST (curr_set), 0))
14393 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14394 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14395 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14396 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14397 && CONST_INT_P (SET_SRC (prev_set))
14398 && CONST_INT_P (SET_SRC (curr_set)))
14399 return true;
14402 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14404 /* We're trying to match:
14405 prev (adrp) == (set (reg r0)
14406 (high (symbol_ref ("SYM"))))
14407 curr (ldr) == (set (reg r1)
14408 (mem (lo_sum (reg r0)
14409 (symbol_ref ("SYM")))))
14411 curr (ldr) == (set (reg r1)
14412 (zero_extend (mem
14413 (lo_sum (reg r0)
14414 (symbol_ref ("SYM")))))) */
14415 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14416 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14418 rtx curr_src = SET_SRC (curr_set);
14420 if (GET_CODE (curr_src) == ZERO_EXTEND)
14421 curr_src = XEXP (curr_src, 0);
14423 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14424 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14425 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14426 == REGNO (SET_DEST (prev_set))
14427 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14428 XEXP (SET_SRC (prev_set), 0)))
14429 return true;
14433 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14434 && aarch_crypto_can_dual_issue (prev, curr))
14435 return true;
14437 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14438 && any_condjump_p (curr))
14440 enum attr_type prev_type = get_attr_type (prev);
14442 unsigned int condreg1, condreg2;
14443 rtx cc_reg_1;
14444 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14445 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14447 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14448 && prev
14449 && modified_in_p (cc_reg_1, prev))
14451 /* FIXME: this misses some which is considered simple arthematic
14452 instructions for ThunderX. Simple shifts are missed here. */
14453 if (prev_type == TYPE_ALUS_SREG
14454 || prev_type == TYPE_ALUS_IMM
14455 || prev_type == TYPE_LOGICS_REG
14456 || prev_type == TYPE_LOGICS_IMM)
14457 return true;
14461 if (prev_set
14462 && curr_set
14463 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14464 && any_condjump_p (curr))
14466 /* We're trying to match:
14467 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14468 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14469 (const_int 0))
14470 (label_ref ("SYM"))
14471 (pc)) */
14472 if (SET_DEST (curr_set) == (pc_rtx)
14473 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14474 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14475 && REG_P (SET_DEST (prev_set))
14476 && REGNO (SET_DEST (prev_set))
14477 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14479 /* Fuse ALU operations followed by conditional branch instruction. */
14480 switch (get_attr_type (prev))
14482 case TYPE_ALU_IMM:
14483 case TYPE_ALU_SREG:
14484 case TYPE_ADC_REG:
14485 case TYPE_ADC_IMM:
14486 case TYPE_ADCS_REG:
14487 case TYPE_ADCS_IMM:
14488 case TYPE_LOGIC_REG:
14489 case TYPE_LOGIC_IMM:
14490 case TYPE_CSEL:
14491 case TYPE_ADR:
14492 case TYPE_MOV_IMM:
14493 case TYPE_SHIFT_REG:
14494 case TYPE_SHIFT_IMM:
14495 case TYPE_BFM:
14496 case TYPE_RBIT:
14497 case TYPE_REV:
14498 case TYPE_EXTEND:
14499 return true;
14501 default:;
14506 return false;
14509 /* Return true iff the instruction fusion described by OP is enabled. */
14511 bool
14512 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14514 return (aarch64_tune_params.fusible_ops & op) != 0;
14517 /* If MEM is in the form of [base+offset], extract the two parts
14518 of address and set to BASE and OFFSET, otherwise return false
14519 after clearing BASE and OFFSET. */
14521 bool
14522 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14524 rtx addr;
14526 gcc_assert (MEM_P (mem));
14528 addr = XEXP (mem, 0);
14530 if (REG_P (addr))
14532 *base = addr;
14533 *offset = const0_rtx;
14534 return true;
14537 if (GET_CODE (addr) == PLUS
14538 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14540 *base = XEXP (addr, 0);
14541 *offset = XEXP (addr, 1);
14542 return true;
14545 *base = NULL_RTX;
14546 *offset = NULL_RTX;
14548 return false;
14551 /* Types for scheduling fusion. */
14552 enum sched_fusion_type
14554 SCHED_FUSION_NONE = 0,
14555 SCHED_FUSION_LD_SIGN_EXTEND,
14556 SCHED_FUSION_LD_ZERO_EXTEND,
14557 SCHED_FUSION_LD,
14558 SCHED_FUSION_ST,
14559 SCHED_FUSION_NUM
14562 /* If INSN is a load or store of address in the form of [base+offset],
14563 extract the two parts and set to BASE and OFFSET. Return scheduling
14564 fusion type this INSN is. */
14566 static enum sched_fusion_type
14567 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14569 rtx x, dest, src;
14570 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14572 gcc_assert (INSN_P (insn));
14573 x = PATTERN (insn);
14574 if (GET_CODE (x) != SET)
14575 return SCHED_FUSION_NONE;
14577 src = SET_SRC (x);
14578 dest = SET_DEST (x);
14580 machine_mode dest_mode = GET_MODE (dest);
14582 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14583 return SCHED_FUSION_NONE;
14585 if (GET_CODE (src) == SIGN_EXTEND)
14587 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14588 src = XEXP (src, 0);
14589 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14590 return SCHED_FUSION_NONE;
14592 else if (GET_CODE (src) == ZERO_EXTEND)
14594 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14595 src = XEXP (src, 0);
14596 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14597 return SCHED_FUSION_NONE;
14600 if (GET_CODE (src) == MEM && REG_P (dest))
14601 extract_base_offset_in_addr (src, base, offset);
14602 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14604 fusion = SCHED_FUSION_ST;
14605 extract_base_offset_in_addr (dest, base, offset);
14607 else
14608 return SCHED_FUSION_NONE;
14610 if (*base == NULL_RTX || *offset == NULL_RTX)
14611 fusion = SCHED_FUSION_NONE;
14613 return fusion;
14616 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14618 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14619 and PRI are only calculated for these instructions. For other instruction,
14620 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14621 type instruction fusion can be added by returning different priorities.
14623 It's important that irrelevant instructions get the largest FUSION_PRI. */
14625 static void
14626 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14627 int *fusion_pri, int *pri)
14629 int tmp, off_val;
14630 rtx base, offset;
14631 enum sched_fusion_type fusion;
14633 gcc_assert (INSN_P (insn));
14635 tmp = max_pri - 1;
14636 fusion = fusion_load_store (insn, &base, &offset);
14637 if (fusion == SCHED_FUSION_NONE)
14639 *pri = tmp;
14640 *fusion_pri = tmp;
14641 return;
14644 /* Set FUSION_PRI according to fusion type and base register. */
14645 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14647 /* Calculate PRI. */
14648 tmp /= 2;
14650 /* INSN with smaller offset goes first. */
14651 off_val = (int)(INTVAL (offset));
14652 if (off_val >= 0)
14653 tmp -= (off_val & 0xfffff);
14654 else
14655 tmp += ((- off_val) & 0xfffff);
14657 *pri = tmp;
14658 return;
14661 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14662 Adjust priority of sha1h instructions so they are scheduled before
14663 other SHA1 instructions. */
14665 static int
14666 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14668 rtx x = PATTERN (insn);
14670 if (GET_CODE (x) == SET)
14672 x = SET_SRC (x);
14674 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14675 return priority + 10;
14678 return priority;
14681 /* Given OPERANDS of consecutive load/store, check if we can merge
14682 them into ldp/stp. LOAD is true if they are load instructions.
14683 MODE is the mode of memory operands. */
14685 bool
14686 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14687 machine_mode mode)
14689 HOST_WIDE_INT offval_1, offval_2, msize;
14690 enum reg_class rclass_1, rclass_2;
14691 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14693 if (load)
14695 mem_1 = operands[1];
14696 mem_2 = operands[3];
14697 reg_1 = operands[0];
14698 reg_2 = operands[2];
14699 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14700 if (REGNO (reg_1) == REGNO (reg_2))
14701 return false;
14703 else
14705 mem_1 = operands[0];
14706 mem_2 = operands[2];
14707 reg_1 = operands[1];
14708 reg_2 = operands[3];
14711 /* The mems cannot be volatile. */
14712 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14713 return false;
14715 /* If we have SImode and slow unaligned ldp,
14716 check the alignment to be at least 8 byte. */
14717 if (mode == SImode
14718 && (aarch64_tune_params.extra_tuning_flags
14719 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14720 && !optimize_size
14721 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14722 return false;
14724 /* Check if the addresses are in the form of [base+offset]. */
14725 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14726 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14727 return false;
14728 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14729 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14730 return false;
14732 /* Check if the bases are same. */
14733 if (!rtx_equal_p (base_1, base_2))
14734 return false;
14736 offval_1 = INTVAL (offset_1);
14737 offval_2 = INTVAL (offset_2);
14738 msize = GET_MODE_SIZE (mode);
14739 /* Check if the offsets are consecutive. */
14740 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14741 return false;
14743 /* Check if the addresses are clobbered by load. */
14744 if (load)
14746 if (reg_mentioned_p (reg_1, mem_1))
14747 return false;
14749 /* In increasing order, the last load can clobber the address. */
14750 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14751 return false;
14754 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14755 rclass_1 = FP_REGS;
14756 else
14757 rclass_1 = GENERAL_REGS;
14759 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14760 rclass_2 = FP_REGS;
14761 else
14762 rclass_2 = GENERAL_REGS;
14764 /* Check if the registers are of same class. */
14765 if (rclass_1 != rclass_2)
14766 return false;
14768 return true;
14771 /* Given OPERANDS of consecutive load/store, check if we can merge
14772 them into ldp/stp by adjusting the offset. LOAD is true if they
14773 are load instructions. MODE is the mode of memory operands.
14775 Given below consecutive stores:
14777 str w1, [xb, 0x100]
14778 str w1, [xb, 0x104]
14779 str w1, [xb, 0x108]
14780 str w1, [xb, 0x10c]
14782 Though the offsets are out of the range supported by stp, we can
14783 still pair them after adjusting the offset, like:
14785 add scratch, xb, 0x100
14786 stp w1, w1, [scratch]
14787 stp w1, w1, [scratch, 0x8]
14789 The peephole patterns detecting this opportunity should guarantee
14790 the scratch register is avaliable. */
14792 bool
14793 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14794 machine_mode mode)
14796 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14797 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14798 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14799 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14801 if (load)
14803 reg_1 = operands[0];
14804 mem_1 = operands[1];
14805 reg_2 = operands[2];
14806 mem_2 = operands[3];
14807 reg_3 = operands[4];
14808 mem_3 = operands[5];
14809 reg_4 = operands[6];
14810 mem_4 = operands[7];
14811 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14812 && REG_P (reg_3) && REG_P (reg_4));
14813 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14814 return false;
14816 else
14818 mem_1 = operands[0];
14819 reg_1 = operands[1];
14820 mem_2 = operands[2];
14821 reg_2 = operands[3];
14822 mem_3 = operands[4];
14823 reg_3 = operands[5];
14824 mem_4 = operands[6];
14825 reg_4 = operands[7];
14827 /* Skip if memory operand is by itslef valid for ldp/stp. */
14828 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14829 return false;
14831 /* The mems cannot be volatile. */
14832 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14833 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14834 return false;
14836 /* Check if the addresses are in the form of [base+offset]. */
14837 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14838 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14839 return false;
14840 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14841 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14842 return false;
14843 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14844 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14845 return false;
14846 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14847 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14848 return false;
14850 /* Check if the bases are same. */
14851 if (!rtx_equal_p (base_1, base_2)
14852 || !rtx_equal_p (base_2, base_3)
14853 || !rtx_equal_p (base_3, base_4))
14854 return false;
14856 offval_1 = INTVAL (offset_1);
14857 offval_2 = INTVAL (offset_2);
14858 offval_3 = INTVAL (offset_3);
14859 offval_4 = INTVAL (offset_4);
14860 msize = GET_MODE_SIZE (mode);
14861 /* Check if the offsets are consecutive. */
14862 if ((offval_1 != (offval_2 + msize)
14863 || offval_1 != (offval_3 + msize * 2)
14864 || offval_1 != (offval_4 + msize * 3))
14865 && (offval_4 != (offval_3 + msize)
14866 || offval_4 != (offval_2 + msize * 2)
14867 || offval_4 != (offval_1 + msize * 3)))
14868 return false;
14870 /* Check if the addresses are clobbered by load. */
14871 if (load)
14873 if (reg_mentioned_p (reg_1, mem_1)
14874 || reg_mentioned_p (reg_2, mem_2)
14875 || reg_mentioned_p (reg_3, mem_3))
14876 return false;
14878 /* In increasing order, the last load can clobber the address. */
14879 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14880 return false;
14883 /* If we have SImode and slow unaligned ldp,
14884 check the alignment to be at least 8 byte. */
14885 if (mode == SImode
14886 && (aarch64_tune_params.extra_tuning_flags
14887 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14888 && !optimize_size
14889 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14890 return false;
14892 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14893 rclass_1 = FP_REGS;
14894 else
14895 rclass_1 = GENERAL_REGS;
14897 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14898 rclass_2 = FP_REGS;
14899 else
14900 rclass_2 = GENERAL_REGS;
14902 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14903 rclass_3 = FP_REGS;
14904 else
14905 rclass_3 = GENERAL_REGS;
14907 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14908 rclass_4 = FP_REGS;
14909 else
14910 rclass_4 = GENERAL_REGS;
14912 /* Check if the registers are of same class. */
14913 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14914 return false;
14916 return true;
14919 /* Given OPERANDS of consecutive load/store, this function pairs them
14920 into ldp/stp after adjusting the offset. It depends on the fact
14921 that addresses of load/store instructions are in increasing order.
14922 MODE is the mode of memory operands. CODE is the rtl operator
14923 which should be applied to all memory operands, it's SIGN_EXTEND,
14924 ZERO_EXTEND or UNKNOWN. */
14926 bool
14927 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14928 machine_mode mode, RTX_CODE code)
14930 rtx base, offset, t1, t2;
14931 rtx mem_1, mem_2, mem_3, mem_4;
14932 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14934 if (load)
14936 mem_1 = operands[1];
14937 mem_2 = operands[3];
14938 mem_3 = operands[5];
14939 mem_4 = operands[7];
14941 else
14943 mem_1 = operands[0];
14944 mem_2 = operands[2];
14945 mem_3 = operands[4];
14946 mem_4 = operands[6];
14947 gcc_assert (code == UNKNOWN);
14950 extract_base_offset_in_addr (mem_1, &base, &offset);
14951 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14953 /* Adjust offset thus it can fit in ldp/stp instruction. */
14954 msize = GET_MODE_SIZE (mode);
14955 stp_off_limit = msize * 0x40;
14956 off_val = INTVAL (offset);
14957 abs_off = (off_val < 0) ? -off_val : off_val;
14958 new_off = abs_off % stp_off_limit;
14959 adj_off = abs_off - new_off;
14961 /* Further adjust to make sure all offsets are OK. */
14962 if ((new_off + msize * 2) >= stp_off_limit)
14964 adj_off += stp_off_limit;
14965 new_off -= stp_off_limit;
14968 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14969 if (adj_off >= 0x1000)
14970 return false;
14972 if (off_val < 0)
14974 adj_off = -adj_off;
14975 new_off = -new_off;
14978 /* Create new memory references. */
14979 mem_1 = change_address (mem_1, VOIDmode,
14980 plus_constant (DImode, operands[8], new_off));
14982 /* Check if the adjusted address is OK for ldp/stp. */
14983 if (!aarch64_mem_pair_operand (mem_1, mode))
14984 return false;
14986 msize = GET_MODE_SIZE (mode);
14987 mem_2 = change_address (mem_2, VOIDmode,
14988 plus_constant (DImode,
14989 operands[8],
14990 new_off + msize));
14991 mem_3 = change_address (mem_3, VOIDmode,
14992 plus_constant (DImode,
14993 operands[8],
14994 new_off + msize * 2));
14995 mem_4 = change_address (mem_4, VOIDmode,
14996 plus_constant (DImode,
14997 operands[8],
14998 new_off + msize * 3));
15000 if (code == ZERO_EXTEND)
15002 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15003 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15004 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15005 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15007 else if (code == SIGN_EXTEND)
15009 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15010 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15011 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15012 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15015 if (load)
15017 operands[1] = mem_1;
15018 operands[3] = mem_2;
15019 operands[5] = mem_3;
15020 operands[7] = mem_4;
15022 else
15024 operands[0] = mem_1;
15025 operands[2] = mem_2;
15026 operands[4] = mem_3;
15027 operands[6] = mem_4;
15030 /* Emit adjusting instruction. */
15031 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15032 /* Emit ldp/stp instructions. */
15033 t1 = gen_rtx_SET (operands[0], operands[1]);
15034 t2 = gen_rtx_SET (operands[2], operands[3]);
15035 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15036 t1 = gen_rtx_SET (operands[4], operands[5]);
15037 t2 = gen_rtx_SET (operands[6], operands[7]);
15038 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15039 return true;
15042 /* Return 1 if pseudo register should be created and used to hold
15043 GOT address for PIC code. */
15045 bool
15046 aarch64_use_pseudo_pic_reg (void)
15048 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15051 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15053 static int
15054 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15056 switch (XINT (x, 1))
15058 case UNSPEC_GOTSMALLPIC:
15059 case UNSPEC_GOTSMALLPIC28K:
15060 case UNSPEC_GOTTINYPIC:
15061 return 0;
15062 default:
15063 break;
15066 return default_unspec_may_trap_p (x, flags);
15070 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15071 return the log2 of that value. Otherwise return -1. */
15074 aarch64_fpconst_pow_of_2 (rtx x)
15076 const REAL_VALUE_TYPE *r;
15078 if (!CONST_DOUBLE_P (x))
15079 return -1;
15081 r = CONST_DOUBLE_REAL_VALUE (x);
15083 if (REAL_VALUE_NEGATIVE (*r)
15084 || REAL_VALUE_ISNAN (*r)
15085 || REAL_VALUE_ISINF (*r)
15086 || !real_isinteger (r, DFmode))
15087 return -1;
15089 return exact_log2 (real_to_integer (r));
15092 /* If X is a vector of equal CONST_DOUBLE values and that value is
15093 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15096 aarch64_vec_fpconst_pow_of_2 (rtx x)
15098 if (GET_CODE (x) != CONST_VECTOR)
15099 return -1;
15101 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15102 return -1;
15104 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15105 if (firstval <= 0)
15106 return -1;
15108 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15109 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15110 return -1;
15112 return firstval;
15115 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15116 to float.
15118 __fp16 always promotes through this hook.
15119 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15120 through the generic excess precision logic rather than here. */
15122 static tree
15123 aarch64_promoted_type (const_tree t)
15125 if (SCALAR_FLOAT_TYPE_P (t)
15126 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15127 return float_type_node;
15129 return NULL_TREE;
15132 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15134 static bool
15135 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15136 optimization_type opt_type)
15138 switch (op)
15140 case rsqrt_optab:
15141 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15143 default:
15144 return true;
15148 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15149 if MODE is HFmode, and punt to the generic implementation otherwise. */
15151 static bool
15152 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15154 return (mode == HFmode
15155 ? true
15156 : default_libgcc_floating_mode_supported_p (mode));
15159 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15160 if MODE is HFmode, and punt to the generic implementation otherwise. */
15162 static bool
15163 aarch64_scalar_mode_supported_p (scalar_mode mode)
15165 return (mode == HFmode
15166 ? true
15167 : default_scalar_mode_supported_p (mode));
15170 /* Set the value of FLT_EVAL_METHOD.
15171 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15173 0: evaluate all operations and constants, whose semantic type has at
15174 most the range and precision of type float, to the range and
15175 precision of float; evaluate all other operations and constants to
15176 the range and precision of the semantic type;
15178 N, where _FloatN is a supported interchange floating type
15179 evaluate all operations and constants, whose semantic type has at
15180 most the range and precision of _FloatN type, to the range and
15181 precision of the _FloatN type; evaluate all other operations and
15182 constants to the range and precision of the semantic type;
15184 If we have the ARMv8.2-A extensions then we support _Float16 in native
15185 precision, so we should set this to 16. Otherwise, we support the type,
15186 but want to evaluate expressions in float precision, so set this to
15187 0. */
15189 static enum flt_eval_method
15190 aarch64_excess_precision (enum excess_precision_type type)
15192 switch (type)
15194 case EXCESS_PRECISION_TYPE_FAST:
15195 case EXCESS_PRECISION_TYPE_STANDARD:
15196 /* We can calculate either in 16-bit range and precision or
15197 32-bit range and precision. Make that decision based on whether
15198 we have native support for the ARMv8.2-A 16-bit floating-point
15199 instructions or not. */
15200 return (TARGET_FP_F16INST
15201 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15202 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15203 case EXCESS_PRECISION_TYPE_IMPLICIT:
15204 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15205 default:
15206 gcc_unreachable ();
15208 return FLT_EVAL_METHOD_UNPREDICTABLE;
15211 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15212 scheduled for speculative execution. Reject the long-running division
15213 and square-root instructions. */
15215 static bool
15216 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15218 switch (get_attr_type (insn))
15220 case TYPE_SDIV:
15221 case TYPE_UDIV:
15222 case TYPE_FDIVS:
15223 case TYPE_FDIVD:
15224 case TYPE_FSQRTS:
15225 case TYPE_FSQRTD:
15226 case TYPE_NEON_FP_SQRT_S:
15227 case TYPE_NEON_FP_SQRT_D:
15228 case TYPE_NEON_FP_SQRT_S_Q:
15229 case TYPE_NEON_FP_SQRT_D_Q:
15230 case TYPE_NEON_FP_DIV_S:
15231 case TYPE_NEON_FP_DIV_D:
15232 case TYPE_NEON_FP_DIV_S_Q:
15233 case TYPE_NEON_FP_DIV_D_Q:
15234 return false;
15235 default:
15236 return true;
15240 /* Target-specific selftests. */
15242 #if CHECKING_P
15244 namespace selftest {
15246 /* Selftest for the RTL loader.
15247 Verify that the RTL loader copes with a dump from
15248 print_rtx_function. This is essentially just a test that class
15249 function_reader can handle a real dump, but it also verifies
15250 that lookup_reg_by_dump_name correctly handles hard regs.
15251 The presence of hard reg names in the dump means that the test is
15252 target-specific, hence it is in this file. */
15254 static void
15255 aarch64_test_loading_full_dump ()
15257 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15259 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15261 rtx_insn *insn_1 = get_insn_by_uid (1);
15262 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15264 rtx_insn *insn_15 = get_insn_by_uid (15);
15265 ASSERT_EQ (INSN, GET_CODE (insn_15));
15266 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15268 /* Verify crtl->return_rtx. */
15269 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15270 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15271 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15274 /* Run all target-specific selftests. */
15276 static void
15277 aarch64_run_selftests (void)
15279 aarch64_test_loading_full_dump ();
15282 } // namespace selftest
15284 #endif /* #if CHECKING_P */
15286 #undef TARGET_ADDRESS_COST
15287 #define TARGET_ADDRESS_COST aarch64_address_cost
15289 /* This hook will determines whether unnamed bitfields affect the alignment
15290 of the containing structure. The hook returns true if the structure
15291 should inherit the alignment requirements of an unnamed bitfield's
15292 type. */
15293 #undef TARGET_ALIGN_ANON_BITFIELD
15294 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15296 #undef TARGET_ASM_ALIGNED_DI_OP
15297 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15299 #undef TARGET_ASM_ALIGNED_HI_OP
15300 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15302 #undef TARGET_ASM_ALIGNED_SI_OP
15303 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15305 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15306 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15307 hook_bool_const_tree_hwi_hwi_const_tree_true
15309 #undef TARGET_ASM_FILE_START
15310 #define TARGET_ASM_FILE_START aarch64_start_file
15312 #undef TARGET_ASM_OUTPUT_MI_THUNK
15313 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15315 #undef TARGET_ASM_SELECT_RTX_SECTION
15316 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15318 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15319 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15321 #undef TARGET_BUILD_BUILTIN_VA_LIST
15322 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15324 #undef TARGET_CALLEE_COPIES
15325 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15327 #undef TARGET_CAN_ELIMINATE
15328 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15330 #undef TARGET_CAN_INLINE_P
15331 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15333 #undef TARGET_CANNOT_FORCE_CONST_MEM
15334 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15336 #undef TARGET_CASE_VALUES_THRESHOLD
15337 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15339 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15340 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15342 /* Only the least significant bit is used for initialization guard
15343 variables. */
15344 #undef TARGET_CXX_GUARD_MASK_BIT
15345 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15347 #undef TARGET_C_MODE_FOR_SUFFIX
15348 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15350 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15351 #undef TARGET_DEFAULT_TARGET_FLAGS
15352 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15353 #endif
15355 #undef TARGET_CLASS_MAX_NREGS
15356 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15358 #undef TARGET_BUILTIN_DECL
15359 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15361 #undef TARGET_BUILTIN_RECIPROCAL
15362 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15364 #undef TARGET_C_EXCESS_PRECISION
15365 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15367 #undef TARGET_EXPAND_BUILTIN
15368 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15370 #undef TARGET_EXPAND_BUILTIN_VA_START
15371 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15373 #undef TARGET_FOLD_BUILTIN
15374 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15376 #undef TARGET_FUNCTION_ARG
15377 #define TARGET_FUNCTION_ARG aarch64_function_arg
15379 #undef TARGET_FUNCTION_ARG_ADVANCE
15380 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15382 #undef TARGET_FUNCTION_ARG_BOUNDARY
15383 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15385 #undef TARGET_FUNCTION_ARG_PADDING
15386 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15388 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15389 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15391 #undef TARGET_FUNCTION_VALUE
15392 #define TARGET_FUNCTION_VALUE aarch64_function_value
15394 #undef TARGET_FUNCTION_VALUE_REGNO_P
15395 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15397 #undef TARGET_FRAME_POINTER_REQUIRED
15398 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15400 #undef TARGET_GIMPLE_FOLD_BUILTIN
15401 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15403 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15404 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15406 #undef TARGET_INIT_BUILTINS
15407 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15409 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15410 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15411 aarch64_ira_change_pseudo_allocno_class
15413 #undef TARGET_LEGITIMATE_ADDRESS_P
15414 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15416 #undef TARGET_LEGITIMATE_CONSTANT_P
15417 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15419 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15420 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15421 aarch64_legitimize_address_displacement
15423 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15424 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15426 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15427 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15428 aarch64_libgcc_floating_mode_supported_p
15430 #undef TARGET_MANGLE_TYPE
15431 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15433 #undef TARGET_MEMORY_MOVE_COST
15434 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15436 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15437 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15439 #undef TARGET_MUST_PASS_IN_STACK
15440 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15442 /* This target hook should return true if accesses to volatile bitfields
15443 should use the narrowest mode possible. It should return false if these
15444 accesses should use the bitfield container type. */
15445 #undef TARGET_NARROW_VOLATILE_BITFIELD
15446 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15448 #undef TARGET_OPTION_OVERRIDE
15449 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15451 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15452 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15453 aarch64_override_options_after_change
15455 #undef TARGET_OPTION_SAVE
15456 #define TARGET_OPTION_SAVE aarch64_option_save
15458 #undef TARGET_OPTION_RESTORE
15459 #define TARGET_OPTION_RESTORE aarch64_option_restore
15461 #undef TARGET_OPTION_PRINT
15462 #define TARGET_OPTION_PRINT aarch64_option_print
15464 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15465 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15467 #undef TARGET_SET_CURRENT_FUNCTION
15468 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15470 #undef TARGET_PASS_BY_REFERENCE
15471 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15473 #undef TARGET_PREFERRED_RELOAD_CLASS
15474 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15476 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15477 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15479 #undef TARGET_PROMOTED_TYPE
15480 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15482 #undef TARGET_SECONDARY_RELOAD
15483 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15485 #undef TARGET_SHIFT_TRUNCATION_MASK
15486 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15488 #undef TARGET_SETUP_INCOMING_VARARGS
15489 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15491 #undef TARGET_STRUCT_VALUE_RTX
15492 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15494 #undef TARGET_REGISTER_MOVE_COST
15495 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15497 #undef TARGET_RETURN_IN_MEMORY
15498 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15500 #undef TARGET_RETURN_IN_MSB
15501 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15503 #undef TARGET_RTX_COSTS
15504 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15506 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15507 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15509 #undef TARGET_SCHED_ISSUE_RATE
15510 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15512 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15513 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15514 aarch64_sched_first_cycle_multipass_dfa_lookahead
15516 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15517 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15518 aarch64_first_cycle_multipass_dfa_lookahead_guard
15520 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15521 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15522 aarch64_get_separate_components
15524 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15525 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15526 aarch64_components_for_bb
15528 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15529 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15530 aarch64_disqualify_components
15532 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15533 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15534 aarch64_emit_prologue_components
15536 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15537 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15538 aarch64_emit_epilogue_components
15540 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15541 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15542 aarch64_set_handled_components
15544 #undef TARGET_TRAMPOLINE_INIT
15545 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15547 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15548 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15550 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15551 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15553 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15554 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15555 aarch64_builtin_support_vector_misalignment
15557 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15558 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15560 #undef TARGET_VECTORIZE_ADD_STMT_COST
15561 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15563 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15564 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15565 aarch64_builtin_vectorization_cost
15567 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15568 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15570 #undef TARGET_VECTORIZE_BUILTINS
15571 #define TARGET_VECTORIZE_BUILTINS
15573 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15574 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15575 aarch64_builtin_vectorized_function
15577 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15578 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15579 aarch64_autovectorize_vector_sizes
15581 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15582 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15583 aarch64_atomic_assign_expand_fenv
15585 /* Section anchor support. */
15587 #undef TARGET_MIN_ANCHOR_OFFSET
15588 #define TARGET_MIN_ANCHOR_OFFSET -256
15590 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15591 byte offset; we can do much more for larger data types, but have no way
15592 to determine the size of the access. We assume accesses are aligned. */
15593 #undef TARGET_MAX_ANCHOR_OFFSET
15594 #define TARGET_MAX_ANCHOR_OFFSET 4095
15596 #undef TARGET_VECTOR_ALIGNMENT
15597 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15599 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15600 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15601 aarch64_simd_vector_alignment_reachable
15603 /* vec_perm support. */
15605 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15606 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15607 aarch64_vectorize_vec_perm_const_ok
15609 #undef TARGET_INIT_LIBFUNCS
15610 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15612 #undef TARGET_FIXED_CONDITION_CODE_REGS
15613 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15615 #undef TARGET_FLAGS_REGNUM
15616 #define TARGET_FLAGS_REGNUM CC_REGNUM
15618 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15619 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15621 #undef TARGET_ASAN_SHADOW_OFFSET
15622 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15624 #undef TARGET_LEGITIMIZE_ADDRESS
15625 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15627 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15628 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15629 aarch64_use_by_pieces_infrastructure_p
15631 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15632 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15634 #undef TARGET_CAN_USE_DOLOOP_P
15635 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15637 #undef TARGET_SCHED_ADJUST_PRIORITY
15638 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15640 #undef TARGET_SCHED_MACRO_FUSION_P
15641 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15643 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15644 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15646 #undef TARGET_SCHED_FUSION_PRIORITY
15647 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15649 #undef TARGET_UNSPEC_MAY_TRAP_P
15650 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15652 #undef TARGET_USE_PSEUDO_PIC_REG
15653 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15655 #undef TARGET_PRINT_OPERAND
15656 #define TARGET_PRINT_OPERAND aarch64_print_operand
15658 #undef TARGET_PRINT_OPERAND_ADDRESS
15659 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15661 #undef TARGET_OPTAB_SUPPORTED_P
15662 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15664 #undef TARGET_OMIT_STRUCT_RETURN_REG
15665 #define TARGET_OMIT_STRUCT_RETURN_REG true
15667 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15668 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15669 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15671 #undef TARGET_HARD_REGNO_MODE_OK
15672 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15674 #undef TARGET_MODES_TIEABLE_P
15675 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15677 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15678 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15679 aarch64_hard_regno_call_part_clobbered
15681 #if CHECKING_P
15682 #undef TARGET_RUN_TARGET_SELFTESTS
15683 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15684 #endif /* #if CHECKING_P */
15686 struct gcc_target targetm = TARGET_INITIALIZER;
15688 #include "gt-aarch64.h"