[AArch64] PR71307: Define union class of POINTER+FP
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC
98 A constant symbolic address, in the pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
145 vec_perm_indices);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actually 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Cortex-A57 costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* X-Gene 1 costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespective of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
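/* Return the minimum number of divisions by the same divisor that makes a
   reciprocal-multiply sequence worthwhile, taken from the current tuning
   parameters.  */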
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
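/* Return the reassociation width to use for operations in MODE, taken from
   the current tuning parameters.  */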
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
1070 /* Implement TARGET_HARD_REGNO_NREGS. */
1072 static unsigned int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
1086 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register, declared using asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return true;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1110 else
1111 return true;
1114 return false;
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1118 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1119 clobbers the top 64 bits when restoring the bottom 64 bits. */
1121 static bool
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1124 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1128 machine_mode
1129 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1130 machine_mode mode)
1132 /* Handle modes that fit within single registers. */
1133 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1135 if (GET_MODE_SIZE (mode) >= 4)
1136 return mode;
1137 else
1138 return SImode;
1140 /* Fall back to generic for multi-reg and very large modes. */
1141 else
1142 return choose_hard_reg_mode (regno, nregs, false);
1145 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1146 that strcpy from constants will be faster. */
1148 static HOST_WIDE_INT
1149 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1151 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1152 return MAX (align, BITS_PER_WORD);
1153 return align;
1156 /* Return true if calls to DECL should be treated as
1157 long-calls (i.e. called via a register). */
1158 static bool
1159 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1161 return false;
1164 /* Return true if calls to symbol-ref SYM should be treated as
1165 long-calls (i.e. called via a register). */
1166 bool
1167 aarch64_is_long_call_p (rtx sym)
1169 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1172 /* Return true if calls to symbol-ref SYM should not go through
1173 plt stubs. */
1175 bool
1176 aarch64_is_noplt_call_p (rtx sym)
1178 const_tree decl = SYMBOL_REF_DECL (sym);
1180 if (flag_pic
1181 && decl
1182 && (!flag_plt
1183 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1184 && !targetm.binds_local_p (decl))
1185 return true;
1187 return false;
1190 /* Return true if the offsets to a zero/sign-extract operation
1191 represent an expression that matches an extend operation. The
1192 operands represent the parameters from
1194 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1195 bool
1196 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1197 rtx extract_imm)
1199 HOST_WIDE_INT mult_val, extract_val;
1201 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1202 return false;
1204 mult_val = INTVAL (mult_imm);
1205 extract_val = INTVAL (extract_imm);
1207 if (extract_val > 8
1208 && extract_val < GET_MODE_BITSIZE (mode)
1209 && exact_log2 (extract_val & ~7) > 0
1210 && (extract_val & 7) <= 4
1211 && mult_val == (1 << (extract_val & 7)))
1212 return true;
1214 return false;
1217 /* Emit an insn that's a simple single-set. Both the operands must be
1218 known to be valid. */
1219 inline static rtx_insn *
1220 emit_set_insn (rtx x, rtx y)
1222 return emit_insn (gen_rtx_SET (x, y));
1225 /* X and Y are two things to compare using CODE. Emit the compare insn and
1226 return the rtx for register 0 in the proper mode. */
1228 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1230 machine_mode mode = SELECT_CC_MODE (code, x, y);
1231 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1233 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1234 return cc_reg;
1237 /* Build the SYMBOL_REF for __tls_get_addr. */
1239 static GTY(()) rtx tls_get_addr_libfunc;
1242 aarch64_tls_get_addr (void)
1244 if (!tls_get_addr_libfunc)
1245 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1246 return tls_get_addr_libfunc;
1249 /* Return the TLS model to use for ADDR. */
1251 static enum tls_model
1252 tls_symbolic_operand_type (rtx addr)
1254 enum tls_model tls_kind = TLS_MODEL_NONE;
1255 rtx sym, addend;
1257 if (GET_CODE (addr) == CONST)
1259 split_const (addr, &sym, &addend);
1260 if (GET_CODE (sym) == SYMBOL_REF)
1261 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1263 else if (GET_CODE (addr) == SYMBOL_REF)
1264 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1266 return tls_kind;
1269 /* We allow lo_sum expressions in our legitimate addresses
1270 so that combine can take care of combining addresses where
1271 necessary, but for generation purposes we generate the address
1272 as:
1273 RTL Absolute
1274 tmp = hi (symbol_ref); adrp x1, foo
1275 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1278 PIC TLS
1279 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1280 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1281 bl __tls_get_addr
1284 Load TLS symbol, depending on TLS mechanism and TLS access model.
1286 Global Dynamic - Traditional TLS:
1287 adrp tmp, :tlsgd:imm
1288 add dest, tmp, #:tlsgd_lo12:imm
1289 bl __tls_get_addr
1291 Global Dynamic - TLS Descriptors:
1292 adrp dest, :tlsdesc:imm
1293 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1294 add dest, dest, #:tlsdesc_lo12:imm
1295 blr tmp
1296 mrs tp, tpidr_el0
1297 add dest, dest, tp
1299 Initial Exec:
1300 mrs tp, tpidr_el0
1301 adrp tmp, :gottprel:imm
1302 ldr dest, [tmp, #:gottprel_lo12:imm]
1303 add dest, dest, tp
1305 Local Exec:
1306 mrs tp, tpidr_el0
1307 add t0, tp, #:tprel_hi12:imm, lsl #12
1308 add t0, t0, #:tprel_lo12_nc:imm
1311 static void
1312 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1313 enum aarch64_symbol_type type)
1315 switch (type)
1317 case SYMBOL_SMALL_ABSOLUTE:
1319 /* In ILP32, the mode of dest can be either SImode or DImode. */
1320 rtx tmp_reg = dest;
1321 machine_mode mode = GET_MODE (dest);
1323 gcc_assert (mode == Pmode || mode == ptr_mode);
1325 if (can_create_pseudo_p ())
1326 tmp_reg = gen_reg_rtx (mode);
1328 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1329 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1330 return;
1333 case SYMBOL_TINY_ABSOLUTE:
1334 emit_insn (gen_rtx_SET (dest, imm));
1335 return;
1337 case SYMBOL_SMALL_GOT_28K:
1339 machine_mode mode = GET_MODE (dest);
1340 rtx gp_rtx = pic_offset_table_rtx;
1341 rtx insn;
1342 rtx mem;
1344 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1345 here before RTL expansion. The tree IVOPTS pass generates RTL
1346 patterns to estimate rtx costs, in which case pic_offset_table_rtx
1347 is not yet initialized. In that case there is no need to generate
1348 the first adrp instruction, as the final cost of a global variable
1349 access is one instruction. */
1350 if (gp_rtx != NULL)
1352 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
1353 use the page base as the GOT base, the first page may be wasted; in
1354 the worst case only 28K of space is left for the GOT).
1356 The generated instruction sequence for accessing a global variable is:
1359 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1361 Only one instruction is needed, but we must initialize
1362 pic_offset_table_rtx properly. We generate an initialization insn for
1363 every global access, and let CSE remove all the redundant copies.
1365 The final instruction sequence for accessing multiple global
1366 variables will look like:
1368 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1370 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1371 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1372 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1373 ... */
1375 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1376 crtl->uses_pic_offset_table = 1;
1377 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1379 if (mode != GET_MODE (gp_rtx))
1380 gp_rtx = gen_lowpart (mode, gp_rtx);
1384 if (mode == ptr_mode)
1386 if (mode == DImode)
1387 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1388 else
1389 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1391 mem = XVECEXP (SET_SRC (insn), 0, 0);
1393 else
1395 gcc_assert (mode == Pmode);
1397 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1398 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1401 /* The operand is expected to be a MEM. Whenever the related insn
1402 pattern changes, the code above which computes mem must be
1403 updated as well. */
1404 gcc_assert (GET_CODE (mem) == MEM);
1405 MEM_READONLY_P (mem) = 1;
1406 MEM_NOTRAP_P (mem) = 1;
1407 emit_insn (insn);
1408 return;
1411 case SYMBOL_SMALL_GOT_4G:
1413 /* In ILP32, the mode of dest can be either SImode or DImode,
1414 while the GOT entry is always of SImode size. The mode of
1415 dest depends on how dest is used: if dest is assigned to a
1416 pointer (e.g. stored in memory), it has SImode; it may have
1417 DImode if dest is dereferenced to access memory.
1418 This is why we have to handle three different ldr_got_small
1419 patterns here (two patterns for ILP32). */
1421 rtx insn;
1422 rtx mem;
1423 rtx tmp_reg = dest;
1424 machine_mode mode = GET_MODE (dest);
1426 if (can_create_pseudo_p ())
1427 tmp_reg = gen_reg_rtx (mode);
1429 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1430 if (mode == ptr_mode)
1432 if (mode == DImode)
1433 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1434 else
1435 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1437 mem = XVECEXP (SET_SRC (insn), 0, 0);
1439 else
1441 gcc_assert (mode == Pmode);
1443 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1444 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1447 gcc_assert (GET_CODE (mem) == MEM);
1448 MEM_READONLY_P (mem) = 1;
1449 MEM_NOTRAP_P (mem) = 1;
1450 emit_insn (insn);
1451 return;
1454 case SYMBOL_SMALL_TLSGD:
1456 rtx_insn *insns;
1457 machine_mode mode = GET_MODE (dest);
1458 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1460 start_sequence ();
1461 if (TARGET_ILP32)
1462 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1463 else
1464 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1465 insns = get_insns ();
1466 end_sequence ();
1468 RTL_CONST_CALL_P (insns) = 1;
1469 emit_libcall_block (insns, dest, result, imm);
1470 return;
1473 case SYMBOL_SMALL_TLSDESC:
1475 machine_mode mode = GET_MODE (dest);
1476 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1477 rtx tp;
1479 gcc_assert (mode == Pmode || mode == ptr_mode);
1481 /* In ILP32, the got entry is always of SImode size. Unlike
1482 small GOT, the dest is fixed at reg 0. */
1483 if (TARGET_ILP32)
1484 emit_insn (gen_tlsdesc_small_si (imm));
1485 else
1486 emit_insn (gen_tlsdesc_small_di (imm));
1487 tp = aarch64_load_tp (NULL);
1489 if (mode != Pmode)
1490 tp = gen_lowpart (mode, tp);
1492 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1493 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1494 return;
1497 case SYMBOL_SMALL_TLSIE:
1499 /* In ILP32, the mode of dest can be either SImode or DImode,
1500 while the GOT entry is always of SImode size. The mode of
1501 dest depends on how dest is used: if dest is assigned to a
1502 pointer (e.g. stored in memory), it has SImode; it may have
1503 DImode if dest is dereferenced to access memory.
1504 This is why we have to handle three different tlsie_small
1505 patterns here (two patterns for ILP32). */
1506 machine_mode mode = GET_MODE (dest);
1507 rtx tmp_reg = gen_reg_rtx (mode);
1508 rtx tp = aarch64_load_tp (NULL);
1510 if (mode == ptr_mode)
1512 if (mode == DImode)
1513 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1514 else
1516 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1517 tp = gen_lowpart (mode, tp);
1520 else
1522 gcc_assert (mode == Pmode);
1523 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1526 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1527 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1528 return;
1531 case SYMBOL_TLSLE12:
1532 case SYMBOL_TLSLE24:
1533 case SYMBOL_TLSLE32:
1534 case SYMBOL_TLSLE48:
1536 machine_mode mode = GET_MODE (dest);
1537 rtx tp = aarch64_load_tp (NULL);
1539 if (mode != Pmode)
1540 tp = gen_lowpart (mode, tp);
1542 switch (type)
1544 case SYMBOL_TLSLE12:
1545 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1546 (dest, tp, imm));
1547 break;
1548 case SYMBOL_TLSLE24:
1549 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1550 (dest, tp, imm));
1551 break;
1552 case SYMBOL_TLSLE32:
1553 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1554 (dest, imm));
1555 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1556 (dest, dest, tp));
1557 break;
1558 case SYMBOL_TLSLE48:
1559 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1560 (dest, imm));
1561 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1562 (dest, dest, tp));
1563 break;
1564 default:
1565 gcc_unreachable ();
1568 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1569 return;
1572 case SYMBOL_TINY_GOT:
1573 emit_insn (gen_ldr_got_tiny (dest, imm));
1574 return;
1576 case SYMBOL_TINY_TLSIE:
1578 machine_mode mode = GET_MODE (dest);
1579 rtx tp = aarch64_load_tp (NULL);
1581 if (mode == ptr_mode)
1583 if (mode == DImode)
1584 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1585 else
1587 tp = gen_lowpart (mode, tp);
1588 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1591 else
1593 gcc_assert (mode == Pmode);
1594 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1597 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1598 return;
1601 default:
1602 gcc_unreachable ();
1606 /* Emit a move from SRC to DEST. Assume that the move expanders can
1607 handle all moves if !can_create_pseudo_p (). The distinction is
1608 important because, unlike emit_move_insn, the move expanders know
1609 how to force Pmode objects into the constant pool even when the
1610 constant pool address is not itself legitimate. */
1611 static rtx
1612 aarch64_emit_move (rtx dest, rtx src)
1614 return (can_create_pseudo_p ()
1615 ? emit_move_insn (dest, src)
1616 : emit_move_insn_1 (dest, src));
1619 /* Split a 128-bit move operation into two 64-bit move operations,
1620 taking care to handle partial overlap of register to register
1621 copies. Special cases are needed when moving between GP regs and
1622 FP regs. SRC can be a register, constant or memory; DST a register
1623 or memory. If either operand is memory it must not have any side
1624 effects. */
1625 void
1626 aarch64_split_128bit_move (rtx dst, rtx src)
1628 rtx dst_lo, dst_hi;
1629 rtx src_lo, src_hi;
1631 machine_mode mode = GET_MODE (dst);
1633 gcc_assert (mode == TImode || mode == TFmode);
1634 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1635 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1637 if (REG_P (dst) && REG_P (src))
1639 int src_regno = REGNO (src);
1640 int dst_regno = REGNO (dst);
1642 /* Handle FP <-> GP regs. */
1643 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1645 src_lo = gen_lowpart (word_mode, src);
1646 src_hi = gen_highpart (word_mode, src);
1648 if (mode == TImode)
1650 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1651 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1653 else
1655 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1656 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1658 return;
1660 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1662 dst_lo = gen_lowpart (word_mode, dst);
1663 dst_hi = gen_highpart (word_mode, dst);
1665 if (mode == TImode)
1667 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1668 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1670 else
1672 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1673 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1675 return;
1679 dst_lo = gen_lowpart (word_mode, dst);
1680 dst_hi = gen_highpart (word_mode, dst);
1681 src_lo = gen_lowpart (word_mode, src);
1682 src_hi = gen_highpart_mode (word_mode, mode, src);
1684 /* At most one pairing may overlap. */
1685 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1687 aarch64_emit_move (dst_hi, src_hi);
1688 aarch64_emit_move (dst_lo, src_lo);
1690 else
1692 aarch64_emit_move (dst_lo, src_lo);
1693 aarch64_emit_move (dst_hi, src_hi);
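/* Return true if a 128-bit move from SRC to DST needs to be split into two
   64-bit moves; a single move suffices only when both operands are FP
   registers.  */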
1697 bool
1698 aarch64_split_128bit_move_p (rtx dst, rtx src)
1700 return (! REG_P (src)
1701 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1704 /* Split a complex SIMD combine. */
1706 void
1707 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1709 machine_mode src_mode = GET_MODE (src1);
1710 machine_mode dst_mode = GET_MODE (dst);
1712 gcc_assert (VECTOR_MODE_P (dst_mode));
1713 gcc_assert (register_operand (dst, dst_mode)
1714 && register_operand (src1, src_mode)
1715 && register_operand (src2, src_mode));
1717 rtx (*gen) (rtx, rtx, rtx);
1719 switch (src_mode)
1721 case E_V8QImode:
1722 gen = gen_aarch64_simd_combinev8qi;
1723 break;
1724 case E_V4HImode:
1725 gen = gen_aarch64_simd_combinev4hi;
1726 break;
1727 case E_V2SImode:
1728 gen = gen_aarch64_simd_combinev2si;
1729 break;
1730 case E_V4HFmode:
1731 gen = gen_aarch64_simd_combinev4hf;
1732 break;
1733 case E_V2SFmode:
1734 gen = gen_aarch64_simd_combinev2sf;
1735 break;
1736 case E_DImode:
1737 gen = gen_aarch64_simd_combinedi;
1738 break;
1739 case E_DFmode:
1740 gen = gen_aarch64_simd_combinedf;
1741 break;
1742 default:
1743 gcc_unreachable ();
1746 emit_insn (gen (dst, src1, src2));
1747 return;
1750 /* Split a complex SIMD move. */
1752 void
1753 aarch64_split_simd_move (rtx dst, rtx src)
1755 machine_mode src_mode = GET_MODE (src);
1756 machine_mode dst_mode = GET_MODE (dst);
1758 gcc_assert (VECTOR_MODE_P (dst_mode));
1760 if (REG_P (dst) && REG_P (src))
1762 rtx (*gen) (rtx, rtx);
1764 gcc_assert (VECTOR_MODE_P (src_mode));
1766 switch (src_mode)
1768 case E_V16QImode:
1769 gen = gen_aarch64_split_simd_movv16qi;
1770 break;
1771 case E_V8HImode:
1772 gen = gen_aarch64_split_simd_movv8hi;
1773 break;
1774 case E_V4SImode:
1775 gen = gen_aarch64_split_simd_movv4si;
1776 break;
1777 case E_V2DImode:
1778 gen = gen_aarch64_split_simd_movv2di;
1779 break;
1780 case E_V8HFmode:
1781 gen = gen_aarch64_split_simd_movv8hf;
1782 break;
1783 case E_V4SFmode:
1784 gen = gen_aarch64_split_simd_movv4sf;
1785 break;
1786 case E_V2DFmode:
1787 gen = gen_aarch64_split_simd_movv2df;
1788 break;
1789 default:
1790 gcc_unreachable ();
1793 emit_insn (gen (dst, src));
1794 return;
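/* Return true if constant X (in XMODE) equals constant Y (in YMODE)
   zero-extended to XMODE.  */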
1798 bool
1799 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1800 machine_mode ymode, rtx y)
1802 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1803 gcc_assert (r != NULL);
1804 return rtx_equal_p (x, r);
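/* Force VALUE into a register of MODE, using a fresh pseudo when possible
   and otherwise moving it into the existing register X.  */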
1808 static rtx
1809 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1811 if (can_create_pseudo_p ())
1812 return force_reg (mode, value);
1813 else
1815 x = aarch64_emit_move (x, value);
1816 return x;
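/* Return an rtx representing REG plus OFFSET in MODE.  If OFFSET is not a
   valid addition immediate, it is first loaded into a temporary, using TEMP
   as scratch when new pseudos cannot be created.  */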
1821 static rtx
1822 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1823 HOST_WIDE_INT offset)
1825 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1827 rtx high;
1828 /* Load the full offset into a register. This
1829 might be improvable in the future. */
1830 high = GEN_INT (offset);
1831 offset = 0;
1832 high = aarch64_force_temporary (mode, temp, high);
1833 reg = aarch64_force_temporary (mode, temp,
1834 gen_rtx_PLUS (mode, high, reg));
1836 return plus_constant (mode, reg, offset);
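/* Set DEST to the immediate value IMM in MODE, emitting instructions only
   when GENERATE is true.  Return the number of instructions required.  */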
1839 static int
1840 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1841 scalar_int_mode mode)
1843 int i;
1844 unsigned HOST_WIDE_INT val, val2, mask;
1845 int one_match, zero_match;
1846 int num_insns;
1848 val = INTVAL (imm);
1850 if (aarch64_move_imm (val, mode))
1852 if (generate)
1853 emit_insn (gen_rtx_SET (dest, imm));
1854 return 1;
1857 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1858 (with XXXX non-zero). In that case check to see if the move can be done in
1859 a smaller mode. */
1860 val2 = val & 0xffffffff;
1861 if (mode == DImode
1862 && aarch64_move_imm (val2, SImode)
1863 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1865 if (generate)
1866 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1868 /* Check if we have to emit a second instruction by checking to see
1869 if any of the upper 32 bits of the original DI mode value is set. */
1870 if (val == val2)
1871 return 1;
1873 i = (val >> 48) ? 48 : 32;
1875 if (generate)
1876 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1877 GEN_INT ((val >> i) & 0xffff)));
1879 return 2;
1882 if ((val >> 32) == 0 || mode == SImode)
1884 if (generate)
1886 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1887 if (mode == SImode)
1888 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1889 GEN_INT ((val >> 16) & 0xffff)));
1890 else
1891 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1892 GEN_INT ((val >> 16) & 0xffff)));
1894 return 2;
1897 /* Remaining cases are all for DImode. */
1899 mask = 0xffff;
1900 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1901 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1902 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1903 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1905 if (zero_match != 2 && one_match != 2)
1907 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1908 For a 64-bit bitmask try whether changing 16 bits to all ones or
1909 zeroes creates a valid bitmask. To check any repeated bitmask,
1910 try using 16 bits from the other 32-bit half of val. */
1912 for (i = 0; i < 64; i += 16, mask <<= 16)
1914 val2 = val & ~mask;
1915 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1916 break;
1917 val2 = val | mask;
1918 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1919 break;
1920 val2 = val2 & ~mask;
1921 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1922 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1923 break;
1925 if (i != 64)
1927 if (generate)
1929 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1930 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1931 GEN_INT ((val >> i) & 0xffff)));
1933 return 2;
1937 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1938 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1939 otherwise skip zero bits. */
1941 num_insns = 1;
1942 mask = 0xffff;
1943 val2 = one_match > zero_match ? ~val : val;
1944 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1946 if (generate)
1947 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1948 ? (val | ~(mask << i))
1949 : (val & (mask << i)))));
1950 for (i += 16; i < 64; i += 16)
1952 if ((val2 & (mask << i)) == 0)
1953 continue;
1954 if (generate)
1955 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1956 GEN_INT ((val >> i) & 0xffff)));
1957 num_insns ++;
1960 return num_insns;
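/* Expand a move of the general constant IMM into DEST (SImode or DImode).
   Symbolic constants are classified and loaded through the appropriate GOT,
   TLS or literal-pool sequence; integer constants are handled by
   aarch64_internal_mov_immediate.  */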
1964 void
1965 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1967 machine_mode mode = GET_MODE (dest);
1969 gcc_assert (mode == SImode || mode == DImode);
1971 /* Check on what type of symbol it is. */
1972 scalar_int_mode int_mode;
1973 if ((GET_CODE (imm) == SYMBOL_REF
1974 || GET_CODE (imm) == LABEL_REF
1975 || GET_CODE (imm) == CONST)
1976 && is_a <scalar_int_mode> (mode, &int_mode))
1978 rtx mem, base, offset;
1979 enum aarch64_symbol_type sty;
1981 /* If we have (const (plus symbol offset)), separate out the offset
1982 before we start classifying the symbol. */
1983 split_const (imm, &base, &offset);
1985 sty = aarch64_classify_symbol (base, offset);
1986 switch (sty)
1988 case SYMBOL_FORCE_TO_MEM:
1989 if (offset != const0_rtx
1990 && targetm.cannot_force_const_mem (int_mode, imm))
1992 gcc_assert (can_create_pseudo_p ());
1993 base = aarch64_force_temporary (int_mode, dest, base);
1994 base = aarch64_add_offset (int_mode, NULL, base,
1995 INTVAL (offset));
1996 aarch64_emit_move (dest, base);
1997 return;
2000 mem = force_const_mem (ptr_mode, imm);
2001 gcc_assert (mem);
2003 /* If we aren't generating PC relative literals, then
2004 we need to expand the literal pool access carefully.
2005 This is something that needs to be done in a number
2006 of places, so could well live as a separate function. */
2007 if (!aarch64_pcrelative_literal_loads)
2009 gcc_assert (can_create_pseudo_p ());
2010 base = gen_reg_rtx (ptr_mode);
2011 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2012 if (ptr_mode != Pmode)
2013 base = convert_memory_address (Pmode, base);
2014 mem = gen_rtx_MEM (ptr_mode, base);
2017 if (int_mode != ptr_mode)
2018 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2020 emit_insn (gen_rtx_SET (dest, mem));
2022 return;
2024 case SYMBOL_SMALL_TLSGD:
2025 case SYMBOL_SMALL_TLSDESC:
2026 case SYMBOL_SMALL_TLSIE:
2027 case SYMBOL_SMALL_GOT_28K:
2028 case SYMBOL_SMALL_GOT_4G:
2029 case SYMBOL_TINY_GOT:
2030 case SYMBOL_TINY_TLSIE:
2031 if (offset != const0_rtx)
2033 gcc_assert (can_create_pseudo_p ());
2034 base = aarch64_force_temporary (int_mode, dest, base);
2035 base = aarch64_add_offset (int_mode, NULL, base,
2036 INTVAL (offset));
2037 aarch64_emit_move (dest, base);
2038 return;
2040 /* FALLTHRU */
2042 case SYMBOL_SMALL_ABSOLUTE:
2043 case SYMBOL_TINY_ABSOLUTE:
2044 case SYMBOL_TLSLE12:
2045 case SYMBOL_TLSLE24:
2046 case SYMBOL_TLSLE32:
2047 case SYMBOL_TLSLE48:
2048 aarch64_load_symref_appropriately (dest, imm, sty);
2049 return;
2051 default:
2052 gcc_unreachable ();
2056 if (!CONST_INT_P (imm))
2058 if (GET_CODE (imm) == HIGH)
2059 emit_insn (gen_rtx_SET (dest, imm));
2060 else
2062 rtx mem = force_const_mem (mode, imm);
2063 gcc_assert (mem);
2064 emit_insn (gen_rtx_SET (dest, mem));
2067 return;
2070 aarch64_internal_mov_immediate (dest, imm, true,
2071 as_a <scalar_int_mode> (mode));
2074 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2075 temporary value if necessary. FRAME_RELATED_P should be true if
2076 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2077 to the generated instructions. If SCRATCHREG is known to hold
2078 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2079 immediate again.
2081 Since this function may be used to adjust the stack pointer, we must
2082 ensure that it cannot cause transient stack deallocation (for example
2083 by first incrementing SP and then decrementing when adjusting by a
2084 large immediate). */
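/* Illustrative expansions (the delta values are examples only):
     delta == 0x3000    ->  a single "add reg, reg, #0x3000" (a shifted
                            12-bit immediate);
     delta == 0x123456  ->  "add reg, reg, #0x456" followed by
                            "add reg, reg, #0x123000";
     delta == 0x1234567 ->  a move immediate into SCRATCHREG (expanded by
                            aarch64_internal_mov_immediate) followed by
                            "add reg, reg, scratch".  */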
2086 static void
2087 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2088 int scratchreg, HOST_WIDE_INT delta,
2089 bool frame_related_p, bool emit_move_imm)
2091 HOST_WIDE_INT mdelta = abs_hwi (delta);
2092 rtx this_rtx = gen_rtx_REG (mode, regnum);
2093 rtx_insn *insn;
2095 if (!mdelta)
2096 return;
2098 /* Single instruction adjustment. */
2099 if (aarch64_uimm12_shift (mdelta))
2101 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2102 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2103 return;
2106 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2107 Only do this if mdelta is not representable as a move immediate;
2108 when it is, adjusting with a move followed by an add/sub is better. */
2109 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2111 HOST_WIDE_INT low_off = mdelta & 0xfff;
2113 low_off = delta < 0 ? -low_off : low_off;
2114 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2115 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2116 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2117 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2118 return;
2121 /* Emit a move immediate if required and an addition/subtraction. */
2122 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2123 if (emit_move_imm)
2124 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2125 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2126 : gen_add2_insn (this_rtx, scratch_rtx));
2127 if (frame_related_p)
2129 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2130 rtx adj = plus_constant (mode, this_rtx, delta);
2131 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2135 static inline void
2136 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2137 HOST_WIDE_INT delta)
2139 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2142 static inline void
2143 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2145 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2146 true, emit_move_imm);
2149 static inline void
2150 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2152 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2153 frame_related_p, true);
2156 static bool
2157 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2158 tree exp ATTRIBUTE_UNUSED)
2160 /* Currently, always true. */
2161 return true;
2164 /* Implement TARGET_PASS_BY_REFERENCE. */
2166 static bool
2167 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2168 machine_mode mode,
2169 const_tree type,
2170 bool named ATTRIBUTE_UNUSED)
2172 HOST_WIDE_INT size;
2173 machine_mode dummymode;
2174 int nregs;
2176 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2177 size = (mode == BLKmode && type)
2178 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2180 /* Aggregates are passed by reference based on their size. */
2181 if (type && AGGREGATE_TYPE_P (type))
2183 size = int_size_in_bytes (type);
2186 /* Variable-sized arguments are always passed by reference. */
2187 if (size < 0)
2188 return true;
2190 /* Can this be a candidate to be passed in fp/simd register(s)? */
2191 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2192 &dummymode, &nregs,
2193 NULL))
2194 return false;
2196 /* Arguments which are variable-sized or larger than 2 registers are
2197 passed by reference unless they are a homogeneous floating-point
2198 aggregate. */
2199 return size > 2 * UNITS_PER_WORD;
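/* Concrete cases under these rules, following AAPCS64 (examples only):
     struct { int a, b, c; }    12 bytes, passed by value in registers;
     struct { double d[4]; }    an HFA, passed by value in SIMD/FP registers;
     struct { long x[3]; }      24 bytes and not an HFA/HVA, so the caller
                                makes a copy and passes a pointer to it.  */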
2202 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2203 static bool
2204 aarch64_return_in_msb (const_tree valtype)
2206 machine_mode dummy_mode;
2207 int dummy_int;
2209 /* Never happens in little-endian mode. */
2210 if (!BYTES_BIG_ENDIAN)
2211 return false;
2213 /* Only composite types smaller than or equal to 16 bytes can
2214 be potentially returned in registers. */
2215 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2216 || int_size_in_bytes (valtype) <= 0
2217 || int_size_in_bytes (valtype) > 16)
2218 return false;
2220 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2221 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2222 is always passed/returned in the least significant bits of fp/simd
2223 register(s). */
2224 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2225 &dummy_mode, &dummy_int, NULL))
2226 return false;
2228 return true;
2231 /* Implement TARGET_FUNCTION_VALUE.
2232 Define how to find the value returned by a function. */
2234 static rtx
2235 aarch64_function_value (const_tree type, const_tree func,
2236 bool outgoing ATTRIBUTE_UNUSED)
2238 machine_mode mode;
2239 int unsignedp;
2240 int count;
2241 machine_mode ag_mode;
2243 mode = TYPE_MODE (type);
2244 if (INTEGRAL_TYPE_P (type))
2245 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2247 if (aarch64_return_in_msb (type))
2249 HOST_WIDE_INT size = int_size_in_bytes (type);
2251 if (size % UNITS_PER_WORD != 0)
2253 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2254 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2258 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2259 &ag_mode, &count, NULL))
2261 if (!aarch64_composite_type_p (type, mode))
2263 gcc_assert (count == 1 && mode == ag_mode);
2264 return gen_rtx_REG (mode, V0_REGNUM);
2266 else
2268 int i;
2269 rtx par;
2271 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2272 for (i = 0; i < count; i++)
2274 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2275 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2276 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2277 XVECEXP (par, 0, i) = tmp;
2279 return par;
2282 else
2283 return gen_rtx_REG (mode, R0_REGNUM);
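/* For example, an HFA of two doubles is returned, schematically, as
     (parallel [(expr_list (reg:DF v0) (const_int 0))
                (expr_list (reg:DF v1) (const_int 8))])
   mirroring the loop above, while a plain "long" comes back directly
   as (reg:DI x0).  */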
2286 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2287 Return true if REGNO is the number of a hard register in which the values
2288 of called function may come back. */
2290 static bool
2291 aarch64_function_value_regno_p (const unsigned int regno)
2293 /* A maximum of 16 bytes can be returned in the general registers. Examples
2294 of 16-byte return values are: 128-bit integers and 16-byte small
2295 structures (excluding homogeneous floating-point aggregates). */
2296 if (regno == R0_REGNUM || regno == R1_REGNUM)
2297 return true;
2299 /* Up to four fp/simd registers can return a function value, e.g. a
2300 homogeneous floating-point aggregate having four members. */
2301 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2302 return TARGET_FLOAT;
2304 return false;
2307 /* Implement TARGET_RETURN_IN_MEMORY.
2309 If the type T of the result of a function is such that
2310 void func (T arg)
2311 would require that arg be passed as a value in a register (or set of
2312 registers) according to the parameter passing rules, then the result
2313 is returned in the same registers as would be used for such an
2314 argument. */
2316 static bool
2317 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2319 HOST_WIDE_INT size;
2320 machine_mode ag_mode;
2321 int count;
2323 if (!AGGREGATE_TYPE_P (type)
2324 && TREE_CODE (type) != COMPLEX_TYPE
2325 && TREE_CODE (type) != VECTOR_TYPE)
2326 /* Simple scalar types are always returned in registers. */
2327 return false;
2329 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2330 type,
2331 &ag_mode,
2332 &count,
2333 NULL))
2334 return false;
2336 /* Types larger than 2 registers are returned in memory. */
2337 size = int_size_in_bytes (type);
2338 return (size < 0 || size > 2 * UNITS_PER_WORD);
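/* For example (per AAPCS64): __int128 and a 16-byte plain struct come back
   in x0/x1, struct { double a, b, c, d; } is an HFA returned in d0-d3,
   while a 24-byte non-HFA struct is returned in memory through the x8
   indirect-result pointer.  */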
2341 static bool
2342 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2343 const_tree type, int *nregs)
2345 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2346 return aarch64_vfp_is_call_or_return_candidate (mode,
2347 type,
2348 &pcum->aapcs_vfp_rmode,
2349 nregs,
2350 NULL);
2353 /* Given MODE and TYPE of a function argument, return the alignment in
2354 bits. The idea is to suppress any stronger alignment requested by
2355 the user and opt for the natural alignment (specified in AAPCS64 section 4.1).
2356 This is a helper function for local use only. */
2358 static unsigned int
2359 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2361 if (!type)
2362 return GET_MODE_ALIGNMENT (mode);
2364 if (integer_zerop (TYPE_SIZE (type)))
2365 return 0;
2367 gcc_assert (TYPE_MODE (type) == mode);
2369 if (!AGGREGATE_TYPE_P (type))
2370 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2372 if (TREE_CODE (type) == ARRAY_TYPE)
2373 return TYPE_ALIGN (TREE_TYPE (type));
2375 unsigned int alignment = 0;
2376 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2377 if (TREE_CODE (field) == FIELD_DECL)
2378 alignment = std::max (alignment, DECL_ALIGN (field));
2380 return alignment;
2383 /* Layout a function argument according to the AAPCS64 rules. The rule
2384 numbers refer to the rule numbers in the AAPCS64. */
2386 static void
2387 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2388 const_tree type,
2389 bool named ATTRIBUTE_UNUSED)
2391 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2392 int ncrn, nvrn, nregs;
2393 bool allocate_ncrn, allocate_nvrn;
2394 HOST_WIDE_INT size;
2396 /* We need to do this once per argument. */
2397 if (pcum->aapcs_arg_processed)
2398 return;
2400 pcum->aapcs_arg_processed = true;
2402 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2403 size
2404 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2405 UNITS_PER_WORD);
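/* For instance, a 10-byte structure rounds up to size == 16 here and so
   occupies nregs == 2 general registers (or two stack words) below.  */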
2407 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2408 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2409 mode,
2410 type,
2411 &nregs);
2413 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2414 The following code thus handles passing by SIMD/FP registers first. */
2416 nvrn = pcum->aapcs_nvrn;
2418 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2419 and homogeneous short-vector aggregates (HVA). */
2420 if (allocate_nvrn)
2422 if (!TARGET_FLOAT)
2423 aarch64_err_no_fpadvsimd (mode, "argument");
2425 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2427 pcum->aapcs_nextnvrn = nvrn + nregs;
2428 if (!aarch64_composite_type_p (type, mode))
2430 gcc_assert (nregs == 1);
2431 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2433 else
2435 rtx par;
2436 int i;
2437 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2438 for (i = 0; i < nregs; i++)
2440 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2441 V0_REGNUM + nvrn + i);
2442 tmp = gen_rtx_EXPR_LIST
2443 (VOIDmode, tmp,
2444 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2445 XVECEXP (par, 0, i) = tmp;
2447 pcum->aapcs_reg = par;
2449 return;
2451 else
2453 /* C.3 NSRN is set to 8. */
2454 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2455 goto on_stack;
2459 ncrn = pcum->aapcs_ncrn;
2460 nregs = size / UNITS_PER_WORD;
2462 /* C6 - C9, though the sign and zero extension semantics are
2463 handled elsewhere. This is the case where the argument fits
2464 entirely in general registers. */
2465 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2468 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2470 /* C.8 if the argument has an alignment of 16 then the NGRN is
2471 rounded up to the next even number. */
2472 if (nregs == 2
2473 && ncrn % 2
2474 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2475 comparison is there because for > 16 * BITS_PER_UNIT
2476 alignment nregs should be > 2 and therefore it should be
2477 passed by reference rather than value. */
2478 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2480 ++ncrn;
2481 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2484 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2485 A reg is still generated for it, but the caller should be smart
2486 enough not to use it. */
2487 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2488 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2489 else
2491 rtx par;
2492 int i;
2494 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2495 for (i = 0; i < nregs; i++)
2497 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2498 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2499 GEN_INT (i * UNITS_PER_WORD));
2500 XVECEXP (par, 0, i) = tmp;
2502 pcum->aapcs_reg = par;
2505 pcum->aapcs_nextncrn = ncrn + nregs;
2506 return;
2509 /* C.11 */
2510 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2512 /* The argument is passed on the stack; record the needed number of words for
2513 this argument and align the total size if necessary. */
2514 on_stack:
2515 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2517 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2518 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2519 16 / UNITS_PER_WORD);
2520 return;
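/* Rule C.8 in practice (illustrative): an __int128 argument has 16-byte
   natural alignment, so if the next general register would be an odd one
   (say x1), NGRN is bumped and the value is passed in the even/odd pair
   x2/x3, leaving x1 unused for that call.  */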
2523 /* Implement TARGET_FUNCTION_ARG. */
2525 static rtx
2526 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2527 const_tree type, bool named)
2529 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2530 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2532 if (mode == VOIDmode)
2533 return NULL_RTX;
2535 aarch64_layout_arg (pcum_v, mode, type, named);
2536 return pcum->aapcs_reg;
2539 void
2540 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2541 const_tree fntype ATTRIBUTE_UNUSED,
2542 rtx libname ATTRIBUTE_UNUSED,
2543 const_tree fndecl ATTRIBUTE_UNUSED,
2544 unsigned n_named ATTRIBUTE_UNUSED)
2546 pcum->aapcs_ncrn = 0;
2547 pcum->aapcs_nvrn = 0;
2548 pcum->aapcs_nextncrn = 0;
2549 pcum->aapcs_nextnvrn = 0;
2550 pcum->pcs_variant = ARM_PCS_AAPCS64;
2551 pcum->aapcs_reg = NULL_RTX;
2552 pcum->aapcs_arg_processed = false;
2553 pcum->aapcs_stack_words = 0;
2554 pcum->aapcs_stack_size = 0;
2556 if (!TARGET_FLOAT
2557 && fndecl && TREE_PUBLIC (fndecl)
2558 && fntype && fntype != error_mark_node)
2560 const_tree type = TREE_TYPE (fntype);
2561 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2562 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2563 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2564 &mode, &nregs, NULL))
2565 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2567 return;
2570 static void
2571 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2572 machine_mode mode,
2573 const_tree type,
2574 bool named)
2576 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2577 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2579 aarch64_layout_arg (pcum_v, mode, type, named);
2580 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2581 != (pcum->aapcs_stack_words != 0));
2582 pcum->aapcs_arg_processed = false;
2583 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2584 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2585 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2586 pcum->aapcs_stack_words = 0;
2587 pcum->aapcs_reg = NULL_RTX;
2591 bool
2592 aarch64_function_arg_regno_p (unsigned regno)
2594 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2595 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2598 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2599 PARM_BOUNDARY bits of alignment, but will be given anything up
2600 to STACK_BOUNDARY bits if the type requires it. This makes sure
2601 that both before and after the layout of each argument, the Next
2602 Stacked Argument Address (NSAA) will have a minimum alignment of
2603 8 bytes. */
2605 static unsigned int
2606 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2608 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2609 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
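/* On AArch64 PARM_BOUNDARY is 64 and STACK_BOUNDARY is 128 (see aarch64.h),
   so every stack argument slot gets at least 8-byte and at most 16-byte
   alignment: a packed, 1-byte-aligned struct is still placed on an 8-byte
   boundary, while an over-aligned 32-byte type is clamped to 16 bytes.  */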
2612 /* Implement TARGET_FUNCTION_ARG_PADDING.
2614 Small aggregate types are placed at the lowest memory address.
2616 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2618 static pad_direction
2619 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2621 /* On little-endian targets, the least significant byte of every stack
2622 argument is passed at the lowest byte address of the stack slot. */
2623 if (!BYTES_BIG_ENDIAN)
2624 return PAD_UPWARD;
2626 /* Otherwise, integral, floating-point and pointer types are padded downward:
2627 the least significant byte of a stack argument is passed at the highest
2628 byte address of the stack slot. */
2629 if (type
2630 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2631 || POINTER_TYPE_P (type))
2632 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2633 return PAD_DOWNWARD;
2635 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2636 return PAD_UPWARD;
2639 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2641 It specifies the padding for the last (and possibly only)
2642 element of a block move between registers and memory.
2643 Assuming the block is in memory, padding upward means that
2644 the last element is padded after its most significant byte,
2645 while with downward padding the last element is padded on
2646 its least-significant-byte side.
2648 Small aggregates and small complex types are always padded
2649 upwards.
2651 We don't need to worry about homogeneous floating-point or
2652 short-vector aggregates; their move is not affected by the
2653 padding direction determined here. Regardless of endianness,
2654 each element of such an aggregate is put in the least
2655 significant bits of a fp/simd register.
2657 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2658 register has useful data, and return the opposite if the most
2659 significant byte does. */
2661 bool
2662 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2663 bool first ATTRIBUTE_UNUSED)
2666 /* Small composite types are always padded upward. */
2667 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2669 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2670 : GET_MODE_SIZE (mode));
2671 if (size < 2 * UNITS_PER_WORD)
2672 return true;
2675 /* Otherwise, use the default padding. */
2676 return !BYTES_BIG_ENDIAN;
2679 static scalar_int_mode
2680 aarch64_libgcc_cmp_return_mode (void)
2682 return SImode;
2685 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2687 /* We use the 12-bit shifted immediate arithmetic instructions so values
2688 must be multiple of (1 << 12), i.e. 4096. */
2689 #define ARITH_FACTOR 4096
2691 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2692 #error Cannot use simple address calculation for stack probing
2693 #endif
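/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12, PROBE_INTERVAL is
   (1 << 12) == 4096, i.e. one probe per 4 KiB page, which trivially
   satisfies the divisibility check above.  */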
2695 /* The pair of scratch registers used for stack probing. */
2696 #define PROBE_STACK_FIRST_REG 9
2697 #define PROBE_STACK_SECOND_REG 10
2699 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2700 inclusive. These are offsets from the current stack pointer. */
2702 static void
2703 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2705 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2707 /* See the same assertion on PROBE_INTERVAL above. */
2708 gcc_assert ((first % ARITH_FACTOR) == 0);
2710 /* See if we have a constant small number of probes to generate. If so,
2711 that's the easy case. */
2712 if (size <= PROBE_INTERVAL)
2714 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2716 emit_set_insn (reg1,
2717 plus_constant (Pmode,
2718 stack_pointer_rtx, -(first + base)));
2719 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2722 /* The run-time loop is made up of 8 insns in the generic case while the
2723 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2724 else if (size <= 4 * PROBE_INTERVAL)
2726 HOST_WIDE_INT i, rem;
2728 emit_set_insn (reg1,
2729 plus_constant (Pmode,
2730 stack_pointer_rtx,
2731 -(first + PROBE_INTERVAL)));
2732 emit_stack_probe (reg1);
2734 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2735 it exceeds SIZE. If only two probes are needed, this will not
2736 generate any code. Then probe at FIRST + SIZE. */
2737 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2739 emit_set_insn (reg1,
2740 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2741 emit_stack_probe (reg1);
2744 rem = size - (i - PROBE_INTERVAL);
2745 if (rem > 256)
2747 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2749 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2750 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2752 else
2753 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2756 /* Otherwise, do the same as above, but in a loop. Note that we must be
2757 extra careful with variables wrapping around because we might be at
2758 the very top (or the very bottom) of the address space and we have
2759 to be able to handle this case properly; in particular, we use an
2760 equality test for the loop condition. */
2761 else
2763 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2765 /* Step 1: round SIZE to the previous multiple of the interval. */
2767 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2770 /* Step 2: compute initial and final value of the loop counter. */
2772 /* TEST_ADDR = SP + FIRST. */
2773 emit_set_insn (reg1,
2774 plus_constant (Pmode, stack_pointer_rtx, -first));
2776 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2777 HOST_WIDE_INT adjustment = - (first + rounded_size);
2778 if (! aarch64_uimm12_shift (adjustment))
2780 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2781 true, Pmode);
2782 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2784 else
2786 emit_set_insn (reg2,
2787 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2790 /* Step 3: the loop
2794 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2795 probe at TEST_ADDR
2797 while (TEST_ADDR != LAST_ADDR)
2799 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2800 until it is equal to ROUNDED_SIZE. */
2802 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2805 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2806 that SIZE is equal to ROUNDED_SIZE. */
2808 if (size != rounded_size)
2810 HOST_WIDE_INT rem = size - rounded_size;
2812 if (rem > 256)
2814 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2816 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2817 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2819 else
2820 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2824 /* Make sure nothing is scheduled before we are done. */
2825 emit_insn (gen_blockage ());
2828 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2829 absolute addresses. */
2831 const char *
2832 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2834 static int labelno = 0;
2835 char loop_lab[32];
2836 rtx xops[2];
2838 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2840 /* Loop. */
2841 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2843 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2844 xops[0] = reg1;
2845 xops[1] = GEN_INT (PROBE_INTERVAL);
2846 output_asm_insn ("sub\t%0, %0, %1", xops);
2848 /* Probe at TEST_ADDR. */
2849 output_asm_insn ("str\txzr, [%0]", xops);
2851 /* Test if TEST_ADDR == LAST_ADDR. */
2852 xops[1] = reg2;
2853 output_asm_insn ("cmp\t%0, %1", xops);
2855 /* Branch. */
2856 fputs ("\tb.ne\t", asm_out_file);
2857 assemble_name_raw (asm_out_file, loop_lab);
2858 fputc ('\n', asm_out_file);
2860 return "";
2863 static bool
2864 aarch64_frame_pointer_required (void)
2866 /* In aarch64_override_options_after_change
2867 flag_omit_leaf_frame_pointer turns off the frame pointer by
2868 default. Turn it back on now if we've not got a leaf
2869 function. */
2870 if (flag_omit_leaf_frame_pointer
2871 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2872 return true;
2874 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2875 if (crtl->calls_eh_return)
2876 return true;
2878 return false;
2881 /* Mark the registers that need to be saved by the callee and calculate
2882 the size of the callee-saved registers area and frame record (both FP
2883 and LR may be omitted). */
2884 static void
2885 aarch64_layout_frame (void)
2887 HOST_WIDE_INT offset = 0;
2888 int regno, last_fp_reg = INVALID_REGNUM;
2890 if (reload_completed && cfun->machine->frame.laid_out)
2891 return;
2893 #define SLOT_NOT_REQUIRED (-2)
2894 #define SLOT_REQUIRED (-1)
2896 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2897 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2899 /* First mark all the registers that really need to be saved... */
2900 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2901 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2903 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2904 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2906 /* ... that includes the eh data registers (if needed)... */
2907 if (crtl->calls_eh_return)
2908 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2909 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2910 = SLOT_REQUIRED;
2912 /* ... and any callee saved register that dataflow says is live. */
2913 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2914 if (df_regs_ever_live_p (regno)
2915 && (regno == R30_REGNUM
2916 || !call_used_regs[regno]))
2917 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2919 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2920 if (df_regs_ever_live_p (regno)
2921 && !call_used_regs[regno])
2923 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2924 last_fp_reg = regno;
2927 if (frame_pointer_needed)
2929 /* FP and LR are placed in the linkage record. */
2930 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2931 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2932 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2933 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2934 offset += 2 * UNITS_PER_WORD;
2937 /* Now assign stack slots for them. */
2938 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2939 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2941 cfun->machine->frame.reg_offset[regno] = offset;
2942 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2943 cfun->machine->frame.wb_candidate1 = regno;
2944 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2945 cfun->machine->frame.wb_candidate2 = regno;
2946 offset += UNITS_PER_WORD;
2949 HOST_WIDE_INT max_int_offset = offset;
2950 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2951 bool has_align_gap = offset != max_int_offset;
2953 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2954 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2956 /* If there is an alignment gap between integer and fp callee-saves,
2957 allocate the last fp register to it if possible. */
2958 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2960 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2961 break;
2964 cfun->machine->frame.reg_offset[regno] = offset;
2965 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2966 cfun->machine->frame.wb_candidate1 = regno;
2967 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2968 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2969 cfun->machine->frame.wb_candidate2 = regno;
2970 offset += UNITS_PER_WORD;
2973 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2975 cfun->machine->frame.saved_regs_size = offset;
2977 HOST_WIDE_INT varargs_and_saved_regs_size
2978 = offset + cfun->machine->frame.saved_varargs_size;
2980 cfun->machine->frame.hard_fp_offset
2981 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2982 STACK_BOUNDARY / BITS_PER_UNIT);
2984 cfun->machine->frame.frame_size
2985 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2986 + crtl->outgoing_args_size,
2987 STACK_BOUNDARY / BITS_PER_UNIT);
2989 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2991 cfun->machine->frame.initial_adjust = 0;
2992 cfun->machine->frame.final_adjust = 0;
2993 cfun->machine->frame.callee_adjust = 0;
2994 cfun->machine->frame.callee_offset = 0;
2996 HOST_WIDE_INT max_push_offset = 0;
2997 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2998 max_push_offset = 512;
2999 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3000 max_push_offset = 256;
3002 if (cfun->machine->frame.frame_size < max_push_offset
3003 && crtl->outgoing_args_size == 0)
3005 /* Simple, small frame with no outgoing arguments:
3006 stp reg1, reg2, [sp, -frame_size]!
3007 stp reg3, reg4, [sp, 16] */
3008 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3010 else if ((crtl->outgoing_args_size
3011 + cfun->machine->frame.saved_regs_size < 512)
3012 && !(cfun->calls_alloca
3013 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3015 /* Frame with small outgoing arguments:
3016 sub sp, sp, frame_size
3017 stp reg1, reg2, [sp, outgoing_args_size]
3018 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3019 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3020 cfun->machine->frame.callee_offset
3021 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3023 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3025 /* Frame with large outgoing arguments but a small local area:
3026 stp reg1, reg2, [sp, -hard_fp_offset]!
3027 stp reg3, reg4, [sp, 16]
3028 sub sp, sp, outgoing_args_size */
3029 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3030 cfun->machine->frame.final_adjust
3031 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3033 else if (!frame_pointer_needed
3034 && varargs_and_saved_regs_size < max_push_offset)
3036 /* Frame with large local area and outgoing arguments (this pushes the
3037 callee-saves first, followed by the locals and outgoing area):
3038 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3039 stp reg3, reg4, [sp, 16]
3040 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3041 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3042 cfun->machine->frame.final_adjust
3043 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3044 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3045 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3047 else
3049 /* Frame with large local area and outgoing arguments using frame pointer:
3050 sub sp, sp, hard_fp_offset
3051 stp x29, x30, [sp, 0]
3052 add x29, sp, 0
3053 stp reg3, reg4, [sp, 16]
3054 sub sp, sp, outgoing_args_size */
3055 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3056 cfun->machine->frame.final_adjust
3057 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3060 cfun->machine->frame.laid_out = true;
3063 /* Return true if the register REGNO is saved on entry to
3064 the current function. */
3066 static bool
3067 aarch64_register_saved_on_entry (int regno)
3069 return cfun->machine->frame.reg_offset[regno] >= 0;
3072 /* Return the next register at or after REGNO, up to and including LIMIT,
3073 that the callee needs to save. */
3075 static unsigned
3076 aarch64_next_callee_save (unsigned regno, unsigned limit)
3078 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3079 regno ++;
3080 return regno;
3083 /* Push the register number REGNO of mode MODE to the stack with write-back
3084 adjusting the stack by ADJUSTMENT. */
3086 static void
3087 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3088 HOST_WIDE_INT adjustment)
3090 rtx base_rtx = stack_pointer_rtx;
3091 rtx insn, reg, mem;
3093 reg = gen_rtx_REG (mode, regno);
3094 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3095 plus_constant (Pmode, base_rtx, -adjustment));
3096 mem = gen_frame_mem (mode, mem);
3098 insn = emit_move_insn (mem, reg);
3099 RTX_FRAME_RELATED_P (insn) = 1;
3102 /* Generate and return an instruction to store the pair of registers
3103 REG and REG2 of mode MODE to location BASE with write-back adjusting
3104 the stack location BASE by ADJUSTMENT. */
3106 static rtx
3107 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3108 HOST_WIDE_INT adjustment)
3110 switch (mode)
3112 case E_DImode:
3113 return gen_storewb_pairdi_di (base, base, reg, reg2,
3114 GEN_INT (-adjustment),
3115 GEN_INT (UNITS_PER_WORD - adjustment));
3116 case E_DFmode:
3117 return gen_storewb_pairdf_di (base, base, reg, reg2,
3118 GEN_INT (-adjustment),
3119 GEN_INT (UNITS_PER_WORD - adjustment));
3120 default:
3121 gcc_unreachable ();
3125 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3126 stack pointer by ADJUSTMENT. */
3128 static void
3129 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3131 rtx_insn *insn;
3132 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3134 if (regno2 == INVALID_REGNUM)
3135 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3137 rtx reg1 = gen_rtx_REG (mode, regno1);
3138 rtx reg2 = gen_rtx_REG (mode, regno2);
3140 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3141 reg2, adjustment));
3142 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3143 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3144 RTX_FRAME_RELATED_P (insn) = 1;
3147 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
3148 adjusting it by ADJUSTMENT afterwards. */
3150 static rtx
3151 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3152 HOST_WIDE_INT adjustment)
3154 switch (mode)
3156 case E_DImode:
3157 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3158 GEN_INT (UNITS_PER_WORD));
3159 case E_DFmode:
3160 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3161 GEN_INT (UNITS_PER_WORD));
3162 default:
3163 gcc_unreachable ();
3167 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3168 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3169 into CFI_OPS. */
3171 static void
3172 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3173 rtx *cfi_ops)
3175 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3176 rtx reg1 = gen_rtx_REG (mode, regno1);
3178 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3180 if (regno2 == INVALID_REGNUM)
3182 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3183 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3184 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3186 else
3188 rtx reg2 = gen_rtx_REG (mode, regno2);
3189 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3190 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3191 reg2, adjustment));
3195 /* Generate and return a store pair instruction of mode MODE to store
3196 register REG1 to MEM1 and register REG2 to MEM2. */
3198 static rtx
3199 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3200 rtx reg2)
3202 switch (mode)
3204 case E_DImode:
3205 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3207 case E_DFmode:
3208 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3210 default:
3211 gcc_unreachable ();
3215 /* Generate and return a load pair instruction of mode MODE to load register
3216 REG1 from MEM1 and register REG2 from MEM2. */
3218 static rtx
3219 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3220 rtx mem2)
3222 switch (mode)
3224 case E_DImode:
3225 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3227 case E_DFmode:
3228 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3230 default:
3231 gcc_unreachable ();
3235 /* Return TRUE if return address signing should be enabled for the current
3236 function, otherwise return FALSE. */
3238 bool
3239 aarch64_return_address_signing_enabled (void)
3241 /* This function should only be called after the frame is laid out. */
3242 gcc_assert (cfun->machine->frame.laid_out);
3244 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
3245 function if its LR is pushed onto the stack. */
3246 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3247 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3248 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3251 /* Emit code to save the callee-saved registers from register number START
3252 to LIMIT to the stack at the location starting at offset START_OFFSET,
3253 skipping any write-back candidates if SKIP_WB is true. */
3255 static void
3256 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3257 unsigned start, unsigned limit, bool skip_wb)
3259 rtx_insn *insn;
3260 unsigned regno;
3261 unsigned regno2;
3263 for (regno = aarch64_next_callee_save (start, limit);
3264 regno <= limit;
3265 regno = aarch64_next_callee_save (regno + 1, limit))
3267 rtx reg, mem;
3268 HOST_WIDE_INT offset;
3270 if (skip_wb
3271 && (regno == cfun->machine->frame.wb_candidate1
3272 || regno == cfun->machine->frame.wb_candidate2))
3273 continue;
3275 if (cfun->machine->reg_is_wrapped_separately[regno])
3276 continue;
3278 reg = gen_rtx_REG (mode, regno);
3279 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3280 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3281 offset));
3283 regno2 = aarch64_next_callee_save (regno + 1, limit);
3285 if (regno2 <= limit
3286 && !cfun->machine->reg_is_wrapped_separately[regno2]
3287 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3288 == cfun->machine->frame.reg_offset[regno2]))
3291 rtx reg2 = gen_rtx_REG (mode, regno2);
3292 rtx mem2;
3294 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3295 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3296 offset));
3297 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3298 reg2));
3300 /* The first part of a frame-related parallel insn is
3301 always assumed to be relevant to the frame
3302 calculations; subsequent parts are only
3303 frame-related if explicitly marked. */
3304 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3305 regno = regno2;
3307 else
3308 insn = emit_move_insn (mem, reg);
3310 RTX_FRAME_RELATED_P (insn) = 1;
3314 /* Emit code to restore the callee-saved registers of mode MODE from register
3315 number START up to and including LIMIT. Restore from the stack offset
3316 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3317 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3319 static void
3320 aarch64_restore_callee_saves (machine_mode mode,
3321 HOST_WIDE_INT start_offset, unsigned start,
3322 unsigned limit, bool skip_wb, rtx *cfi_ops)
3324 rtx base_rtx = stack_pointer_rtx;
3325 unsigned regno;
3326 unsigned regno2;
3327 HOST_WIDE_INT offset;
3329 for (regno = aarch64_next_callee_save (start, limit);
3330 regno <= limit;
3331 regno = aarch64_next_callee_save (regno + 1, limit))
3333 if (cfun->machine->reg_is_wrapped_separately[regno])
3334 continue;
3336 rtx reg, mem;
3338 if (skip_wb
3339 && (regno == cfun->machine->frame.wb_candidate1
3340 || regno == cfun->machine->frame.wb_candidate2))
3341 continue;
3343 reg = gen_rtx_REG (mode, regno);
3344 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3345 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3347 regno2 = aarch64_next_callee_save (regno + 1, limit);
3349 if (regno2 <= limit
3350 && !cfun->machine->reg_is_wrapped_separately[regno2]
3351 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3352 == cfun->machine->frame.reg_offset[regno2]))
3354 rtx reg2 = gen_rtx_REG (mode, regno2);
3355 rtx mem2;
3357 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3358 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3359 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3361 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3362 regno = regno2;
3364 else
3365 emit_move_insn (reg, mem);
3366 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3370 static inline bool
3371 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3372 HOST_WIDE_INT offset)
3374 return offset >= -256 && offset < 256;
3377 static inline bool
3378 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3380 return (offset >= 0
3381 && offset < 4096 * GET_MODE_SIZE (mode)
3382 && offset % GET_MODE_SIZE (mode) == 0);
3385 bool
3386 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3388 return (offset >= -64 * GET_MODE_SIZE (mode)
3389 && offset < 64 * GET_MODE_SIZE (mode)
3390 && offset % GET_MODE_SIZE (mode) == 0);
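/* For 8-byte (DImode) accesses the three predicates above accept,
   respectively: unscaled signed offsets in [-256, 255] (LDUR/STUR);
   scaled unsigned offsets in [0, 32760] that are multiples of 8 (LDR/STR
   with a 12-bit immediate); and scaled signed offsets in [-512, 504] that
   are multiples of 8 (the LDP/STP range).  */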
3393 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3395 static sbitmap
3396 aarch64_get_separate_components (void)
3398 aarch64_layout_frame ();
3400 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3401 bitmap_clear (components);
3403 /* The registers we need saved to the frame. */
3404 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3405 if (aarch64_register_saved_on_entry (regno))
3407 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3408 if (!frame_pointer_needed)
3409 offset += cfun->machine->frame.frame_size
3410 - cfun->machine->frame.hard_fp_offset;
3411 /* Check that we can access the stack slot of the register with one
3412 direct load with no adjustments needed. */
3413 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3414 bitmap_set_bit (components, regno);
3417 /* Don't mess with the hard frame pointer. */
3418 if (frame_pointer_needed)
3419 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3421 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3422 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3423 /* If aarch64_layout_frame has chosen registers to store/restore with
3424 writeback, don't interfere with them, to avoid having to output explicit
3425 stack adjustment instructions. */
3426 if (reg2 != INVALID_REGNUM)
3427 bitmap_clear_bit (components, reg2);
3428 if (reg1 != INVALID_REGNUM)
3429 bitmap_clear_bit (components, reg1);
3431 bitmap_clear_bit (components, LR_REGNUM);
3432 bitmap_clear_bit (components, SP_REGNUM);
3434 return components;
3437 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3439 static sbitmap
3440 aarch64_components_for_bb (basic_block bb)
3442 bitmap in = DF_LIVE_IN (bb);
3443 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3444 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3446 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3447 bitmap_clear (components);
3449 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3450 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3451 if ((!call_used_regs[regno])
3452 && (bitmap_bit_p (in, regno)
3453 || bitmap_bit_p (gen, regno)
3454 || bitmap_bit_p (kill, regno)))
3455 bitmap_set_bit (components, regno);
3457 return components;
3460 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3461 Nothing to do for aarch64. */
3463 static void
3464 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3468 /* Return the next set bit in BMP from START onwards. Return the total number
3469 of bits in BMP if no set bit is found at or after START. */
3471 static unsigned int
3472 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3474 unsigned int nbits = SBITMAP_SIZE (bmp);
3475 if (start == nbits)
3476 return start;
3478 gcc_assert (start < nbits);
3479 for (unsigned int i = start; i < nbits; i++)
3480 if (bitmap_bit_p (bmp, i))
3481 return i;
3483 return nbits;
3486 /* Do the work for aarch64_emit_prologue_components and
3487 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3488 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3489 for these components or the epilogue sequence. That is, it determines
3490 whether we should emit stores or loads and what kind of CFA notes to attach
3491 to the insns. Otherwise the logic for the two sequences is very
3492 similar. */
3494 static void
3495 aarch64_process_components (sbitmap components, bool prologue_p)
3497 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3498 ? HARD_FRAME_POINTER_REGNUM
3499 : STACK_POINTER_REGNUM);
3501 unsigned last_regno = SBITMAP_SIZE (components);
3502 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3503 rtx_insn *insn = NULL;
3505 while (regno != last_regno)
3507 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
3508 so DFmode for the vector registers is enough. */
3509 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3510 rtx reg = gen_rtx_REG (mode, regno);
3511 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3512 if (!frame_pointer_needed)
3513 offset += cfun->machine->frame.frame_size
3514 - cfun->machine->frame.hard_fp_offset;
3515 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3516 rtx mem = gen_frame_mem (mode, addr);
3518 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3519 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3520 /* No more registers to handle after REGNO.
3521 Emit a single save/restore and exit. */
3522 if (regno2 == last_regno)
3524 insn = emit_insn (set);
3525 RTX_FRAME_RELATED_P (insn) = 1;
3526 if (prologue_p)
3527 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3528 else
3529 add_reg_note (insn, REG_CFA_RESTORE, reg);
3530 break;
3533 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3534 /* The next register is not of the same class or its offset is not
3535 mergeable with the current one into a pair. */
3536 if (!satisfies_constraint_Ump (mem)
3537 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3538 || (offset2 - cfun->machine->frame.reg_offset[regno])
3539 != GET_MODE_SIZE (mode))
3541 insn = emit_insn (set);
3542 RTX_FRAME_RELATED_P (insn) = 1;
3543 if (prologue_p)
3544 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3545 else
3546 add_reg_note (insn, REG_CFA_RESTORE, reg);
3548 regno = regno2;
3549 continue;
3552 /* REGNO2 can be saved/restored in a pair with REGNO. */
3553 rtx reg2 = gen_rtx_REG (mode, regno2);
3554 if (!frame_pointer_needed)
3555 offset2 += cfun->machine->frame.frame_size
3556 - cfun->machine->frame.hard_fp_offset;
3557 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3558 rtx mem2 = gen_frame_mem (mode, addr2);
3559 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3560 : gen_rtx_SET (reg2, mem2);
3562 if (prologue_p)
3563 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3564 else
3565 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3567 RTX_FRAME_RELATED_P (insn) = 1;
3568 if (prologue_p)
3570 add_reg_note (insn, REG_CFA_OFFSET, set);
3571 add_reg_note (insn, REG_CFA_OFFSET, set2);
3573 else
3575 add_reg_note (insn, REG_CFA_RESTORE, reg);
3576 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3579 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3583 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3585 static void
3586 aarch64_emit_prologue_components (sbitmap components)
3588 aarch64_process_components (components, true);
3591 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3593 static void
3594 aarch64_emit_epilogue_components (sbitmap components)
3596 aarch64_process_components (components, false);
3599 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3601 static void
3602 aarch64_set_handled_components (sbitmap components)
3604 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3605 if (bitmap_bit_p (components, regno))
3606 cfun->machine->reg_is_wrapped_separately[regno] = true;
3609 /* AArch64 stack frames generated by this compiler look like:
3611 +-------------------------------+
3613 | incoming stack arguments |
3615 +-------------------------------+
3616 | | <-- incoming stack pointer (aligned)
3617 | callee-allocated save area |
3618 | for register varargs |
3620 +-------------------------------+
3621 | local variables | <-- frame_pointer_rtx
3623 +-------------------------------+
3624 | padding0 | \
3625 +-------------------------------+ |
3626 | callee-saved registers | | frame.saved_regs_size
3627 +-------------------------------+ |
3628 | LR' | |
3629 +-------------------------------+ |
3630 | FP' | / <- hard_frame_pointer_rtx (aligned)
3631 +-------------------------------+
3632 | dynamic allocation |
3633 +-------------------------------+
3634 | padding |
3635 +-------------------------------+
3636 | outgoing stack arguments | <-- arg_pointer
3638 +-------------------------------+
3639 | | <-- stack_pointer_rtx (aligned)
3641 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3642 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3643 unchanged. */
3645 /* Generate the prologue instructions for entry into a function.
3646 Establish the stack frame by decreasing the stack pointer with a
3647 properly calculated size and, if necessary, create a frame record
3648 filled with the values of LR and previous frame pointer. The
3649 current FP is also set up if it is in use. */
3651 void
3652 aarch64_expand_prologue (void)
3654 aarch64_layout_frame ();
3656 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3657 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3658 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3659 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3660 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3661 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3662 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3663 rtx_insn *insn;
3665 /* Sign return address for functions. */
3666 if (aarch64_return_address_signing_enabled ())
3668 insn = emit_insn (gen_pacisp ());
3669 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3670 RTX_FRAME_RELATED_P (insn) = 1;
3673 if (flag_stack_usage_info)
3674 current_function_static_stack_size = frame_size;
3676 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3678 if (crtl->is_leaf && !cfun->calls_alloca)
3680 if (frame_size > PROBE_INTERVAL
3681 && frame_size > get_stack_check_protect ())
3682 aarch64_emit_probe_stack_range (get_stack_check_protect (),
3683 (frame_size
3684 - get_stack_check_protect ()));
3686 else if (frame_size > 0)
3687 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
3690 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3692 if (callee_adjust != 0)
3693 aarch64_push_regs (reg1, reg2, callee_adjust);
3695 if (frame_pointer_needed)
3697 if (callee_adjust == 0)
3698 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3699 R30_REGNUM, false);
3700 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3701 stack_pointer_rtx,
3702 GEN_INT (callee_offset)));
3703 RTX_FRAME_RELATED_P (insn) = 1;
3704 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3707 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3708 callee_adjust != 0 || frame_pointer_needed);
3709 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3710 callee_adjust != 0 || frame_pointer_needed);
3711 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3714 /* Return TRUE if we can use a simple_return insn.
3716 This function checks whether the callee-saved register area is empty,
3717 which means no restore actions are needed. The pro_and_epilogue pass
3718 uses this to check whether the shrink-wrapping optimization is feasible. */
3720 bool
3721 aarch64_use_return_insn_p (void)
3723 if (!reload_completed)
3724 return false;
3726 if (crtl->profile)
3727 return false;
3729 aarch64_layout_frame ();
3731 return cfun->machine->frame.frame_size == 0;
3734 /* Generate the epilogue instructions for returning from a function.
3735 This is almost exactly the reverse of the prologue sequence, except
3736 that we need to insert barriers to avoid scheduling loads that read
3737 from a deallocated stack, and we optimize the unwind records by
3738 emitting them all together if possible. */
3739 void
3740 aarch64_expand_epilogue (bool for_sibcall)
3742 aarch64_layout_frame ();
3744 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3745 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3746 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3747 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3748 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3749 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3750 rtx cfi_ops = NULL;
3751 rtx_insn *insn;
3753 /* We need a memory barrier to prevent reads from the deallocated stack. */
3754 bool need_barrier_p = (get_frame_size ()
3755 + cfun->machine->frame.saved_varargs_size) != 0;
3757 /* Emit a barrier to prevent loads from a deallocated stack. */
3758 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3759 || crtl->calls_eh_return)
3761 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3762 need_barrier_p = false;
3765 /* Restore the stack pointer from the frame pointer if it may not
3766 be the same as the stack pointer. */
3767 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3769 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3770 hard_frame_pointer_rtx,
3771 GEN_INT (-callee_offset)));
3772 /* If writeback is used when restoring callee-saves, the CFA
3773 is restored on the instruction doing the writeback. */
3774 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3776 else
3777 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3779 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3780 callee_adjust != 0, &cfi_ops);
3781 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3782 callee_adjust != 0, &cfi_ops);
3784 if (need_barrier_p)
3785 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3787 if (callee_adjust != 0)
3788 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3790 if (callee_adjust != 0 || initial_adjust > 65536)
3792 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3793 insn = get_last_insn ();
3794 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3795 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3796 RTX_FRAME_RELATED_P (insn) = 1;
3797 cfi_ops = NULL;
3800 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3802 if (cfi_ops)
3804 /* Emit delayed restores and reset the CFA to be SP. */
3805 insn = get_last_insn ();
3806 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3807 REG_NOTES (insn) = cfi_ops;
3808 RTX_FRAME_RELATED_P (insn) = 1;
3811 /* We prefer to emit the combined return/authenticate instruction RETAA,
3812 however there are three cases in which we must instead emit an explicit
3813 authentication instruction.
3815 1) Sibcalls don't return in a normal way, so if we're about to call one
3816 we must authenticate.
3818 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3819 generating code for !TARGET_ARMV8_3 we can't use it and must
3820 explicitly authenticate.
3822 3) On an eh_return path we make extra stack adjustments to update the
3823 canonical frame address to be the exception handler's CFA. We want
3824 to authenticate using the CFA of the function which calls eh_return.
3826 if (aarch64_return_address_signing_enabled ()
3827 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3829 insn = emit_insn (gen_autisp ());
3830 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3831 RTX_FRAME_RELATED_P (insn) = 1;
3834 /* Stack adjustment for exception handler. */
3835 if (crtl->calls_eh_return)
3837 /* We need to unwind the stack by the offset computed by
3838 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3839 to be SP; letting the CFA move during this adjustment
3840 is just as correct as retaining the CFA from the body
3841 of the function. Therefore, do nothing special. */
3842 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3845 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3846 if (!for_sibcall)
3847 emit_jump_insn (ret_rtx);
3850 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3851 normally or return to a previous frame after unwinding.
3853 An EH return uses a single shared return sequence. The epilogue is
3854 exactly like a normal epilogue except that it has an extra input
3855 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3856 that must be applied after the frame has been destroyed. An extra label
3857 is inserted before the epilogue which initializes this register to zero,
3858 and this is the entry point for a normal return.
3860 An actual EH return updates the return address, initializes the stack
3861 adjustment and jumps directly into the epilogue (bypassing the zeroing
3862 of the adjustment). Since the return address is typically saved on the
3863 stack when a function makes a call, the saved LR must be updated outside
3864 the epilogue.
3866 This poses problems as the store is generated well before the epilogue,
3867 so the offset of LR is not known yet. Also, optimizations will remove the
3868 store because it appears dead, even after the epilogue is generated (as the
3869 base or offset for loading LR is different in many cases).
3871 To avoid these problems this implementation forces the frame pointer
3872 in eh_return functions so that the location of LR is fixed and known early.
3873 It also marks the store volatile, so no optimization is permitted to
3874 remove the store. */
3876 aarch64_eh_return_handler_rtx (void)
3878 rtx tmp = gen_frame_mem (Pmode,
3879 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3881 /* Mark the store volatile, so no optimization is permitted to remove it. */
3882 MEM_VOLATILE_P (tmp) = true;
3883 return tmp;
3886 /* Output code to add DELTA to the first argument, and then jump
3887 to FUNCTION. Used for C++ multiple inheritance. */
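/* As a rough illustration (not from the original sources): for DELTA == 8 and
   VCALL_OFFSET == 0 the code below amounts to
       add x0, x0, 8
       b   <function>
   i.e. bump the this pointer and tail-call the target.  */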
3888 static void
3889 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3890 HOST_WIDE_INT delta,
3891 HOST_WIDE_INT vcall_offset,
3892 tree function)
3894 /* The this pointer is always in x0. Note that this differs from
3895 Arm where the this pointer may be bumped to r1 if r0 is required
3896 to return a pointer to an aggregate. On AArch64 a result value
3897 pointer will be in x8. */
3898 int this_regno = R0_REGNUM;
3899 rtx this_rtx, temp0, temp1, addr, funexp;
3900 rtx_insn *insn;
3902 reload_completed = 1;
3903 emit_note (NOTE_INSN_PROLOGUE_END);
3905 if (vcall_offset == 0)
3906 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3907 else
3909 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3911 this_rtx = gen_rtx_REG (Pmode, this_regno);
3912 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3913 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3915 addr = this_rtx;
3916 if (delta != 0)
3918 if (delta >= -256 && delta < 256)
3919 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3920 plus_constant (Pmode, this_rtx, delta));
3921 else
3922 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3925 if (Pmode == ptr_mode)
3926 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3927 else
3928 aarch64_emit_move (temp0,
3929 gen_rtx_ZERO_EXTEND (Pmode,
3930 gen_rtx_MEM (ptr_mode, addr)));
3932 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3933 addr = plus_constant (Pmode, temp0, vcall_offset);
3934 else
3936 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3937 Pmode);
3938 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3941 if (Pmode == ptr_mode)
3942 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3943 else
3944 aarch64_emit_move (temp1,
3945 gen_rtx_SIGN_EXTEND (Pmode,
3946 gen_rtx_MEM (ptr_mode, addr)));
3948 emit_insn (gen_add2_insn (this_rtx, temp1));
3951 /* Generate a tail call to the target function. */
3952 if (!TREE_USED (function))
3954 assemble_external (function);
3955 TREE_USED (function) = 1;
3957 funexp = XEXP (DECL_RTL (function), 0);
3958 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3959 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3960 SIBLING_CALL_P (insn) = 1;
3962 insn = get_insns ();
3963 shorten_branches (insn);
3964 final_start_function (insn, file, 1);
3965 final (insn, file, 1);
3966 final_end_function ();
3968 /* Stop pretending to be a post-reload pass. */
3969 reload_completed = 0;
3972 static bool
3973 aarch64_tls_referenced_p (rtx x)
3975 if (!TARGET_HAVE_TLS)
3976 return false;
3977 subrtx_iterator::array_type array;
3978 FOR_EACH_SUBRTX (iter, array, x, ALL)
3980 const_rtx x = *iter;
3981 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3982 return true;
3983 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3984 TLS offsets, not real symbol references. */
3985 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3986 iter.skip_subrtxes ();
3988 return false;
3992 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3993 a left shift of 0 or 12 bits. */
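/* Illustrative examples (not from the original sources): 0xabc and 0xabc000
   satisfy this test, while 0xabc00 and 0x1001 do not, because their set bits
   straddle the 12-bit field at shift 0 or shift 12.  */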
3994 bool
3995 aarch64_uimm12_shift (HOST_WIDE_INT val)
3997 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3998 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
4003 /* Return true if val is an immediate that can be loaded into a
4004 register by a MOVZ instruction. */
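/* Illustrative examples (not from the original sources): 0xabcd0000 can be
   loaded with a single MOVZ (0xabcd placed at bit 16), whereas 0x12345 cannot,
   because its set bits span two 16-bit fields.  */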
4005 static bool
4006 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
4008 if (GET_MODE_SIZE (mode) > 4)
4010 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4011 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4012 return 1;
4014 else
4016 /* Ignore sign extension. */
4017 val &= (HOST_WIDE_INT) 0xffffffff;
4019 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4020 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4023 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4025 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4027 0x0000000100000001ull,
4028 0x0001000100010001ull,
4029 0x0101010101010101ull,
4030 0x1111111111111111ull,
4031 0x5555555555555555ull,
4035 /* Return true if val is a valid bitmask immediate. */
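/* Illustrative examples (not from the original sources): 0x0f0f0f0f0f0f0f0f
   (a 4-bit run of ones repeated in every 8-bit element) and
   0x0000ffffffff0000 (a single run of 32 ones) are valid bitmask immediates,
   whereas 0xabc is not, since its set bits do not form a repeated contiguous
   run.  */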
4037 bool
4038 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4040 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4041 int bits;
4043 /* Check for a single sequence of one bits and return quickly if so.
4044 The special cases of all ones and all zeros return false. */
4045 val = (unsigned HOST_WIDE_INT) val_in;
4046 tmp = val + (val & -val);
4048 if (tmp == (tmp & -tmp))
4049 return (val + 1) > 1;
4051 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4052 if (mode == SImode)
4053 val = (val << 32) | (val & 0xffffffff);
4055 /* Invert if the immediate doesn't start with a zero bit - this means we
4056 only need to search for sequences of one bits. */
4057 if (val & 1)
4058 val = ~val;
4060 /* Find the first set bit and set tmp to val with the first sequence of one
4061 bits removed. Return success if there is a single sequence of ones. */
4062 first_one = val & -val;
4063 tmp = val & (val + first_one);
4065 if (tmp == 0)
4066 return true;
4068 /* Find the next set bit and compute the difference in bit position. */
4069 next_one = tmp & -tmp;
4070 bits = clz_hwi (first_one) - clz_hwi (next_one);
4071 mask = val ^ tmp;
4073 /* Check the bit position difference is a power of 2, and that the first
4074 sequence of one bits fits within 'bits' bits. */
4075 if ((mask >> bits) != 0 || bits != (bits & -bits))
4076 return false;
4078 /* Check the sequence of one bits is repeated 64/bits times. */
4079 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4082 /* Create a mask of ones covering the range from the lowest to the highest
4083 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
4085 unsigned HOST_WIDE_INT
4086 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4088 int lowest_bit_set = ctz_hwi (val_in);
4089 int highest_bit_set = floor_log2 (val_in);
4090 gcc_assert (val_in != 0);
4092 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4093 (HOST_WIDE_INT_1U << lowest_bit_set));
4096 /* Create a constant in which all bits outside the range from the lowest set
4097 bit to the highest set bit of VAL_IN are set to 1. */
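/* Illustrative example (not from the original sources): for VAL_IN == 0x1100
   (bits 8 and 12 set), aarch64_and_split_imm1 returns 0x1f00 and
   aarch64_and_split_imm2 returns 0xfffffffffffff1ff.  ANDing with both masks
   in turn is equivalent to ANDing with 0x1100, and each mask is itself a
   valid bitmask immediate.  */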
4099 unsigned HOST_WIDE_INT
4100 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4102 return val_in | ~aarch64_and_split_imm1 (val_in);
4105 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4107 bool
4108 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4110 scalar_int_mode int_mode;
4111 if (!is_a <scalar_int_mode> (mode, &int_mode))
4112 return false;
4114 if (aarch64_bitmask_imm (val_in, int_mode))
4115 return false;
4117 if (aarch64_move_imm (val_in, int_mode))
4118 return false;
4120 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4122 return aarch64_bitmask_imm (imm2, int_mode);
4125 /* Return true if val is an immediate that can be loaded into a
4126 register in a single instruction. */
4127 bool
4128 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4130 scalar_int_mode int_mode;
4131 if (!is_a <scalar_int_mode> (mode, &int_mode))
4132 return false;
4134 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4135 return 1;
4136 return aarch64_bitmask_imm (val, int_mode);
4139 static bool
4140 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4142 rtx base, offset;
4144 if (GET_CODE (x) == HIGH)
4145 return true;
4147 split_const (x, &base, &offset);
4148 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4150 if (aarch64_classify_symbol (base, offset)
4151 != SYMBOL_FORCE_TO_MEM)
4152 return true;
4153 else
4154 /* Avoid generating a 64-bit relocation in ILP32; leave
4155 to aarch64_expand_mov_immediate to handle it properly. */
4156 return mode != ptr_mode;
4159 return aarch64_tls_referenced_p (x);
4162 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4163 The expansion for a table switch is quite expensive due to the number
4164 of instructions, the table lookup and the hard-to-predict indirect jump.
4165 When optimizing for speed at -O3, use the per-core tuning if
4166 set, otherwise use tables for > 16 cases as a tradeoff between size and
4167 performance. When optimizing for size, use the default setting. */
4169 static unsigned int
4170 aarch64_case_values_threshold (void)
4172 /* Use the specified limit for the number of cases before using jump
4173 tables at higher optimization levels. */
4174 if (optimize > 2
4175 && selected_cpu->tune->max_case_values != 0)
4176 return selected_cpu->tune->max_case_values;
4177 else
4178 return optimize_size ? default_case_values_threshold () : 17;
4181 /* Return true if register REGNO is a valid index register.
4182 STRICT_P is true if REG_OK_STRICT is in effect. */
4184 bool
4185 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4187 if (!HARD_REGISTER_NUM_P (regno))
4189 if (!strict_p)
4190 return true;
4192 if (!reg_renumber)
4193 return false;
4195 regno = reg_renumber[regno];
4197 return GP_REGNUM_P (regno);
4200 /* Return true if register REGNO is a valid base register.
4201 STRICT_P is true if REG_OK_STRICT is in effect. */
4203 bool
4204 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4206 if (!HARD_REGISTER_NUM_P (regno))
4208 if (!strict_p)
4209 return true;
4211 if (!reg_renumber)
4212 return false;
4214 regno = reg_renumber[regno];
4217 /* The fake registers will be eliminated to either the stack or
4218 hard frame pointer, both of which are usually valid base registers.
4219 Reload deals with the cases where the eliminated form isn't valid. */
4220 return (GP_REGNUM_P (regno)
4221 || regno == SP_REGNUM
4222 || regno == FRAME_POINTER_REGNUM
4223 || regno == ARG_POINTER_REGNUM);
4226 /* Return true if X is a valid base register.
4227 STRICT_P is true if REG_OK_STRICT is in effect. */
4229 static bool
4230 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4232 if (!strict_p
4233 && GET_CODE (x) == SUBREG
4234 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4235 x = SUBREG_REG (x);
4237 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4240 /* Return true if X is a valid address index. If it is, fill in INFO
4241 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
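/* Illustrative examples of the index forms recognized below, in assembly
   syntax: [x0, x1] and [x0, x1, lsl 3] classify as ADDRESS_REG_REG, while
   [x0, w1, sxtw 2] and [x0, w1, uxtw 2] classify as ADDRESS_REG_SXTW and
   ADDRESS_REG_UXTW respectively.  */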
4243 static bool
4244 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4245 machine_mode mode, bool strict_p)
4247 enum aarch64_address_type type;
4248 rtx index;
4249 int shift;
4251 /* (reg:P) */
4252 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4253 && GET_MODE (x) == Pmode)
4255 type = ADDRESS_REG_REG;
4256 index = x;
4257 shift = 0;
4259 /* (sign_extend:DI (reg:SI)) */
4260 else if ((GET_CODE (x) == SIGN_EXTEND
4261 || GET_CODE (x) == ZERO_EXTEND)
4262 && GET_MODE (x) == DImode
4263 && GET_MODE (XEXP (x, 0)) == SImode)
4265 type = (GET_CODE (x) == SIGN_EXTEND)
4266 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267 index = XEXP (x, 0);
4268 shift = 0;
4270 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4271 else if (GET_CODE (x) == MULT
4272 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274 && GET_MODE (XEXP (x, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x, 1)))
4278 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280 index = XEXP (XEXP (x, 0), 0);
4281 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4283 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4284 else if (GET_CODE (x) == ASHIFT
4285 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4286 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4287 && GET_MODE (XEXP (x, 0)) == DImode
4288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4289 && CONST_INT_P (XEXP (x, 1)))
4291 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293 index = XEXP (XEXP (x, 0), 0);
4294 shift = INTVAL (XEXP (x, 1));
4296 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4297 else if ((GET_CODE (x) == SIGN_EXTRACT
4298 || GET_CODE (x) == ZERO_EXTRACT)
4299 && GET_MODE (x) == DImode
4300 && GET_CODE (XEXP (x, 0)) == MULT
4301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4302 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4304 type = (GET_CODE (x) == SIGN_EXTRACT)
4305 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4306 index = XEXP (XEXP (x, 0), 0);
4307 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4308 if (INTVAL (XEXP (x, 1)) != 32 + shift
4309 || INTVAL (XEXP (x, 2)) != 0)
4310 shift = -1;
4312 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4313 (const_int 0xffffffff<<shift)) */
4314 else if (GET_CODE (x) == AND
4315 && GET_MODE (x) == DImode
4316 && GET_CODE (XEXP (x, 0)) == MULT
4317 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4318 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4319 && CONST_INT_P (XEXP (x, 1)))
4321 type = ADDRESS_REG_UXTW;
4322 index = XEXP (XEXP (x, 0), 0);
4323 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4324 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4325 shift = -1;
4327 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4328 else if ((GET_CODE (x) == SIGN_EXTRACT
4329 || GET_CODE (x) == ZERO_EXTRACT)
4330 && GET_MODE (x) == DImode
4331 && GET_CODE (XEXP (x, 0)) == ASHIFT
4332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4333 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4335 type = (GET_CODE (x) == SIGN_EXTRACT)
4336 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4337 index = XEXP (XEXP (x, 0), 0);
4338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4339 if (INTVAL (XEXP (x, 1)) != 32 + shift
4340 || INTVAL (XEXP (x, 2)) != 0)
4341 shift = -1;
4343 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4344 (const_int 0xffffffff<<shift)) */
4345 else if (GET_CODE (x) == AND
4346 && GET_MODE (x) == DImode
4347 && GET_CODE (XEXP (x, 0)) == ASHIFT
4348 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4349 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4350 && CONST_INT_P (XEXP (x, 1)))
4352 type = ADDRESS_REG_UXTW;
4353 index = XEXP (XEXP (x, 0), 0);
4354 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4355 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4356 shift = -1;
4358 /* (mult:P (reg:P) (const_int scale)) */
4359 else if (GET_CODE (x) == MULT
4360 && GET_MODE (x) == Pmode
4361 && GET_MODE (XEXP (x, 0)) == Pmode
4362 && CONST_INT_P (XEXP (x, 1)))
4364 type = ADDRESS_REG_REG;
4365 index = XEXP (x, 0);
4366 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4368 /* (ashift:P (reg:P) (const_int shift)) */
4369 else if (GET_CODE (x) == ASHIFT
4370 && GET_MODE (x) == Pmode
4371 && GET_MODE (XEXP (x, 0)) == Pmode
4372 && CONST_INT_P (XEXP (x, 1)))
4374 type = ADDRESS_REG_REG;
4375 index = XEXP (x, 0);
4376 shift = INTVAL (XEXP (x, 1));
4378 else
4379 return false;
4381 if (!strict_p
4382 && GET_CODE (index) == SUBREG
4383 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4384 index = SUBREG_REG (index);
4386 if ((shift == 0
4387 || (shift > 0 && shift <= 3
4388 && (1 << shift) == GET_MODE_SIZE (mode)))
4389 && REG_P (index)
4390 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4392 info->type = type;
4393 info->offset = index;
4394 info->shift = shift;
4395 return true;
4398 return false;
4401 /* Return true if MODE is one of the modes for which we
4402 support LDP/STP operations. */
4404 static bool
4405 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4407 return mode == SImode || mode == DImode
4408 || mode == SFmode || mode == DFmode
4409 || (aarch64_vector_mode_supported_p (mode)
4410 && GET_MODE_SIZE (mode) == 8);
4413 /* Return true if REGNO is a virtual pointer register, or an eliminable
4414 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4415 include stack_pointer or hard_frame_pointer. */
4416 static bool
4417 virt_or_elim_regno_p (unsigned regno)
4419 return ((regno >= FIRST_VIRTUAL_REGISTER
4420 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4421 || regno == FRAME_POINTER_REGNUM
4422 || regno == ARG_POINTER_REGNUM);
4425 /* Return true if X is a valid address for machine mode MODE. If it is,
4426 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4427 effect. OUTER_CODE is PARALLEL for a load/store pair. */
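/* Illustrative examples of addresses accepted below, in assembly syntax:
   [x0] and [x0, 16] (ADDRESS_REG_IMM), [x0, x1, lsl 2] (ADDRESS_REG_REG),
   [x0], 16 and [x0, 16]! (ADDRESS_REG_WB), and [x0, #:lo12:sym]
   (ADDRESS_LO_SUM).  */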
4429 static bool
4430 aarch64_classify_address (struct aarch64_address_info *info,
4431 rtx x, machine_mode mode,
4432 RTX_CODE outer_code, bool strict_p)
4434 enum rtx_code code = GET_CODE (x);
4435 rtx op0, op1;
4437 /* On BE, we use load/store pair for all large int mode load/stores.
4438 TI/TFmode may also use a load/store pair. */
4439 bool load_store_pair_p = (outer_code == PARALLEL
4440 || mode == TImode
4441 || mode == TFmode
4442 || (BYTES_BIG_ENDIAN
4443 && aarch64_vect_struct_mode_p (mode)));
4445 bool allow_reg_index_p =
4446 !load_store_pair_p
4447 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4448 && !aarch64_vect_struct_mode_p (mode);
4450 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4451 REG addressing. */
4452 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4453 && (code != POST_INC && code != REG))
4454 return false;
4456 switch (code)
4458 case REG:
4459 case SUBREG:
4460 info->type = ADDRESS_REG_IMM;
4461 info->base = x;
4462 info->offset = const0_rtx;
4463 return aarch64_base_register_rtx_p (x, strict_p);
4465 case PLUS:
4466 op0 = XEXP (x, 0);
4467 op1 = XEXP (x, 1);
4469 if (! strict_p
4470 && REG_P (op0)
4471 && virt_or_elim_regno_p (REGNO (op0))
4472 && CONST_INT_P (op1))
4474 info->type = ADDRESS_REG_IMM;
4475 info->base = op0;
4476 info->offset = op1;
4478 return true;
4481 if (GET_MODE_SIZE (mode) != 0
4482 && CONST_INT_P (op1)
4483 && aarch64_base_register_rtx_p (op0, strict_p))
4485 HOST_WIDE_INT offset = INTVAL (op1);
4487 info->type = ADDRESS_REG_IMM;
4488 info->base = op0;
4489 info->offset = op1;
4491 /* TImode and TFmode values are allowed in both pairs of X
4492 registers and individual Q registers. The available
4493 address modes are:
4494 X,X: 7-bit signed scaled offset
4495 Q: 9-bit signed offset
4496 We conservatively require an offset representable in either mode.
4497 When performing the check for pairs of X registers i.e. LDP/STP
4498 pass down DImode since that is the natural size of the LDP/STP
4499 instruction memory accesses. */
4500 if (mode == TImode || mode == TFmode)
4501 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4502 && (offset_9bit_signed_unscaled_p (mode, offset)
4503 || offset_12bit_unsigned_scaled_p (mode, offset)));
4505 /* A 7-bit offset check because OImode will emit an ldp/stp
4506 instruction (only big endian will get here).
4507 For ldp/stp instructions, the offset is scaled for the size of a
4508 single element of the pair. */
4509 if (mode == OImode)
4510 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4512 /* Three 9/12-bit offset checks because CImode will emit three
4513 ldr/str instructions (only big endian will get here). */
4514 if (mode == CImode)
4515 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4516 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4517 || offset_12bit_unsigned_scaled_p (V16QImode,
4518 offset + 32)));
4520 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4521 instructions (only big endian will get here). */
4522 if (mode == XImode)
4523 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4524 && aarch64_offset_7bit_signed_scaled_p (TImode,
4525 offset + 32));
4527 if (load_store_pair_p)
4528 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4529 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4530 else
4531 return (offset_9bit_signed_unscaled_p (mode, offset)
4532 || offset_12bit_unsigned_scaled_p (mode, offset));
4535 if (allow_reg_index_p)
4537 /* Look for base + (scaled/extended) index register. */
4538 if (aarch64_base_register_rtx_p (op0, strict_p)
4539 && aarch64_classify_index (info, op1, mode, strict_p))
4541 info->base = op0;
4542 return true;
4544 if (aarch64_base_register_rtx_p (op1, strict_p)
4545 && aarch64_classify_index (info, op0, mode, strict_p))
4547 info->base = op1;
4548 return true;
4552 return false;
4554 case POST_INC:
4555 case POST_DEC:
4556 case PRE_INC:
4557 case PRE_DEC:
4558 info->type = ADDRESS_REG_WB;
4559 info->base = XEXP (x, 0);
4560 info->offset = NULL_RTX;
4561 return aarch64_base_register_rtx_p (info->base, strict_p);
4563 case POST_MODIFY:
4564 case PRE_MODIFY:
4565 info->type = ADDRESS_REG_WB;
4566 info->base = XEXP (x, 0);
4567 if (GET_CODE (XEXP (x, 1)) == PLUS
4568 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4569 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4570 && aarch64_base_register_rtx_p (info->base, strict_p))
4572 HOST_WIDE_INT offset;
4573 info->offset = XEXP (XEXP (x, 1), 1);
4574 offset = INTVAL (info->offset);
4576 /* TImode and TFmode values are allowed in both pairs of X
4577 registers and individual Q registers. The available
4578 address modes are:
4579 X,X: 7-bit signed scaled offset
4580 Q: 9-bit signed offset
4581 We conservatively require an offset representable in either mode.
4583 if (mode == TImode || mode == TFmode)
4584 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4585 && offset_9bit_signed_unscaled_p (mode, offset));
4587 if (load_store_pair_p)
4588 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4589 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4590 else
4591 return offset_9bit_signed_unscaled_p (mode, offset);
4593 return false;
4595 case CONST:
4596 case SYMBOL_REF:
4597 case LABEL_REF:
4598 /* load literal: pc-relative constant pool entry. Only supported
4599 for SI mode or larger. */
4600 info->type = ADDRESS_SYMBOLIC;
4602 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4604 rtx sym, addend;
4606 split_const (x, &sym, &addend);
4607 return ((GET_CODE (sym) == LABEL_REF
4608 || (GET_CODE (sym) == SYMBOL_REF
4609 && CONSTANT_POOL_ADDRESS_P (sym)
4610 && aarch64_pcrelative_literal_loads)));
4612 return false;
4614 case LO_SUM:
4615 info->type = ADDRESS_LO_SUM;
4616 info->base = XEXP (x, 0);
4617 info->offset = XEXP (x, 1);
4618 if (allow_reg_index_p
4619 && aarch64_base_register_rtx_p (info->base, strict_p))
4621 rtx sym, offs;
4622 split_const (info->offset, &sym, &offs);
4623 if (GET_CODE (sym) == SYMBOL_REF
4624 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4626 /* The symbol and offset must be aligned to the access size. */
4627 unsigned int align;
4628 unsigned int ref_size;
4630 if (CONSTANT_POOL_ADDRESS_P (sym))
4631 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4632 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4634 tree exp = SYMBOL_REF_DECL (sym);
4635 align = TYPE_ALIGN (TREE_TYPE (exp));
4636 align = aarch64_constant_alignment (exp, align);
4638 else if (SYMBOL_REF_DECL (sym))
4639 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4640 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4641 && SYMBOL_REF_BLOCK (sym) != NULL)
4642 align = SYMBOL_REF_BLOCK (sym)->alignment;
4643 else
4644 align = BITS_PER_UNIT;
4646 ref_size = GET_MODE_SIZE (mode);
4647 if (ref_size == 0)
4648 ref_size = GET_MODE_SIZE (DImode);
4650 return ((INTVAL (offs) & (ref_size - 1)) == 0
4651 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4654 return false;
4656 default:
4657 return false;
4661 /* Return true if the address X is valid for a PRFM instruction.
4662 STRICT_P is true if we should do strict checking with
4663 aarch64_classify_address. */
4665 bool
4666 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4668 struct aarch64_address_info addr;
4670 /* PRFM accepts the same addresses as DImode... */
4671 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4672 if (!res)
4673 return false;
4675 /* ... except writeback forms. */
4676 return addr.type != ADDRESS_REG_WB;
4679 bool
4680 aarch64_symbolic_address_p (rtx x)
4682 rtx offset;
4684 split_const (x, &x, &offset);
4685 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4688 /* Classify the base of symbolic expression X. */
4690 enum aarch64_symbol_type
4691 aarch64_classify_symbolic_expression (rtx x)
4693 rtx offset;
4695 split_const (x, &x, &offset);
4696 return aarch64_classify_symbol (x, offset);
4700 /* Return TRUE if X is a legitimate address for accessing memory in
4701 mode MODE. */
4702 static bool
4703 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4705 struct aarch64_address_info addr;
4707 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4710 /* Return TRUE if X is a legitimate address for accessing memory in
4711 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4712 pair operation. */
4713 bool
4714 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4715 RTX_CODE outer_code, bool strict_p)
4717 struct aarch64_address_info addr;
4719 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4722 /* Split an out-of-range address displacement into a base and offset.
4723 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4724 to increase opportunities for sharing the same base address between accesses of different sizes.
4725 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
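/* Illustrative example (not from the original sources): a DImode access at
   offset 0x4008 is split into an anchor of 0x4000 (returned in *OFF) and a
   residual displacement of 8 (returned in *DISP).  */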
4726 static bool
4727 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4729 HOST_WIDE_INT offset = INTVAL (*disp);
4730 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4732 if (mode == TImode || mode == TFmode
4733 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4734 base = (offset + 0x100) & ~0x1ff;
4736 *off = GEN_INT (base);
4737 *disp = GEN_INT (offset - base);
4738 return true;
4741 /* Return the binary representation of floating point constant VALUE in INTVAL.
4742 If the value cannot be converted, return false without setting INTVAL.
4743 The conversion is done in the mode of VALUE. */
4744 bool
4745 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4748 /* We make a general exception for 0. */
4749 if (aarch64_float_const_zero_rtx_p (value))
4751 *intval = 0;
4752 return true;
4755 machine_mode mode = GET_MODE (value);
4756 if (GET_CODE (value) != CONST_DOUBLE
4757 || !SCALAR_FLOAT_MODE_P (mode)
4758 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4759 /* Only support up to DF mode. */
4760 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4761 return false;
4763 unsigned HOST_WIDE_INT ival = 0;
4765 long res[2];
4766 real_to_target (res,
4767 CONST_DOUBLE_REAL_VALUE (value),
4768 REAL_MODE_FORMAT (mode));
4770 if (mode == DFmode)
4772 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4773 ival = zext_hwi (res[order], 32);
4774 ival |= (zext_hwi (res[1 - order], 32) << 32);
4776 else
4777 ival = zext_hwi (res[0], 32);
4779 *intval = ival;
4780 return true;
4783 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4784 single MOV(+MOVK) followed by an FMOV. */
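/* Illustrative example (not from the original sources): 1.0 in DFmode has the
   bit pattern 0x3ff0000000000000, which a single MOVZ (16 bits at LSL 48) can
   materialize, so this predicate returns true for it.  */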
4785 bool
4786 aarch64_float_const_rtx_p (rtx x)
4788 machine_mode mode = GET_MODE (x);
4789 if (mode == VOIDmode)
4790 return false;
4792 /* Determine whether it's cheaper to write float constants as
4793 mov/movk pairs rather than ldr/adrp pairs. */
4794 unsigned HOST_WIDE_INT ival;
4796 if (GET_CODE (x) == CONST_DOUBLE
4797 && SCALAR_FLOAT_MODE_P (mode)
4798 && aarch64_reinterpret_float_as_int (x, &ival))
4800 scalar_int_mode imode = (mode == HFmode
4801 ? SImode
4802 : int_mode_for_mode (mode).require ());
4803 int num_instr = aarch64_internal_mov_immediate
4804 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4805 return num_instr < 3;
4808 return false;
4811 /* Return TRUE if rtx X is the immediate constant 0.0. */
4812 bool
4813 aarch64_float_const_zero_rtx_p (rtx x)
4815 if (GET_MODE (x) == VOIDmode)
4816 return false;
4818 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4819 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4820 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4823 /* Return TRUE if rtx X is an immediate constant that fits in a single
4824 MOVI immediate operation. */
4825 bool
4826 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4828 if (!TARGET_SIMD)
4829 return false;
4831 machine_mode vmode;
4832 scalar_int_mode imode;
4833 unsigned HOST_WIDE_INT ival;
4835 if (GET_CODE (x) == CONST_DOUBLE
4836 && SCALAR_FLOAT_MODE_P (mode))
4838 if (!aarch64_reinterpret_float_as_int (x, &ival))
4839 return false;
4841 /* We make a general exception for 0. */
4842 if (aarch64_float_const_zero_rtx_p (x))
4843 return true;
4845 imode = int_mode_for_mode (mode).require ();
4847 else if (GET_CODE (x) == CONST_INT
4848 && is_a <scalar_int_mode> (mode, &imode))
4849 ival = INTVAL (x);
4850 else
4851 return false;
4853 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we use
4854 a 128-bit vector mode. */
4855 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4857 vmode = aarch64_simd_container_mode (imode, width);
4858 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4860 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4864 /* Return the fixed registers used for condition codes. */
4866 static bool
4867 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4869 *p1 = CC_REGNUM;
4870 *p2 = INVALID_REGNUM;
4871 return true;
4874 /* This function is used by the call expanders of the machine description.
4875 RESULT is the register in which the result is returned. It's NULL for
4876 "call" and "sibcall".
4877 MEM is the location of the function call.
4878 SIBCALL indicates whether this is a normal call or a sibling call.
4879 A different pattern is generated accordingly. */
4881 void
4882 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4884 rtx call, callee, tmp;
4885 rtvec vec;
4886 machine_mode mode;
4888 gcc_assert (MEM_P (mem));
4889 callee = XEXP (mem, 0);
4890 mode = GET_MODE (callee);
4891 gcc_assert (mode == Pmode);
4893 /* Decide if we should generate indirect calls by loading the
4894 address of the callee into a register before performing
4895 the branch-and-link. */
4896 if (SYMBOL_REF_P (callee)
4897 ? (aarch64_is_long_call_p (callee)
4898 || aarch64_is_noplt_call_p (callee))
4899 : !REG_P (callee))
4900 XEXP (mem, 0) = force_reg (mode, callee);
4902 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4904 if (result != NULL_RTX)
4905 call = gen_rtx_SET (result, call);
4907 if (sibcall)
4908 tmp = ret_rtx;
4909 else
4910 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4912 vec = gen_rtvec (2, call, tmp);
4913 call = gen_rtx_PARALLEL (VOIDmode, vec);
4915 aarch64_emit_call_insn (call);
4918 /* Emit call insn with PAT and do aarch64-specific handling. */
4920 void
4921 aarch64_emit_call_insn (rtx pat)
4923 rtx insn = emit_call_insn (pat);
4925 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4926 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4927 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4930 machine_mode
4931 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4933 /* All floating point compares return CCFP if it is an equality
4934 comparison, and CCFPE otherwise. */
4935 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4937 switch (code)
4939 case EQ:
4940 case NE:
4941 case UNORDERED:
4942 case ORDERED:
4943 case UNLT:
4944 case UNLE:
4945 case UNGT:
4946 case UNGE:
4947 case UNEQ:
4948 case LTGT:
4949 return CCFPmode;
4951 case LT:
4952 case LE:
4953 case GT:
4954 case GE:
4955 return CCFPEmode;
4957 default:
4958 gcc_unreachable ();
4962 /* Equality comparisons of short modes against zero can be performed
4963 using the TST instruction with the appropriate bitmask. */
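/* Illustrative example (not from the original sources): an equality compare
   of a QImode register against zero can be implemented as TST wN, 0xff,
   which provides only the N and Z flags, hence CC_NZmode.  */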
4964 if (y == const0_rtx && REG_P (x)
4965 && (code == EQ || code == NE)
4966 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4967 return CC_NZmode;
4969 /* Similarly, comparisons of zero_extends from shorter modes can
4970 be performed using an ANDS with an immediate mask. */
4971 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4972 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4973 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4974 && (code == EQ || code == NE))
4975 return CC_NZmode;
4977 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4978 && y == const0_rtx
4979 && (code == EQ || code == NE || code == LT || code == GE)
4980 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4981 || GET_CODE (x) == NEG
4982 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4983 && CONST_INT_P (XEXP (x, 2)))))
4984 return CC_NZmode;
4986 /* A compare with a shifted operand. Because of canonicalization,
4987 the comparison will have to be swapped when we emit the assembly
4988 code. */
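/* Illustrative example (not from the original sources): (compare (ashift x 3) y)
   is emitted as CMP y, x, LSL 3, so the condition must be swapped (GT becomes
   LT, GE becomes LE, and so on), which is what CC_SWPmode records.  */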
4989 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4990 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4991 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4992 || GET_CODE (x) == LSHIFTRT
4993 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4994 return CC_SWPmode;
4996 /* Similarly for a negated operand, but we can only do this for
4997 equalities. */
4998 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4999 && (REG_P (y) || GET_CODE (y) == SUBREG)
5000 && (code == EQ || code == NE)
5001 && GET_CODE (x) == NEG)
5002 return CC_Zmode;
5004 /* A test for unsigned overflow. */
5005 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
5006 && code == NE
5007 && GET_CODE (x) == PLUS
5008 && GET_CODE (y) == ZERO_EXTEND)
5009 return CC_Cmode;
5011 /* For everything else, return CCmode. */
5012 return CCmode;
5015 static int
5016 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5019 aarch64_get_condition_code (rtx x)
5021 machine_mode mode = GET_MODE (XEXP (x, 0));
5022 enum rtx_code comp_code = GET_CODE (x);
5024 if (GET_MODE_CLASS (mode) != MODE_CC)
5025 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5026 return aarch64_get_condition_code_1 (mode, comp_code);
5029 static int
5030 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5032 switch (mode)
5034 case E_CCFPmode:
5035 case E_CCFPEmode:
5036 switch (comp_code)
5038 case GE: return AARCH64_GE;
5039 case GT: return AARCH64_GT;
5040 case LE: return AARCH64_LS;
5041 case LT: return AARCH64_MI;
5042 case NE: return AARCH64_NE;
5043 case EQ: return AARCH64_EQ;
5044 case ORDERED: return AARCH64_VC;
5045 case UNORDERED: return AARCH64_VS;
5046 case UNLT: return AARCH64_LT;
5047 case UNLE: return AARCH64_LE;
5048 case UNGT: return AARCH64_HI;
5049 case UNGE: return AARCH64_PL;
5050 default: return -1;
5052 break;
5054 case E_CCmode:
5055 switch (comp_code)
5057 case NE: return AARCH64_NE;
5058 case EQ: return AARCH64_EQ;
5059 case GE: return AARCH64_GE;
5060 case GT: return AARCH64_GT;
5061 case LE: return AARCH64_LE;
5062 case LT: return AARCH64_LT;
5063 case GEU: return AARCH64_CS;
5064 case GTU: return AARCH64_HI;
5065 case LEU: return AARCH64_LS;
5066 case LTU: return AARCH64_CC;
5067 default: return -1;
5069 break;
5071 case E_CC_SWPmode:
5072 switch (comp_code)
5074 case NE: return AARCH64_NE;
5075 case EQ: return AARCH64_EQ;
5076 case GE: return AARCH64_LE;
5077 case GT: return AARCH64_LT;
5078 case LE: return AARCH64_GE;
5079 case LT: return AARCH64_GT;
5080 case GEU: return AARCH64_LS;
5081 case GTU: return AARCH64_CC;
5082 case LEU: return AARCH64_CS;
5083 case LTU: return AARCH64_HI;
5084 default: return -1;
5086 break;
5088 case E_CC_NZmode:
5089 switch (comp_code)
5091 case NE: return AARCH64_NE;
5092 case EQ: return AARCH64_EQ;
5093 case GE: return AARCH64_PL;
5094 case LT: return AARCH64_MI;
5095 default: return -1;
5097 break;
5099 case E_CC_Zmode:
5100 switch (comp_code)
5102 case NE: return AARCH64_NE;
5103 case EQ: return AARCH64_EQ;
5104 default: return -1;
5106 break;
5108 case E_CC_Cmode:
5109 switch (comp_code)
5111 case NE: return AARCH64_CS;
5112 case EQ: return AARCH64_CC;
5113 default: return -1;
5115 break;
5117 default:
5118 return -1;
5121 return -1;
5124 bool
5125 aarch64_const_vec_all_same_in_range_p (rtx x,
5126 HOST_WIDE_INT minval,
5127 HOST_WIDE_INT maxval)
5129 HOST_WIDE_INT firstval;
5130 int count, i;
5132 if (GET_CODE (x) != CONST_VECTOR
5133 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5134 return false;
5136 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5137 if (firstval < minval || firstval > maxval)
5138 return false;
5140 count = CONST_VECTOR_NUNITS (x);
5141 for (i = 1; i < count; i++)
5142 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5143 return false;
5145 return true;
5148 bool
5149 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5151 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5155 /* N Z C V. */
5156 #define AARCH64_CC_V 1
5157 #define AARCH64_CC_C (1 << 1)
5158 #define AARCH64_CC_Z (1 << 2)
5159 #define AARCH64_CC_N (1 << 3)
5161 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5162 static const int aarch64_nzcv_codes[] =
5164 0, /* EQ, Z == 1. */
5165 AARCH64_CC_Z, /* NE, Z == 0. */
5166 0, /* CS, C == 1. */
5167 AARCH64_CC_C, /* CC, C == 0. */
5168 0, /* MI, N == 1. */
5169 AARCH64_CC_N, /* PL, N == 0. */
5170 0, /* VS, V == 1. */
5171 AARCH64_CC_V, /* VC, V == 0. */
5172 0, /* HI, C == 1 && Z == 0. */
5173 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5174 AARCH64_CC_V, /* GE, N == V. */
5175 0, /* LT, N != V. */
5176 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5177 0, /* LE, !(Z == 0 && N == V). */
5178 0, /* AL, Any. */
5179 0 /* NV, Any. */
5182 /* Print operand X to file F in a target specific manner according to CODE.
5183 The acceptable formatting commands given by CODE are:
5184 'c': An integer or symbol address without a preceding #
5185 sign.
5186 'e': Print the sign/zero-extend size as a character 8->b,
5187 16->h, 32->w.
5188 'p': Prints N such that 2^N == X (X must be power of 2 and
5189 const int).
5190 'P': Print the number of non-zero bits in X (a const_int).
5191 'H': Print the higher numbered register of a pair (TImode)
5192 of regs.
5193 'm': Print a condition (eq, ne, etc).
5194 'M': Same as 'm', but invert condition.
5195 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5196 'S/T/U/V': Print a FP/SIMD register name for a register list.
5197 The register printed is the FP/SIMD register name
5198 of X + 0/1/2/3 for S/T/U/V.
5199 'R': Print a scalar FP/SIMD register name + 1.
5200 'X': Print bottom 16 bits of integer constant in hex.
5201 'w/x': Print a general register name or the zero register
5202 (32-bit or 64-bit).
5203 '0': Print a normal operand; if it's a general register,
5204 then we assume DImode.
5205 'k': Print NZCV for conditional compare instructions.
5206 'A': Output address constant representing the first
5207 argument of X, specifying a relocation offset
5208 if appropriate.
5209 'L': Output constant address specified by X
5210 with a relocation offset if appropriate.
5211 'G': Prints address of X, specifying a PC relative
5212 relocation mode if appropriate. */
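/* Illustrative examples (not from the original sources): in an output
   template, "%w0" prints the 32-bit name of a general register operand
   (e.g. w3), "%x0" prints the 64-bit name (x3), and "%d1" prints the scalar
   FP/SIMD name (e.g. d5) of a vector register operand.  */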
5214 static void
5215 aarch64_print_operand (FILE *f, rtx x, int code)
5217 switch (code)
5219 case 'c':
5220 switch (GET_CODE (x))
5222 case CONST_INT:
5223 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5224 break;
5226 case SYMBOL_REF:
5227 output_addr_const (f, x);
5228 break;
5230 case CONST:
5231 if (GET_CODE (XEXP (x, 0)) == PLUS
5232 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5234 output_addr_const (f, x);
5235 break;
5237 /* Fall through. */
5239 default:
5240 output_operand_lossage ("Unsupported operand for code '%c'", code);
5242 break;
5244 case 'e':
5246 int n;
5248 if (!CONST_INT_P (x)
5249 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5251 output_operand_lossage ("invalid operand for '%%%c'", code);
5252 return;
5255 switch (n)
5257 case 3:
5258 fputc ('b', f);
5259 break;
5260 case 4:
5261 fputc ('h', f);
5262 break;
5263 case 5:
5264 fputc ('w', f);
5265 break;
5266 default:
5267 output_operand_lossage ("invalid operand for '%%%c'", code);
5268 return;
5271 break;
5273 case 'p':
5275 int n;
5277 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5279 output_operand_lossage ("invalid operand for '%%%c'", code);
5280 return;
5283 asm_fprintf (f, "%d", n);
5285 break;
5287 case 'P':
5288 if (!CONST_INT_P (x))
5290 output_operand_lossage ("invalid operand for '%%%c'", code);
5291 return;
5294 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5295 break;
5297 case 'H':
5298 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5300 output_operand_lossage ("invalid operand for '%%%c'", code);
5301 return;
5304 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5305 break;
5307 case 'M':
5308 case 'm':
5310 int cond_code;
5311 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5312 if (x == const_true_rtx)
5314 if (code == 'M')
5315 fputs ("nv", f);
5316 return;
5319 if (!COMPARISON_P (x))
5321 output_operand_lossage ("invalid operand for '%%%c'", code);
5322 return;
5325 cond_code = aarch64_get_condition_code (x);
5326 gcc_assert (cond_code >= 0);
5327 if (code == 'M')
5328 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5329 fputs (aarch64_condition_codes[cond_code], f);
5331 break;
5333 case 'b':
5334 case 'h':
5335 case 's':
5336 case 'd':
5337 case 'q':
5338 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5340 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5341 return;
5343 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5344 break;
5346 case 'S':
5347 case 'T':
5348 case 'U':
5349 case 'V':
5350 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5352 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5353 return;
5355 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5356 break;
5358 case 'R':
5359 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5361 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5362 return;
5364 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5365 break;
5367 case 'X':
5368 if (!CONST_INT_P (x))
5370 output_operand_lossage ("invalid operand for '%%%c'", code);
5371 return;
5373 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5374 break;
5376 case 'w':
5377 case 'x':
5378 if (x == const0_rtx
5379 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5381 asm_fprintf (f, "%czr", code);
5382 break;
5385 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5387 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5388 break;
5391 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5393 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5394 break;
5397 /* Fall through */
5399 case 0:
5400 if (x == NULL)
5402 output_operand_lossage ("missing operand");
5403 return;
5406 switch (GET_CODE (x))
5408 case REG:
5409 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5410 break;
5412 case MEM:
5413 output_address (GET_MODE (x), XEXP (x, 0));
5414 /* Check all memory references are Pmode - even with ILP32. */
5415 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5416 break;
5418 case CONST:
5419 case LABEL_REF:
5420 case SYMBOL_REF:
5421 output_addr_const (asm_out_file, x);
5422 break;
5424 case CONST_INT:
5425 asm_fprintf (f, "%wd", INTVAL (x));
5426 break;
5428 case CONST_VECTOR:
5429 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5431 gcc_assert (
5432 aarch64_const_vec_all_same_in_range_p (x,
5433 HOST_WIDE_INT_MIN,
5434 HOST_WIDE_INT_MAX));
5435 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5437 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5439 fputc ('0', f);
5441 else
5442 gcc_unreachable ();
5443 break;
5445 case CONST_DOUBLE:
5446 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5447 be getting CONST_DOUBLEs holding integers. */
5448 gcc_assert (GET_MODE (x) != VOIDmode);
5449 if (aarch64_float_const_zero_rtx_p (x))
5451 fputc ('0', f);
5452 break;
5454 else if (aarch64_float_const_representable_p (x))
5456 #define buf_size 20
5457 char float_buf[buf_size] = {'\0'};
5458 real_to_decimal_for_mode (float_buf,
5459 CONST_DOUBLE_REAL_VALUE (x),
5460 buf_size, buf_size,
5461 1, GET_MODE (x));
5462 asm_fprintf (asm_out_file, "%s", float_buf);
5463 break;
5464 #undef buf_size
5466 output_operand_lossage ("invalid constant");
5467 return;
5468 default:
5469 output_operand_lossage ("invalid operand");
5470 return;
5472 break;
5474 case 'A':
5475 if (GET_CODE (x) == HIGH)
5476 x = XEXP (x, 0);
5478 switch (aarch64_classify_symbolic_expression (x))
5480 case SYMBOL_SMALL_GOT_4G:
5481 asm_fprintf (asm_out_file, ":got:");
5482 break;
5484 case SYMBOL_SMALL_TLSGD:
5485 asm_fprintf (asm_out_file, ":tlsgd:");
5486 break;
5488 case SYMBOL_SMALL_TLSDESC:
5489 asm_fprintf (asm_out_file, ":tlsdesc:");
5490 break;
5492 case SYMBOL_SMALL_TLSIE:
5493 asm_fprintf (asm_out_file, ":gottprel:");
5494 break;
5496 case SYMBOL_TLSLE24:
5497 asm_fprintf (asm_out_file, ":tprel:");
5498 break;
5500 case SYMBOL_TINY_GOT:
5501 gcc_unreachable ();
5502 break;
5504 default:
5505 break;
5507 output_addr_const (asm_out_file, x);
5508 break;
5510 case 'L':
5511 switch (aarch64_classify_symbolic_expression (x))
5513 case SYMBOL_SMALL_GOT_4G:
5514 asm_fprintf (asm_out_file, ":lo12:");
5515 break;
5517 case SYMBOL_SMALL_TLSGD:
5518 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5519 break;
5521 case SYMBOL_SMALL_TLSDESC:
5522 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5523 break;
5525 case SYMBOL_SMALL_TLSIE:
5526 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5527 break;
5529 case SYMBOL_TLSLE12:
5530 asm_fprintf (asm_out_file, ":tprel_lo12:");
5531 break;
5533 case SYMBOL_TLSLE24:
5534 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5535 break;
5537 case SYMBOL_TINY_GOT:
5538 asm_fprintf (asm_out_file, ":got:");
5539 break;
5541 case SYMBOL_TINY_TLSIE:
5542 asm_fprintf (asm_out_file, ":gottprel:");
5543 break;
5545 default:
5546 break;
5548 output_addr_const (asm_out_file, x);
5549 break;
5551 case 'G':
5552 switch (aarch64_classify_symbolic_expression (x))
5554 case SYMBOL_TLSLE24:
5555 asm_fprintf (asm_out_file, ":tprel_hi12:");
5556 break;
5557 default:
5558 break;
5560 output_addr_const (asm_out_file, x);
5561 break;
5563 case 'k':
5565 HOST_WIDE_INT cond_code;
5567 if (!CONST_INT_P (x))
5569 output_operand_lossage ("invalid operand for '%%%c'", code);
5570 return;
5573 cond_code = INTVAL (x);
5574 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5575 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5577 break;
5579 default:
5580 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5581 return;
5585 static void
5586 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5588 struct aarch64_address_info addr;
5590 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5591 switch (addr.type)
5593 case ADDRESS_REG_IMM:
5594 if (addr.offset == const0_rtx)
5595 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5596 else
5597 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5598 INTVAL (addr.offset));
5599 return;
5601 case ADDRESS_REG_REG:
5602 if (addr.shift == 0)
5603 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5604 reg_names [REGNO (addr.offset)]);
5605 else
5606 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5607 reg_names [REGNO (addr.offset)], addr.shift);
5608 return;
5610 case ADDRESS_REG_UXTW:
5611 if (addr.shift == 0)
5612 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5613 REGNO (addr.offset) - R0_REGNUM);
5614 else
5615 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5616 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5617 return;
5619 case ADDRESS_REG_SXTW:
5620 if (addr.shift == 0)
5621 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5622 REGNO (addr.offset) - R0_REGNUM);
5623 else
5624 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5625 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5626 return;
5628 case ADDRESS_REG_WB:
5629 switch (GET_CODE (x))
5631 case PRE_INC:
5632 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5633 GET_MODE_SIZE (mode));
5634 return;
5635 case POST_INC:
5636 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5637 GET_MODE_SIZE (mode));
5638 return;
5639 case PRE_DEC:
5640 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5641 GET_MODE_SIZE (mode));
5642 return;
5643 case POST_DEC:
5644 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5645 GET_MODE_SIZE (mode));
5646 return;
5647 case PRE_MODIFY:
5648 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5649 INTVAL (addr.offset));
5650 return;
5651 case POST_MODIFY:
5652 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5653 INTVAL (addr.offset));
5654 return;
5655 default:
5656 break;
5658 break;
5660 case ADDRESS_LO_SUM:
5661 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5662 output_addr_const (f, addr.offset);
5663 asm_fprintf (f, "]");
5664 return;
5666 case ADDRESS_SYMBOLIC:
5667 break;
5670 output_addr_const (f, x);
5673 bool
5674 aarch64_label_mentioned_p (rtx x)
5676 const char *fmt;
5677 int i;
5679 if (GET_CODE (x) == LABEL_REF)
5680 return true;
5682 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5683 referencing instruction, but they are constant offsets, not
5684 symbols. */
5685 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5686 return false;
5688 fmt = GET_RTX_FORMAT (GET_CODE (x));
5689 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5691 if (fmt[i] == 'E')
5693 int j;
5695 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5696 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5697 return 1;
5699 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5700 return 1;
5703 return 0;
5706 /* Implement REGNO_REG_CLASS. */
5708 enum reg_class
5709 aarch64_regno_regclass (unsigned regno)
5711 if (GP_REGNUM_P (regno))
5712 return GENERAL_REGS;
5714 if (regno == SP_REGNUM)
5715 return STACK_REG;
5717 if (regno == FRAME_POINTER_REGNUM
5718 || regno == ARG_POINTER_REGNUM)
5719 return POINTER_REGS;
5721 if (FP_REGNUM_P (regno))
5722 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5724 return NO_REGS;
5727 static rtx
5728 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5730 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5731 where mask is selected by alignment and size of the offset.
5732 We try to pick as large a range for the offset as possible to
5733 maximize the chance of a CSE. However, for aligned addresses
5734 we limit the range to 4k so that structures with different sized
5735 elements are likely to use the same base. We need to be careful
5736 not to split a CONST for some forms of address expression, otherwise
5737 it will generate sub-optimal code. */
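/* Illustrative example (not from the original sources): for an SImode access
   at X + 0x13204, the code below rewrites the address as (X + 0x10000) +
   0x3204, so that other accesses near X + 0x10000 can share the anchored
   base via CSE.  */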
5739 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5741 rtx base = XEXP (x, 0);
5742 rtx offset_rtx = XEXP (x, 1);
5743 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5745 if (GET_CODE (base) == PLUS)
5747 rtx op0 = XEXP (base, 0);
5748 rtx op1 = XEXP (base, 1);
5750 /* Force any scaling into a temp for CSE. */
5751 op0 = force_reg (Pmode, op0);
5752 op1 = force_reg (Pmode, op1);
5754 /* Let the pointer register be in op0. */
5755 if (REG_POINTER (op1))
5756 std::swap (op0, op1);
5758 /* If the pointer is virtual or frame related, then we know that
5759 virtual register instantiation or register elimination is going
5760 to apply a second constant. We want the two constants folded
5761 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5762 if (virt_or_elim_regno_p (REGNO (op0)))
5764 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5765 NULL_RTX, true, OPTAB_DIRECT);
5766 return gen_rtx_PLUS (Pmode, base, op1);
5769 /* Otherwise, in order to encourage CSE (and thence loop strength
5770 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5771 base = expand_binop (Pmode, add_optab, op0, op1,
5772 NULL_RTX, true, OPTAB_DIRECT);
5773 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5776 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5777 HOST_WIDE_INT base_offset;
5778 if (GET_MODE_SIZE (mode) > 16)
5779 base_offset = (offset + 0x400) & ~0x7f0;
5780 /* For offsets that aren't a multiple of the access size, the limit is
5781 -256...255. */
5782 else if (offset & (GET_MODE_SIZE (mode) - 1))
5784 base_offset = (offset + 0x100) & ~0x1ff;
5786 /* BLKmode typically uses LDP of X-registers. */
5787 if (mode == BLKmode)
5788 base_offset = (offset + 512) & ~0x3ff;
5790 /* Small negative offsets are supported. */
5791 else if (IN_RANGE (offset, -256, 0))
5792 base_offset = 0;
5793 else if (mode == TImode || mode == TFmode)
5794 base_offset = (offset + 0x100) & ~0x1ff;
5795 /* Use a 12-bit offset scaled by the access size. */
5796 else
5797 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5799 if (base_offset != 0)
5801 base = plus_constant (Pmode, base, base_offset);
5802 base = force_operand (base, NULL_RTX);
5803 return plus_constant (Pmode, base, offset - base_offset);
5807 return x;
5810 /* Return the reload icode required for a constant pool access in mode MODE. */
5811 static enum insn_code
5812 aarch64_constant_pool_reload_icode (machine_mode mode)
5814 switch (mode)
5816 case E_SFmode:
5817 return CODE_FOR_aarch64_reload_movcpsfdi;
5819 case E_DFmode:
5820 return CODE_FOR_aarch64_reload_movcpdfdi;
5822 case E_TFmode:
5823 return CODE_FOR_aarch64_reload_movcptfdi;
5825 case E_V8QImode:
5826 return CODE_FOR_aarch64_reload_movcpv8qidi;
5828 case E_V16QImode:
5829 return CODE_FOR_aarch64_reload_movcpv16qidi;
5831 case E_V4HImode:
5832 return CODE_FOR_aarch64_reload_movcpv4hidi;
5834 case E_V8HImode:
5835 return CODE_FOR_aarch64_reload_movcpv8hidi;
5837 case E_V2SImode:
5838 return CODE_FOR_aarch64_reload_movcpv2sidi;
5840 case E_V4SImode:
5841 return CODE_FOR_aarch64_reload_movcpv4sidi;
5843 case E_V2DImode:
5844 return CODE_FOR_aarch64_reload_movcpv2didi;
5846 case E_V2DFmode:
5847 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5849 default:
5850 gcc_unreachable ();
5853 gcc_unreachable ();
5855 static reg_class_t
5856 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5857 reg_class_t rclass,
5858 machine_mode mode,
5859 secondary_reload_info *sri)
5862 /* If we have to disable direct literal pool loads and stores because the
5863 function is too big, then we need a scratch register. */
5864 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5865 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5866 || targetm.vector_mode_supported_p (GET_MODE (x)))
5867 && !aarch64_pcrelative_literal_loads)
5869 sri->icode = aarch64_constant_pool_reload_icode (mode);
5870 return NO_REGS;
5873 /* Without the TARGET_SIMD instructions we cannot move a Q register
5874 to a Q register directly. We need a scratch. */
5875 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5876 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5877 && reg_class_subset_p (rclass, FP_REGS))
5879 if (mode == TFmode)
5880 sri->icode = CODE_FOR_aarch64_reload_movtf;
5881 else if (mode == TImode)
5882 sri->icode = CODE_FOR_aarch64_reload_movti;
5883 return NO_REGS;
5886 /* A TFmode or TImode memory access should be handled via an FP register,
5887 because AArch64 has richer addressing modes for LDR/STR instructions
5888 than for LDP/STP instructions. */
5889 if (TARGET_FLOAT && rclass == GENERAL_REGS
5890 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5891 return FP_REGS;
5893 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5894 return GENERAL_REGS;
5896 return NO_REGS;
5899 static bool
5900 aarch64_can_eliminate (const int from, const int to)
5902 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5903 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5905 if (frame_pointer_needed)
5907 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5908 return true;
5909 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5910 return false;
5911 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5912 && !cfun->calls_alloca)
5913 return true;
5914 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5915 return true;
5917 return false;
5919 else
5921 /* If we decided that we didn't need a leaf frame pointer but then used
5922 LR in the function, then we'll want a frame pointer after all, so
5923 prevent this elimination to ensure a frame pointer is used. */
5924 if (to == STACK_POINTER_REGNUM
5925 && flag_omit_leaf_frame_pointer
5926 && df_regs_ever_live_p (LR_REGNUM))
5927 return false;
5930 return true;
5933 HOST_WIDE_INT
5934 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5936 aarch64_layout_frame ();
5938 if (to == HARD_FRAME_POINTER_REGNUM)
5940 if (from == ARG_POINTER_REGNUM)
5941 return cfun->machine->frame.hard_fp_offset;
5943 if (from == FRAME_POINTER_REGNUM)
5944 return cfun->machine->frame.hard_fp_offset
5945 - cfun->machine->frame.locals_offset;
5948 if (to == STACK_POINTER_REGNUM)
5950 if (from == FRAME_POINTER_REGNUM)
5951 return cfun->machine->frame.frame_size
5952 - cfun->machine->frame.locals_offset;
5955 return cfun->machine->frame.frame_size;
5958 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5959 previous frame. */
5962 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5964 if (count != 0)
5965 return const0_rtx;
5966 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5970 static void
5971 aarch64_asm_trampoline_template (FILE *f)
5973 if (TARGET_ILP32)
5975 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5976 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5978 else
5980 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5981 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5983 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5984 assemble_aligned_integer (4, const0_rtx);
5985 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5986 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
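/* Illustrative trampoline layout in the LP64 case (editor's sketch, not
   part of the original source):
       offset  0:  ldr  <IP1>, .+16          ; load the target address
       offset  4:  ldr  <chain reg>, .+20    ; load the static chain value
       offset  8:  br   <IP1>
       offset 12:  <4 bytes of zero padding>
       offset 16:  <function address>        ; written by aarch64_trampoline_init
       offset 24:  <static chain value>      ; written by aarch64_trampoline_init  */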
5989 static void
5990 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5992 rtx fnaddr, mem, a_tramp;
5993 const int tramp_code_sz = 16;
5995 /* Don't need to copy the trailing D-words; we fill those in below. */
5996 emit_block_move (m_tramp, assemble_trampoline_template (),
5997 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5998 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5999 fnaddr = XEXP (DECL_RTL (fndecl), 0);
6000 if (GET_MODE (fnaddr) != ptr_mode)
6001 fnaddr = convert_memory_address (ptr_mode, fnaddr);
6002 emit_move_insn (mem, fnaddr);
6004 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
6005 emit_move_insn (mem, chain_value);
6007 /* XXX We should really define a "clear_cache" pattern and use
6008 gen_clear_cache(). */
6009 a_tramp = XEXP (m_tramp, 0);
6010 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
6011 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6012 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6013 ptr_mode);
6016 static unsigned char
6017 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6019 switch (regclass)
6021 case CALLER_SAVE_REGS:
6022 case POINTER_REGS:
6023 case GENERAL_REGS:
6024 case ALL_REGS:
6025 case POINTER_AND_FP_REGS:
6026 case FP_REGS:
6027 case FP_LO_REGS:
6028 return
6029 aarch64_vector_mode_p (mode)
6030 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6031 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6032 case STACK_REG:
6033 return 1;
6035 case NO_REGS:
6036 return 0;
6038 default:
6039 break;
6041 gcc_unreachable ();
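/* Worked examples (editor's addition): for V4SImode, a 16-byte vector
   mode, the result is (16 + UNITS_PER_VREG - 1) / UNITS_PER_VREG = 1
   vector register (UNITS_PER_VREG == 16); for TImode in GENERAL_REGS the
   result is (16 + UNITS_PER_WORD - 1) / UNITS_PER_WORD = 2 X-registers
   (UNITS_PER_WORD == 8).  */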
6044 static reg_class_t
6045 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6047 if (regclass == POINTER_REGS)
6048 return GENERAL_REGS;
6050 if (regclass == STACK_REG)
6052 if (REG_P(x)
6053 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6054 return regclass;
6056 return NO_REGS;
6059 /* Register elimination can result in a request for
6060 SP+constant->FP_REGS. We cannot support such operations, which
6061 use SP as source and an FP_REG as destination, so reject them
6062 right away. */
6063 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6065 rtx lhs = XEXP (x, 0);
6067 /* Look through a possible SUBREG introduced by ILP32. */
6068 if (GET_CODE (lhs) == SUBREG)
6069 lhs = SUBREG_REG (lhs);
6071 gcc_assert (REG_P (lhs));
6072 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6073 POINTER_REGS));
6074 return NO_REGS;
6077 return regclass;
6080 void
6081 aarch64_asm_output_labelref (FILE* f, const char *name)
6083 asm_fprintf (f, "%U%s", name);
6086 static void
6087 aarch64_elf_asm_constructor (rtx symbol, int priority)
6089 if (priority == DEFAULT_INIT_PRIORITY)
6090 default_ctor_section_asm_out_constructor (symbol, priority);
6091 else
6093 section *s;
6094 /* The priority is known to be in the range [0, 65535], so 18 bytes
6095 would be enough, but the compiler might not know that. To avoid
6096 a -Wformat-truncation false positive, use a larger size. */
6097 char buf[23];
6098 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6099 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6100 switch_to_section (s);
6101 assemble_align (POINTER_SIZE);
6102 assemble_aligned_integer (POINTER_BYTES, symbol);
6106 static void
6107 aarch64_elf_asm_destructor (rtx symbol, int priority)
6109 if (priority == DEFAULT_INIT_PRIORITY)
6110 default_dtor_section_asm_out_destructor (symbol, priority);
6111 else
6113 section *s;
6114 /* The priority is known to be in the range [0, 65535], so 18 bytes
6115 would be enough, but the compiler might not know that. To avoid
6116 a -Wformat-truncation false positive, use a larger size. */
6117 char buf[23];
6118 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6119 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6120 switch_to_section (s);
6121 assemble_align (POINTER_SIZE);
6122 assemble_aligned_integer (POINTER_BYTES, symbol);
6126 const char*
6127 aarch64_output_casesi (rtx *operands)
6129 char buf[100];
6130 char label[100];
6131 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6132 int index;
6133 static const char *const patterns[4][2] =
6136 "ldrb\t%w3, [%0,%w1,uxtw]",
6137 "add\t%3, %4, %w3, sxtb #2"
6140 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6141 "add\t%3, %4, %w3, sxth #2"
6144 "ldr\t%w3, [%0,%w1,uxtw #2]",
6145 "add\t%3, %4, %w3, sxtw #2"
6147 /* We assume that DImode is only generated when not optimizing and
6148 that we don't really need 64-bit address offsets. That would
6149 imply an object file with 8GB of code in a single function! */
6151 "ldr\t%w3, [%0,%w1,uxtw #2]",
6152 "add\t%3, %4, %w3, sxtw #2"
6156 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6158 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6159 index = exact_log2 (GET_MODE_SIZE (mode));
6161 gcc_assert (index >= 0 && index <= 3);
6163 /* Need to implement table size reduction by changing the code below. */
6164 output_asm_insn (patterns[index][0], operands);
6165 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6166 snprintf (buf, sizeof (buf),
6167 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6168 output_asm_insn (buf, operands);
6169 output_asm_insn (patterns[index][1], operands);
6170 output_asm_insn ("br\t%3", operands);
6171 assemble_label (asm_out_file, label);
6172 return "";
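/* Illustrative output for a 2-byte (HImode) difference table, with
   operands shown symbolically (editor's sketch, not generated verbatim):
       ldrh  %w3, [%0, %w1, uxtw #1]   ; load the table entry
       adr   %4, .LrtxN                ; address of the table label
       add   %3, %4, %w3, sxth #2      ; entries hold (target - .LrtxN) / 4
       br    %3
     .LrtxN:                           ; the ADDR_DIFF_VEC data follows  */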
6176 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6177 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6178 operator. */
6181 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6183 if (shift >= 0 && shift <= 3)
6185 int size;
6186 for (size = 8; size <= 32; size *= 2)
6188 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6189 if (mask == bits << shift)
6190 return size;
6193 return 0;
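/* Examples (editor's addition): aarch64_uxt_size (1, 0x1fe) returns 8
   because 0xff << 1 == 0x1fe, matching a UXTB of the shifted operand;
   aarch64_uxt_size (0, 0xffff) returns 16 (UXTH); and
   aarch64_uxt_size (2, 0xff) returns 0 because no 8/16/32-bit mask
   shifted left by 2 equals 0xff.  */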
6196 /* Constant pools are per-function only when PC-relative
6197 literal loads are enabled or we are in the large memory
6198 model. */
6200 static inline bool
6201 aarch64_can_use_per_function_literal_pools_p (void)
6203 return (aarch64_pcrelative_literal_loads
6204 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6207 static bool
6208 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6210 /* FIXME: In an ideal world this would work similarly
6211 to the logic in aarch64_select_rtx_section, but this
6212 breaks bootstrap in gccgo. For now we work around
6213 this by returning false here. */
6214 return false;
6217 /* Select appropriate section for constants depending
6218 on where we place literal pools. */
6220 static section *
6221 aarch64_select_rtx_section (machine_mode mode,
6222 rtx x,
6223 unsigned HOST_WIDE_INT align)
6225 if (aarch64_can_use_per_function_literal_pools_p ())
6226 return function_section (current_function_decl);
6228 return default_elf_select_rtx_section (mode, x, align);
6231 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6232 void
6233 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6234 HOST_WIDE_INT offset)
6236 /* When using per-function literal pools, we must ensure that any code
6237 section is aligned to the minimal instruction length, lest we get
6238 errors from the assembler re "unaligned instructions". */
6239 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6240 ASM_OUTPUT_ALIGN (f, 2);
6243 /* Costs. */
6245 /* Helper function for rtx cost calculation. Strip a shift expression
6246 from X. Returns the inner operand if successful, or the original
6247 expression on failure. */
6248 static rtx
6249 aarch64_strip_shift (rtx x)
6251 rtx op = x;
6253 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6254 we can convert both to ROR during final output. */
6255 if ((GET_CODE (op) == ASHIFT
6256 || GET_CODE (op) == ASHIFTRT
6257 || GET_CODE (op) == LSHIFTRT
6258 || GET_CODE (op) == ROTATERT
6259 || GET_CODE (op) == ROTATE)
6260 && CONST_INT_P (XEXP (op, 1)))
6261 return XEXP (op, 0);
6263 if (GET_CODE (op) == MULT
6264 && CONST_INT_P (XEXP (op, 1))
6265 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6266 return XEXP (op, 0);
6268 return x;
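/* Example (editor's addition): aarch64_strip_shift applied to
   (mult:DI (reg:DI x1) (const_int 8)) returns (reg:DI x1), since
   multiplying by 8 is a left shift by 3; applied to a shift whose
   amount is not a CONST_INT it returns the original expression.  */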
6271 /* Helper function for rtx cost calculation. Strip an extend
6272 expression from X. Returns the inner operand if successful, or the
6273 original expression on failure. We deal with a number of possible
6274 canonicalization variations here. If STRIP_SHIFT is true, then
6275 we can strip off a shift also. */
6276 static rtx
6277 aarch64_strip_extend (rtx x, bool strip_shift)
6279 scalar_int_mode mode;
6280 rtx op = x;
6282 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6283 return op;
6285 /* Zero and sign extraction of a widened value. */
6286 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6287 && XEXP (op, 2) == const0_rtx
6288 && GET_CODE (XEXP (op, 0)) == MULT
6289 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6290 XEXP (op, 1)))
6291 return XEXP (XEXP (op, 0), 0);
6293 /* It can also be represented (for zero-extend) as an AND with an
6294 immediate. */
6295 if (GET_CODE (op) == AND
6296 && GET_CODE (XEXP (op, 0)) == MULT
6297 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6298 && CONST_INT_P (XEXP (op, 1))
6299 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6300 INTVAL (XEXP (op, 1))) != 0)
6301 return XEXP (XEXP (op, 0), 0);
6303 /* Now handle extended register, as this may also have an optional
6304 left shift by 1..4. */
6305 if (strip_shift
6306 && GET_CODE (op) == ASHIFT
6307 && CONST_INT_P (XEXP (op, 1))
6308 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6309 op = XEXP (op, 0);
6311 if (GET_CODE (op) == ZERO_EXTEND
6312 || GET_CODE (op) == SIGN_EXTEND)
6313 op = XEXP (op, 0);
6315 if (op != x)
6316 return op;
6318 return x;
6321 /* Return true iff CODE is a shift supported in combination
6322 with arithmetic instructions. */
6324 static bool
6325 aarch64_shift_p (enum rtx_code code)
6327 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6331 /* Return true iff X is a cheap shift without a sign extend. */
6333 static bool
6334 aarch64_cheap_mult_shift_p (rtx x)
6336 rtx op0, op1;
6338 op0 = XEXP (x, 0);
6339 op1 = XEXP (x, 1);
6341 if (!(aarch64_tune_params.extra_tuning_flags
6342 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6343 return false;
6345 if (GET_CODE (op0) == SIGN_EXTEND)
6346 return false;
6348 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6349 && UINTVAL (op1) <= 4)
6350 return true;
6352 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6353 return false;
6355 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6357 if (l2 > 0 && l2 <= 4)
6358 return true;
6360 return false;
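/* Example (editor's addition): with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
   set, (ashift (reg) (const_int 3)) counts as a cheap shift, whereas
   (ashift (sign_extend (reg)) (const_int 3)) does not because of the
   sign extension, and (mult (reg) (const_int 32)) does not because
   log2 (32) == 5 exceeds the limit of 4.  */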
6363 /* Helper function for rtx cost calculation. Calculate the cost of
6364 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6365 Return the calculated cost of the expression, recursing manually in to
6366 operands where needed. */
6368 static int
6369 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6371 rtx op0, op1;
6372 const struct cpu_cost_table *extra_cost
6373 = aarch64_tune_params.insn_extra_cost;
6374 int cost = 0;
6375 bool compound_p = (outer == PLUS || outer == MINUS);
6376 machine_mode mode = GET_MODE (x);
6378 gcc_checking_assert (code == MULT);
6380 op0 = XEXP (x, 0);
6381 op1 = XEXP (x, 1);
6383 if (VECTOR_MODE_P (mode))
6384 mode = GET_MODE_INNER (mode);
6386 /* Integer multiply/fma. */
6387 if (GET_MODE_CLASS (mode) == MODE_INT)
6389 /* The multiply will be canonicalized as a shift, cost it as such. */
6390 if (aarch64_shift_p (GET_CODE (x))
6391 || (CONST_INT_P (op1)
6392 && exact_log2 (INTVAL (op1)) > 0))
6394 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6395 || GET_CODE (op0) == SIGN_EXTEND;
6396 if (speed)
6398 if (compound_p)
6400 /* If the shift is considered cheap,
6401 then don't add any cost. */
6402 if (aarch64_cheap_mult_shift_p (x))
6404 else if (REG_P (op1))
6405 /* ARITH + shift-by-register. */
6406 cost += extra_cost->alu.arith_shift_reg;
6407 else if (is_extend)
6408 /* ARITH + extended register. We don't have a cost field
6409 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6410 cost += extra_cost->alu.extend_arith;
6411 else
6412 /* ARITH + shift-by-immediate. */
6413 cost += extra_cost->alu.arith_shift;
6415 else
6416 /* LSL (immediate). */
6417 cost += extra_cost->alu.shift;
6420 /* Strip extends as we will have costed them in the case above. */
6421 if (is_extend)
6422 op0 = aarch64_strip_extend (op0, true);
6424 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6426 return cost;
6429 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6430 compound and let the below cases handle it. After all, MNEG is a
6431 special-case alias of MSUB. */
6432 if (GET_CODE (op0) == NEG)
6434 op0 = XEXP (op0, 0);
6435 compound_p = true;
6438 /* Integer multiplies or FMAs have zero/sign extending variants. */
6439 if ((GET_CODE (op0) == ZERO_EXTEND
6440 && GET_CODE (op1) == ZERO_EXTEND)
6441 || (GET_CODE (op0) == SIGN_EXTEND
6442 && GET_CODE (op1) == SIGN_EXTEND))
6444 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6445 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6447 if (speed)
6449 if (compound_p)
6450 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6451 cost += extra_cost->mult[0].extend_add;
6452 else
6453 /* MUL/SMULL/UMULL. */
6454 cost += extra_cost->mult[0].extend;
6457 return cost;
6460 /* This is either an integer multiply or a MADD. In both cases
6461 we want to recurse and cost the operands. */
6462 cost += rtx_cost (op0, mode, MULT, 0, speed);
6463 cost += rtx_cost (op1, mode, MULT, 1, speed);
6465 if (speed)
6467 if (compound_p)
6468 /* MADD/MSUB. */
6469 cost += extra_cost->mult[mode == DImode].add;
6470 else
6471 /* MUL. */
6472 cost += extra_cost->mult[mode == DImode].simple;
6475 return cost;
6477 else
6479 if (speed)
6481 /* Floating-point FMA/FMUL can also support negations of the
6482 operands, unless the rounding mode is upward or downward, in
6483 which case FNMUL differs from FMUL with operand negation. */
6484 bool neg0 = GET_CODE (op0) == NEG;
6485 bool neg1 = GET_CODE (op1) == NEG;
6486 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6488 if (neg0)
6489 op0 = XEXP (op0, 0);
6490 if (neg1)
6491 op1 = XEXP (op1, 0);
6494 if (compound_p)
6495 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6496 cost += extra_cost->fp[mode == DFmode].fma;
6497 else
6498 /* FMUL/FNMUL. */
6499 cost += extra_cost->fp[mode == DFmode].mult;
6502 cost += rtx_cost (op0, mode, MULT, 0, speed);
6503 cost += rtx_cost (op1, mode, MULT, 1, speed);
6504 return cost;
6508 static int
6509 aarch64_address_cost (rtx x,
6510 machine_mode mode,
6511 addr_space_t as ATTRIBUTE_UNUSED,
6512 bool speed)
6514 enum rtx_code c = GET_CODE (x);
6515 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6516 struct aarch64_address_info info;
6517 int cost = 0;
6518 info.shift = 0;
6520 if (!aarch64_classify_address (&info, x, mode, c, false))
6522 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6524 /* This is a CONST or SYMBOL ref which will be split
6525 in a different way depending on the code model in use.
6526 Cost it through the generic infrastructure. */
6527 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6528 /* Divide through by the cost of one instruction to
6529 bring it to the same units as the address costs. */
6530 cost_symbol_ref /= COSTS_N_INSNS (1);
6531 /* The cost is then the cost of preparing the address,
6532 followed by an immediate (possibly 0) offset. */
6533 return cost_symbol_ref + addr_cost->imm_offset;
6535 else
6537 /* This is most likely a jump table from a case
6538 statement. */
6539 return addr_cost->register_offset;
6543 switch (info.type)
6545 case ADDRESS_LO_SUM:
6546 case ADDRESS_SYMBOLIC:
6547 case ADDRESS_REG_IMM:
6548 cost += addr_cost->imm_offset;
6549 break;
6551 case ADDRESS_REG_WB:
6552 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6553 cost += addr_cost->pre_modify;
6554 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6555 cost += addr_cost->post_modify;
6556 else
6557 gcc_unreachable ();
6559 break;
6561 case ADDRESS_REG_REG:
6562 cost += addr_cost->register_offset;
6563 break;
6565 case ADDRESS_REG_SXTW:
6566 cost += addr_cost->register_sextend;
6567 break;
6569 case ADDRESS_REG_UXTW:
6570 cost += addr_cost->register_zextend;
6571 break;
6573 default:
6574 gcc_unreachable ();
6578 if (info.shift > 0)
6580 /* For the sake of calculating the cost of the shifted register
6581 component, we can treat same sized modes in the same way. */
6582 switch (GET_MODE_BITSIZE (mode))
6584 case 16:
6585 cost += addr_cost->addr_scale_costs.hi;
6586 break;
6588 case 32:
6589 cost += addr_cost->addr_scale_costs.si;
6590 break;
6592 case 64:
6593 cost += addr_cost->addr_scale_costs.di;
6594 break;
6596 /* We can't tell, or this is a 128-bit vector. */
6597 default:
6598 cost += addr_cost->addr_scale_costs.ti;
6599 break;
6603 return cost;
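/* Examples (editor's addition): a DImode access at address
   (plus (reg) (const_int 16)) classifies as ADDRESS_REG_IMM and costs
   addr_cost->imm_offset; (plus (reg) (reg)) classifies as
   ADDRESS_REG_REG and costs addr_cost->register_offset, plus a scaling
   cost from addr_cost->addr_scale_costs when the index is shifted.  */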
6606 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6607 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6608 to be taken. */
6611 aarch64_branch_cost (bool speed_p, bool predictable_p)
6613 /* When optimizing for speed, use the cost of unpredictable branches. */
6614 const struct cpu_branch_cost *branch_costs =
6615 aarch64_tune_params.branch_costs;
6617 if (!speed_p || predictable_p)
6618 return branch_costs->predictable;
6619 else
6620 return branch_costs->unpredictable;
6623 /* Return true if the RTX X in mode MODE is a zero or sign extract
6624 usable in an ADD or SUB (extended register) instruction. */
6625 static bool
6626 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6628 /* Catch add with a sign extract.
6629 This is add_<optab><mode>_multp2. */
6630 if (GET_CODE (x) == SIGN_EXTRACT
6631 || GET_CODE (x) == ZERO_EXTRACT)
6633 rtx op0 = XEXP (x, 0);
6634 rtx op1 = XEXP (x, 1);
6635 rtx op2 = XEXP (x, 2);
6637 if (GET_CODE (op0) == MULT
6638 && CONST_INT_P (op1)
6639 && op2 == const0_rtx
6640 && CONST_INT_P (XEXP (op0, 1))
6641 && aarch64_is_extend_from_extract (mode,
6642 XEXP (op0, 1),
6643 op1))
6645 return true;
6648 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6649 No shift. */
6650 else if (GET_CODE (x) == SIGN_EXTEND
6651 || GET_CODE (x) == ZERO_EXTEND)
6652 return REG_P (XEXP (x, 0));
6654 return false;
6657 static bool
6658 aarch64_frint_unspec_p (unsigned int u)
6660 switch (u)
6662 case UNSPEC_FRINTZ:
6663 case UNSPEC_FRINTP:
6664 case UNSPEC_FRINTM:
6665 case UNSPEC_FRINTA:
6666 case UNSPEC_FRINTN:
6667 case UNSPEC_FRINTX:
6668 case UNSPEC_FRINTI:
6669 return true;
6671 default:
6672 return false;
6676 /* Return true iff X is an rtx that will match an extr instruction
6677 i.e. as described in the *extr<mode>5_insn family of patterns.
6678 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
6679 on success and will be NULL_RTX otherwise. */
6681 static bool
6682 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6684 rtx op0, op1;
6685 scalar_int_mode mode;
6686 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6687 return false;
6689 *res_op0 = NULL_RTX;
6690 *res_op1 = NULL_RTX;
6692 if (GET_CODE (x) != IOR)
6693 return false;
6695 op0 = XEXP (x, 0);
6696 op1 = XEXP (x, 1);
6698 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6699 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6701 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6702 if (GET_CODE (op1) == ASHIFT)
6703 std::swap (op0, op1);
6705 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6706 return false;
6708 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6709 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6711 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6712 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6714 *res_op0 = XEXP (op0, 0);
6715 *res_op1 = XEXP (op1, 0);
6716 return true;
6720 return false;
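/* Example (editor's addition): in DImode,
     (ior (ashift (reg A) (const_int 48)) (lshiftrt (reg B) (const_int 16)))
   matches because 48 + 16 == 64; *RES_OP0 is set to A and *RES_OP1 to B,
   corresponding roughly to EXTR Xd, Xa, Xb, #16.  */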
6723 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6724 storing it in *COST. Result is true if the total cost of the operation
6725 has now been calculated. */
6726 static bool
6727 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6729 rtx inner;
6730 rtx comparator;
6731 enum rtx_code cmpcode;
6733 if (COMPARISON_P (op0))
6735 inner = XEXP (op0, 0);
6736 comparator = XEXP (op0, 1);
6737 cmpcode = GET_CODE (op0);
6739 else
6741 inner = op0;
6742 comparator = const0_rtx;
6743 cmpcode = NE;
6746 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6748 /* Conditional branch. */
6749 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6750 return true;
6751 else
6753 if (cmpcode == NE || cmpcode == EQ)
6755 if (comparator == const0_rtx)
6757 /* TBZ/TBNZ/CBZ/CBNZ. */
6758 if (GET_CODE (inner) == ZERO_EXTRACT)
6759 /* TBZ/TBNZ. */
6760 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6761 ZERO_EXTRACT, 0, speed);
6762 else
6763 /* CBZ/CBNZ. */
6764 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6766 return true;
6769 else if (cmpcode == LT || cmpcode == GE)
6771 /* TBZ/TBNZ. */
6772 if (comparator == const0_rtx)
6773 return true;
6777 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6779 /* CCMP. */
6780 if (GET_CODE (op1) == COMPARE)
6782 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6783 if (XEXP (op1, 1) == const0_rtx)
6784 *cost += 1;
6785 if (speed)
6787 machine_mode mode = GET_MODE (XEXP (op1, 0));
6788 const struct cpu_cost_table *extra_cost
6789 = aarch64_tune_params.insn_extra_cost;
6791 if (GET_MODE_CLASS (mode) == MODE_INT)
6792 *cost += extra_cost->alu.arith;
6793 else
6794 *cost += extra_cost->fp[mode == DFmode].compare;
6796 return true;
6799 /* It's a conditional operation based on the status flags,
6800 so it must be some flavor of CSEL. */
6802 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6803 if (GET_CODE (op1) == NEG
6804 || GET_CODE (op1) == NOT
6805 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6806 op1 = XEXP (op1, 0);
6807 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6809 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6810 op1 = XEXP (op1, 0);
6811 op2 = XEXP (op2, 0);
6814 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6815 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6816 return true;
6819 /* We don't know what this is, cost all operands. */
6820 return false;
6823 /* Check whether X is a bitfield operation of the form shift + extend that
6824 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6825 operand to which the bitfield operation is applied. Otherwise return
6826 NULL_RTX. */
6828 static rtx
6829 aarch64_extend_bitfield_pattern_p (rtx x)
6831 rtx_code outer_code = GET_CODE (x);
6832 machine_mode outer_mode = GET_MODE (x);
6834 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6835 && outer_mode != SImode && outer_mode != DImode)
6836 return NULL_RTX;
6838 rtx inner = XEXP (x, 0);
6839 rtx_code inner_code = GET_CODE (inner);
6840 machine_mode inner_mode = GET_MODE (inner);
6841 rtx op = NULL_RTX;
6843 switch (inner_code)
6845 case ASHIFT:
6846 if (CONST_INT_P (XEXP (inner, 1))
6847 && (inner_mode == QImode || inner_mode == HImode))
6848 op = XEXP (inner, 0);
6849 break;
6850 case LSHIFTRT:
6851 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6852 && (inner_mode == QImode || inner_mode == HImode))
6853 op = XEXP (inner, 0);
6854 break;
6855 case ASHIFTRT:
6856 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6857 && (inner_mode == QImode || inner_mode == HImode))
6858 op = XEXP (inner, 0);
6859 break;
6860 default:
6861 break;
6864 return op;
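/* Examples (editor's addition):
   (zero_extend:SI (lshiftrt:HI (reg:HI R) (const_int 3))) returns R,
   matching a UBFX, while
   (sign_extend:DI (ashift:QI (reg:QI R) (const_int 2))) returns R,
   matching an SBFIZ.  */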
6867 /* Return true if the mask and a shift amount from an RTX of the form
6868 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6869 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6871 bool
6872 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6873 rtx shft_amnt)
6875 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6876 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6877 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6878 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
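/* Example (editor's addition): with mode SImode, MASK 0xff00 and
   SHFT_AMNT 8, the predicate is true: (0xff00 >> 8) + 1 == 0x100 is a
   power of two and the low 8 mask bits are clear, so
   (x << 8) & 0xff00 maps to UBFIZ wd, wn, #8, #8.  */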
6881 /* Calculate the cost of calculating X, storing it in *COST. Result
6882 is true if the total cost of the operation has now been calculated. */
6883 static bool
6884 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6885 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6887 rtx op0, op1, op2;
6888 const struct cpu_cost_table *extra_cost
6889 = aarch64_tune_params.insn_extra_cost;
6890 int code = GET_CODE (x);
6891 scalar_int_mode int_mode;
6893 /* By default, assume that everything has equivalent cost to the
6894 cheapest instruction. Any additional costs are applied as a delta
6895 above this default. */
6896 *cost = COSTS_N_INSNS (1);
6898 switch (code)
6900 case SET:
6901 /* The cost depends entirely on the operands to SET. */
6902 *cost = 0;
6903 op0 = SET_DEST (x);
6904 op1 = SET_SRC (x);
6906 switch (GET_CODE (op0))
6908 case MEM:
6909 if (speed)
6911 rtx address = XEXP (op0, 0);
6912 if (VECTOR_MODE_P (mode))
6913 *cost += extra_cost->ldst.storev;
6914 else if (GET_MODE_CLASS (mode) == MODE_INT)
6915 *cost += extra_cost->ldst.store;
6916 else if (mode == SFmode)
6917 *cost += extra_cost->ldst.storef;
6918 else if (mode == DFmode)
6919 *cost += extra_cost->ldst.stored;
6921 *cost +=
6922 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6923 0, speed));
6926 *cost += rtx_cost (op1, mode, SET, 1, speed);
6927 return true;
6929 case SUBREG:
6930 if (! REG_P (SUBREG_REG (op0)))
6931 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6933 /* Fall through. */
6934 case REG:
6935 /* The cost is one per vector-register copied. */
6936 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6938 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6939 / GET_MODE_SIZE (V4SImode);
6940 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6942 /* const0_rtx is in general free, but we will use an
6943 instruction to set a register to 0. */
6944 else if (REG_P (op1) || op1 == const0_rtx)
6946 /* The cost is 1 per register copied. */
6947 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6948 / UNITS_PER_WORD;
6949 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6951 else
6952 /* Cost is just the cost of the RHS of the set. */
6953 *cost += rtx_cost (op1, mode, SET, 1, speed);
6954 return true;
6956 case ZERO_EXTRACT:
6957 case SIGN_EXTRACT:
6958 /* Bit-field insertion. Strip any redundant widening of
6959 the RHS to meet the width of the target. */
6960 if (GET_CODE (op1) == SUBREG)
6961 op1 = SUBREG_REG (op1);
6962 if ((GET_CODE (op1) == ZERO_EXTEND
6963 || GET_CODE (op1) == SIGN_EXTEND)
6964 && CONST_INT_P (XEXP (op0, 1))
6965 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6966 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6967 op1 = XEXP (op1, 0);
6969 if (CONST_INT_P (op1))
6971 /* MOV immediate is assumed to always be cheap. */
6972 *cost = COSTS_N_INSNS (1);
6974 else
6976 /* BFM. */
6977 if (speed)
6978 *cost += extra_cost->alu.bfi;
6979 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6982 return true;
6984 default:
6985 /* We can't make sense of this, assume default cost. */
6986 *cost = COSTS_N_INSNS (1);
6987 return false;
6989 return false;
6991 case CONST_INT:
6992 /* If an instruction can incorporate a constant within the
6993 instruction, the instruction's expression avoids calling
6994 rtx_cost() on the constant. If rtx_cost() is called on a
6995 constant, then it is usually because the constant must be
6996 moved into a register by one or more instructions.
6998 The exception is constant 0, which can be expressed
6999 as XZR/WZR and is therefore free. The exception to this is
7000 if we have (set (reg) (const0_rtx)) in which case we must cost
7001 the move. However, we can catch that when we cost the SET, so
7002 we don't need to consider that here. */
7003 if (x == const0_rtx)
7004 *cost = 0;
7005 else
7007 /* To a first approximation, the cost of building any other
7008 constant is proportional to the number of instructions
7009 required to build that constant. This is true whether we
7010 are compiling for SPEED or otherwise. */
7011 if (!is_a <scalar_int_mode> (mode, &int_mode))
7012 int_mode = word_mode;
7013 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7014 (NULL_RTX, x, false, int_mode));
7016 return true;
7018 case CONST_DOUBLE:
7020 /* First determine number of instructions to do the move
7021 as an integer constant. */
7022 if (!aarch64_float_const_representable_p (x)
7023 && !aarch64_can_const_movi_rtx_p (x, mode)
7024 && aarch64_float_const_rtx_p (x))
7026 unsigned HOST_WIDE_INT ival;
7027 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7028 gcc_assert (succeed);
7030 scalar_int_mode imode = (mode == HFmode
7031 ? SImode
7032 : int_mode_for_mode (mode).require ());
7033 int ncost = aarch64_internal_mov_immediate
7034 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7035 *cost += COSTS_N_INSNS (ncost);
7036 return true;
7039 if (speed)
7041 /* mov[df,sf]_aarch64. */
7042 if (aarch64_float_const_representable_p (x))
7043 /* FMOV (scalar immediate). */
7044 *cost += extra_cost->fp[mode == DFmode].fpconst;
7045 else if (!aarch64_float_const_zero_rtx_p (x))
7047 /* This will be a load from memory. */
7048 if (mode == DFmode)
7049 *cost += extra_cost->ldst.loadd;
7050 else
7051 *cost += extra_cost->ldst.loadf;
7053 else
7054 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7055 or MOV v0.s[0], wzr - neither of which is modelled by the
7056 cost tables. Just use the default cost. */
7061 return true;
7063 case MEM:
7064 if (speed)
7066 /* For loads we want the base cost of a load, plus an
7067 approximation for the additional cost of the addressing
7068 mode. */
7069 rtx address = XEXP (x, 0);
7070 if (VECTOR_MODE_P (mode))
7071 *cost += extra_cost->ldst.loadv;
7072 else if (GET_MODE_CLASS (mode) == MODE_INT)
7073 *cost += extra_cost->ldst.load;
7074 else if (mode == SFmode)
7075 *cost += extra_cost->ldst.loadf;
7076 else if (mode == DFmode)
7077 *cost += extra_cost->ldst.loadd;
7079 *cost +=
7080 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7081 0, speed));
7084 return true;
7086 case NEG:
7087 op0 = XEXP (x, 0);
7089 if (VECTOR_MODE_P (mode))
7091 if (speed)
7093 /* FNEG. */
7094 *cost += extra_cost->vect.alu;
7096 return false;
7099 if (GET_MODE_CLASS (mode) == MODE_INT)
7101 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7102 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7104 /* CSETM. */
7105 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7106 return true;
7109 /* Cost this as SUB wzr, X. */
7110 op0 = CONST0_RTX (mode);
7111 op1 = XEXP (x, 0);
7112 goto cost_minus;
7115 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7117 /* Support (neg(fma...)) as a single instruction only if
7118 sign of zeros is unimportant. This matches the decision
7119 making in aarch64.md. */
7120 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7122 /* FNMADD. */
7123 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7124 return true;
7126 if (GET_CODE (op0) == MULT)
7128 /* FNMUL. */
7129 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7130 return true;
7132 if (speed)
7133 /* FNEG. */
7134 *cost += extra_cost->fp[mode == DFmode].neg;
7135 return false;
7138 return false;
7140 case CLRSB:
7141 case CLZ:
7142 if (speed)
7144 if (VECTOR_MODE_P (mode))
7145 *cost += extra_cost->vect.alu;
7146 else
7147 *cost += extra_cost->alu.clz;
7150 return false;
7152 case COMPARE:
7153 op0 = XEXP (x, 0);
7154 op1 = XEXP (x, 1);
7156 if (op1 == const0_rtx
7157 && GET_CODE (op0) == AND)
7159 x = op0;
7160 mode = GET_MODE (op0);
7161 goto cost_logic;
7164 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7166 /* TODO: A write to the CC flags possibly costs extra; this
7167 needs encoding in the cost tables. */
7169 mode = GET_MODE (op0);
7170 /* ANDS. */
7171 if (GET_CODE (op0) == AND)
7173 x = op0;
7174 goto cost_logic;
7177 if (GET_CODE (op0) == PLUS)
7179 /* ADDS (and CMN alias). */
7180 x = op0;
7181 goto cost_plus;
7184 if (GET_CODE (op0) == MINUS)
7186 /* SUBS. */
7187 x = op0;
7188 goto cost_minus;
7191 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7192 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7193 && CONST_INT_P (XEXP (op0, 2)))
7195 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7196 Handle it here directly rather than going to cost_logic
7197 since we know the immediate generated for the TST is valid
7198 so we can avoid creating an intermediate rtx for it only
7199 for costing purposes. */
7200 if (speed)
7201 *cost += extra_cost->alu.logical;
7203 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7204 ZERO_EXTRACT, 0, speed);
7205 return true;
7208 if (GET_CODE (op1) == NEG)
7210 /* CMN. */
7211 if (speed)
7212 *cost += extra_cost->alu.arith;
7214 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7215 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7216 return true;
7219 /* CMP.
7221 Compare can freely swap the order of operands, and
7222 canonicalization puts the more complex operation first.
7223 But the integer MINUS logic expects the shift/extend
7224 operation in op1. */
7225 if (! (REG_P (op0)
7226 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7228 op0 = XEXP (x, 1);
7229 op1 = XEXP (x, 0);
7231 goto cost_minus;
7234 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7236 /* FCMP. */
7237 if (speed)
7238 *cost += extra_cost->fp[mode == DFmode].compare;
7240 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7242 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7243 /* FCMP supports constant 0.0 for no extra cost. */
7244 return true;
7246 return false;
7249 if (VECTOR_MODE_P (mode))
7251 /* Vector compare. */
7252 if (speed)
7253 *cost += extra_cost->vect.alu;
7255 if (aarch64_float_const_zero_rtx_p (op1))
7257 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7258 cost. */
7259 return true;
7261 return false;
7263 return false;
7265 case MINUS:
7267 op0 = XEXP (x, 0);
7268 op1 = XEXP (x, 1);
7270 cost_minus:
7271 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7273 /* Detect valid immediates. */
7274 if ((GET_MODE_CLASS (mode) == MODE_INT
7275 || (GET_MODE_CLASS (mode) == MODE_CC
7276 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7277 && CONST_INT_P (op1)
7278 && aarch64_uimm12_shift (INTVAL (op1)))
7280 if (speed)
7281 /* SUB(S) (immediate). */
7282 *cost += extra_cost->alu.arith;
7283 return true;
7286 /* Look for SUB (extended register). */
7287 if (is_a <scalar_int_mode> (mode, &int_mode)
7288 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7290 if (speed)
7291 *cost += extra_cost->alu.extend_arith;
7293 op1 = aarch64_strip_extend (op1, true);
7294 *cost += rtx_cost (op1, VOIDmode,
7295 (enum rtx_code) GET_CODE (op1), 0, speed);
7296 return true;
7299 rtx new_op1 = aarch64_strip_extend (op1, false);
7301 /* Cost this as an FMA-alike operation. */
7302 if ((GET_CODE (new_op1) == MULT
7303 || aarch64_shift_p (GET_CODE (new_op1)))
7304 && code != COMPARE)
7306 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7307 (enum rtx_code) code,
7308 speed);
7309 return true;
7312 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7314 if (speed)
7316 if (VECTOR_MODE_P (mode))
7318 /* Vector SUB. */
7319 *cost += extra_cost->vect.alu;
7321 else if (GET_MODE_CLASS (mode) == MODE_INT)
7323 /* SUB(S). */
7324 *cost += extra_cost->alu.arith;
7326 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7328 /* FSUB. */
7329 *cost += extra_cost->fp[mode == DFmode].addsub;
7332 return true;
7335 case PLUS:
7337 rtx new_op0;
7339 op0 = XEXP (x, 0);
7340 op1 = XEXP (x, 1);
7342 cost_plus:
7343 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7344 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7346 /* CSINC. */
7347 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7348 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7349 return true;
7352 if (GET_MODE_CLASS (mode) == MODE_INT
7353 && CONST_INT_P (op1)
7354 && aarch64_uimm12_shift (INTVAL (op1)))
7356 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7358 if (speed)
7359 /* ADD (immediate). */
7360 *cost += extra_cost->alu.arith;
7361 return true;
7364 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7366 /* Look for ADD (extended register). */
7367 if (is_a <scalar_int_mode> (mode, &int_mode)
7368 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7370 if (speed)
7371 *cost += extra_cost->alu.extend_arith;
7373 op0 = aarch64_strip_extend (op0, true);
7374 *cost += rtx_cost (op0, VOIDmode,
7375 (enum rtx_code) GET_CODE (op0), 0, speed);
7376 return true;
7379 /* Strip any extend, leave shifts behind as we will
7380 cost them through mult_cost. */
7381 new_op0 = aarch64_strip_extend (op0, false);
7383 if (GET_CODE (new_op0) == MULT
7384 || aarch64_shift_p (GET_CODE (new_op0)))
7386 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7387 speed);
7388 return true;
7391 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7393 if (speed)
7395 if (VECTOR_MODE_P (mode))
7397 /* Vector ADD. */
7398 *cost += extra_cost->vect.alu;
7400 else if (GET_MODE_CLASS (mode) == MODE_INT)
7402 /* ADD. */
7403 *cost += extra_cost->alu.arith;
7405 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7407 /* FADD. */
7408 *cost += extra_cost->fp[mode == DFmode].addsub;
7411 return true;
7414 case BSWAP:
7415 *cost = COSTS_N_INSNS (1);
7417 if (speed)
7419 if (VECTOR_MODE_P (mode))
7420 *cost += extra_cost->vect.alu;
7421 else
7422 *cost += extra_cost->alu.rev;
7424 return false;
7426 case IOR:
7427 if (aarch_rev16_p (x))
7429 *cost = COSTS_N_INSNS (1);
7431 if (speed)
7433 if (VECTOR_MODE_P (mode))
7434 *cost += extra_cost->vect.alu;
7435 else
7436 *cost += extra_cost->alu.rev;
7438 return true;
7441 if (aarch64_extr_rtx_p (x, &op0, &op1))
7443 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7444 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7445 if (speed)
7446 *cost += extra_cost->alu.shift;
7448 return true;
7450 /* Fall through. */
7451 case XOR:
7452 case AND:
7453 cost_logic:
7454 op0 = XEXP (x, 0);
7455 op1 = XEXP (x, 1);
7457 if (VECTOR_MODE_P (mode))
7459 if (speed)
7460 *cost += extra_cost->vect.alu;
7461 return true;
7464 if (code == AND
7465 && GET_CODE (op0) == MULT
7466 && CONST_INT_P (XEXP (op0, 1))
7467 && CONST_INT_P (op1)
7468 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7469 INTVAL (op1)) != 0)
7471 /* This is a UBFM/SBFM. */
7472 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7473 if (speed)
7474 *cost += extra_cost->alu.bfx;
7475 return true;
7478 if (is_int_mode (mode, &int_mode))
7480 if (CONST_INT_P (op1))
7482 /* We have a mask + shift version of a UBFIZ
7483 i.e. the *andim_ashift<mode>_bfiz pattern. */
7484 if (GET_CODE (op0) == ASHIFT
7485 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7486 XEXP (op0, 1)))
7488 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7489 (enum rtx_code) code, 0, speed);
7490 if (speed)
7491 *cost += extra_cost->alu.bfx;
7493 return true;
7495 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7497 /* We may get the immediate for free; this is not
7498 modelled. */
7499 *cost += rtx_cost (op0, int_mode,
7500 (enum rtx_code) code, 0, speed);
7501 if (speed)
7502 *cost += extra_cost->alu.logical;
7504 return true;
7507 else
7509 rtx new_op0 = op0;
7511 /* Handle ORN, EON, or BIC. */
7512 if (GET_CODE (op0) == NOT)
7513 op0 = XEXP (op0, 0);
7515 new_op0 = aarch64_strip_shift (op0);
7517 /* If we had a shift on op0 then this is a logical-shift-
7518 by-register/immediate operation. Otherwise, this is just
7519 a logical operation. */
7520 if (speed)
7522 if (new_op0 != op0)
7524 /* Shift by immediate. */
7525 if (CONST_INT_P (XEXP (op0, 1)))
7526 *cost += extra_cost->alu.log_shift;
7527 else
7528 *cost += extra_cost->alu.log_shift_reg;
7530 else
7531 *cost += extra_cost->alu.logical;
7534 /* In both cases we want to cost both operands. */
7535 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7536 0, speed);
7537 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7538 1, speed);
7540 return true;
7543 return false;
7545 case NOT:
7546 x = XEXP (x, 0);
7547 op0 = aarch64_strip_shift (x);
7549 if (VECTOR_MODE_P (mode))
7551 /* Vector NOT. */
7552 *cost += extra_cost->vect.alu;
7553 return false;
7556 /* MVN-shifted-reg. */
7557 if (op0 != x)
7559 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7561 if (speed)
7562 *cost += extra_cost->alu.log_shift;
7564 return true;
7566 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7567 Handle the second form here taking care that 'a' in the above can
7568 be a shift. */
7569 else if (GET_CODE (op0) == XOR)
7571 rtx newop0 = XEXP (op0, 0);
7572 rtx newop1 = XEXP (op0, 1);
7573 rtx op0_stripped = aarch64_strip_shift (newop0);
7575 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7576 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7578 if (speed)
7580 if (op0_stripped != newop0)
7581 *cost += extra_cost->alu.log_shift;
7582 else
7583 *cost += extra_cost->alu.logical;
7586 return true;
7588 /* MVN. */
7589 if (speed)
7590 *cost += extra_cost->alu.logical;
7592 return false;
7594 case ZERO_EXTEND:
7596 op0 = XEXP (x, 0);
7597 /* If a value is written in SI mode, then zero extended to DI
7598 mode, the operation will in general be free as a write to
7599 a 'w' register implicitly zeroes the upper bits of an 'x'
7600 register. However, if this is
7602 (set (reg) (zero_extend (reg)))
7604 we must cost the explicit register move. */
7605 if (mode == DImode
7606 && GET_MODE (op0) == SImode
7607 && outer == SET)
7609 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7611 /* If OP_COST is non-zero, then the cost of the zero extend
7612 is effectively the cost of the inner operation. Otherwise
7613 we have a MOV instruction and we take the cost from the MOV
7614 itself. This is true independently of whether we are
7615 optimizing for space or time. */
7616 if (op_cost)
7617 *cost = op_cost;
7619 return true;
7621 else if (MEM_P (op0))
7623 /* All loads can zero extend to any size for free. */
7624 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7625 return true;
7628 op0 = aarch64_extend_bitfield_pattern_p (x);
7629 if (op0)
7631 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7632 if (speed)
7633 *cost += extra_cost->alu.bfx;
7634 return true;
7637 if (speed)
7639 if (VECTOR_MODE_P (mode))
7641 /* UMOV. */
7642 *cost += extra_cost->vect.alu;
7644 else
7646 /* We generate an AND instead of UXTB/UXTH. */
7647 *cost += extra_cost->alu.logical;
7650 return false;
7652 case SIGN_EXTEND:
7653 if (MEM_P (XEXP (x, 0)))
7655 /* LDRSH. */
7656 if (speed)
7658 rtx address = XEXP (XEXP (x, 0), 0);
7659 *cost += extra_cost->ldst.load_sign_extend;
7661 *cost +=
7662 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7663 0, speed));
7665 return true;
7668 op0 = aarch64_extend_bitfield_pattern_p (x);
7669 if (op0)
7671 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7672 if (speed)
7673 *cost += extra_cost->alu.bfx;
7674 return true;
7677 if (speed)
7679 if (VECTOR_MODE_P (mode))
7680 *cost += extra_cost->vect.alu;
7681 else
7682 *cost += extra_cost->alu.extend;
7684 return false;
7686 case ASHIFT:
7687 op0 = XEXP (x, 0);
7688 op1 = XEXP (x, 1);
7690 if (CONST_INT_P (op1))
7692 if (speed)
7694 if (VECTOR_MODE_P (mode))
7696 /* Vector shift (immediate). */
7697 *cost += extra_cost->vect.alu;
7699 else
7701 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7702 aliases. */
7703 *cost += extra_cost->alu.shift;
7707 /* We can incorporate zero/sign extend for free. */
7708 if (GET_CODE (op0) == ZERO_EXTEND
7709 || GET_CODE (op0) == SIGN_EXTEND)
7710 op0 = XEXP (op0, 0);
7712 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7713 return true;
7715 else
7717 if (VECTOR_MODE_P (mode))
7719 if (speed)
7720 /* Vector shift (register). */
7721 *cost += extra_cost->vect.alu;
7723 else
7725 if (speed)
7726 /* LSLV. */
7727 *cost += extra_cost->alu.shift_reg;
7729 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7730 && CONST_INT_P (XEXP (op1, 1))
7731 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7733 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7734 /* We already demanded XEXP (op1, 0) to be REG_P, so
7735 don't recurse into it. */
7736 return true;
7739 return false; /* All arguments need to be in registers. */
7742 case ROTATE:
7743 case ROTATERT:
7744 case LSHIFTRT:
7745 case ASHIFTRT:
7746 op0 = XEXP (x, 0);
7747 op1 = XEXP (x, 1);
7749 if (CONST_INT_P (op1))
7751 /* ASR (immediate) and friends. */
7752 if (speed)
7754 if (VECTOR_MODE_P (mode))
7755 *cost += extra_cost->vect.alu;
7756 else
7757 *cost += extra_cost->alu.shift;
7760 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7761 return true;
7763 else
7765 if (VECTOR_MODE_P (mode))
7767 if (speed)
7768 /* Vector shift (register). */
7769 *cost += extra_cost->vect.alu;
7771 else
7773 if (speed)
7774 /* ASR (register) and friends. */
7775 *cost += extra_cost->alu.shift_reg;
7777 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7778 && CONST_INT_P (XEXP (op1, 1))
7779 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7781 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7782 /* We already demanded XEXP (op1, 0) to be REG_P, so
7783 don't recurse into it. */
7784 return true;
7787 return false; /* All arguments need to be in registers. */
7790 case SYMBOL_REF:
7792 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7793 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7795 /* LDR. */
7796 if (speed)
7797 *cost += extra_cost->ldst.load;
7799 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7800 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7802 /* ADRP, followed by ADD. */
7803 *cost += COSTS_N_INSNS (1);
7804 if (speed)
7805 *cost += 2 * extra_cost->alu.arith;
7807 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7808 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7810 /* ADR. */
7811 if (speed)
7812 *cost += extra_cost->alu.arith;
7815 if (flag_pic)
7817 /* One extra load instruction, after accessing the GOT. */
7818 *cost += COSTS_N_INSNS (1);
7819 if (speed)
7820 *cost += extra_cost->ldst.load;
7822 return true;
7824 case HIGH:
7825 case LO_SUM:
7826 /* ADRP/ADD (immediate). */
7827 if (speed)
7828 *cost += extra_cost->alu.arith;
7829 return true;
7831 case ZERO_EXTRACT:
7832 case SIGN_EXTRACT:
7833 /* UBFX/SBFX. */
7834 if (speed)
7836 if (VECTOR_MODE_P (mode))
7837 *cost += extra_cost->vect.alu;
7838 else
7839 *cost += extra_cost->alu.bfx;
7842 /* We can trust that the immediates used will be correct (there
7843 are no by-register forms), so we need only cost op0. */
7844 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7845 return true;
7847 case MULT:
7848 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7849 /* aarch64_rtx_mult_cost always handles recursion to its
7850 operands. */
7851 return true;
7853 case MOD:
7854 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7855 ANDs and a CSNEG. Assume here that CSNEG costs the same as
7856 an unconditional negate. This case should only ever be reached through
7857 the set_smod_pow2_cheap check in expmed.c. */
7858 if (CONST_INT_P (XEXP (x, 1))
7859 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7860 && (mode == SImode || mode == DImode))
7862 /* We expand to 4 instructions. Reset the baseline. */
7863 *cost = COSTS_N_INSNS (4);
7865 if (speed)
7866 *cost += 2 * extra_cost->alu.logical
7867 + 2 * extra_cost->alu.arith;
7869 return true;
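/* Illustrative expansion of x % 4 in SImode (editor's sketch, not
   emitted by this function; the expansion happens in expmed.c):
       negs  w1, w0
       and   w0, w0, 3
       and   w1, w1, 3
       csneg w0, w0, w1, mi
   which is why the baseline is reset to four instructions above.  */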
7872 /* Fall-through. */
7873 case UMOD:
7874 if (speed)
7876 /* Slightly prefer UMOD over SMOD. */
7877 if (VECTOR_MODE_P (mode))
7878 *cost += extra_cost->vect.alu;
7879 else if (GET_MODE_CLASS (mode) == MODE_INT)
7880 *cost += (extra_cost->mult[mode == DImode].add
7881 + extra_cost->mult[mode == DImode].idiv
7882 + (code == MOD ? 1 : 0));
7884 return false; /* All arguments need to be in registers. */
7886 case DIV:
7887 case UDIV:
7888 case SQRT:
7889 if (speed)
7891 if (VECTOR_MODE_P (mode))
7892 *cost += extra_cost->vect.alu;
7893 else if (GET_MODE_CLASS (mode) == MODE_INT)
7894 /* There is no integer SQRT, so only DIV and UDIV can get
7895 here. */
7896 *cost += (extra_cost->mult[mode == DImode].idiv
7897 /* Slightly prefer UDIV over SDIV. */
7898 + (code == DIV ? 1 : 0));
7899 else
7900 *cost += extra_cost->fp[mode == DFmode].div;
7902 return false; /* All arguments need to be in registers. */
7904 case IF_THEN_ELSE:
7905 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7906 XEXP (x, 2), cost, speed);
7908 case EQ:
7909 case NE:
7910 case GT:
7911 case GTU:
7912 case LT:
7913 case LTU:
7914 case GE:
7915 case GEU:
7916 case LE:
7917 case LEU:
7919 return false; /* All arguments must be in registers. */
7921 case FMA:
7922 op0 = XEXP (x, 0);
7923 op1 = XEXP (x, 1);
7924 op2 = XEXP (x, 2);
7926 if (speed)
7928 if (VECTOR_MODE_P (mode))
7929 *cost += extra_cost->vect.alu;
7930 else
7931 *cost += extra_cost->fp[mode == DFmode].fma;
7934 /* FMSUB, FNMADD, and FNMSUB are free. */
7935 if (GET_CODE (op0) == NEG)
7936 op0 = XEXP (op0, 0);
7938 if (GET_CODE (op2) == NEG)
7939 op2 = XEXP (op2, 0);
7941 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7942 and the by-element operand as operand 0. */
7943 if (GET_CODE (op1) == NEG)
7944 op1 = XEXP (op1, 0);
7946 /* Catch vector-by-element operations. The by-element operand can
7947 either be (vec_duplicate (vec_select (x))) or just
7948 (vec_select (x)), depending on whether we are multiplying by
7949 a vector or a scalar.
7951 Canonicalization is not very good in these cases: FMA4 will put the
7952 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7953 if (GET_CODE (op0) == VEC_DUPLICATE)
7954 op0 = XEXP (op0, 0);
7955 else if (GET_CODE (op1) == VEC_DUPLICATE)
7956 op1 = XEXP (op1, 0);
7958 if (GET_CODE (op0) == VEC_SELECT)
7959 op0 = XEXP (op0, 0);
7960 else if (GET_CODE (op1) == VEC_SELECT)
7961 op1 = XEXP (op1, 0);
7963 /* If the remaining parameters are not registers,
7964 get the cost to put them into registers. */
7965 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7966 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7967 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7968 return true;
7970 case FLOAT:
7971 case UNSIGNED_FLOAT:
7972 if (speed)
7973 *cost += extra_cost->fp[mode == DFmode].fromint;
7974 return false;
7976 case FLOAT_EXTEND:
7977 if (speed)
7979 if (VECTOR_MODE_P (mode))
7981 /* Vector widening conversion. */
7982 *cost += extra_cost->vect.alu;
7984 else
7985 *cost += extra_cost->fp[mode == DFmode].widen;
7987 return false;
7989 case FLOAT_TRUNCATE:
7990 if (speed)
7992 if (VECTOR_MODE_P (mode))
7994 /* Vector narrowing conversion. */
7995 *cost += extra_cost->vect.alu;
7997 else
7998 *cost += extra_cost->fp[mode == DFmode].narrow;
8000 return false;
8002 case FIX:
8003 case UNSIGNED_FIX:
8004 x = XEXP (x, 0);
8005 /* Strip the rounding part. They will all be implemented
8006 by the fcvt* family of instructions anyway. */
8007 if (GET_CODE (x) == UNSPEC)
8009 unsigned int uns_code = XINT (x, 1);
8011 if (uns_code == UNSPEC_FRINTA
8012 || uns_code == UNSPEC_FRINTM
8013 || uns_code == UNSPEC_FRINTN
8014 || uns_code == UNSPEC_FRINTP
8015 || uns_code == UNSPEC_FRINTZ)
8016 x = XVECEXP (x, 0, 0);
8019 if (speed)
8021 if (VECTOR_MODE_P (mode))
8022 *cost += extra_cost->vect.alu;
8023 else
8024 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8027 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8028 fixed-point fcvt. */
8029 if (GET_CODE (x) == MULT
8030 && ((VECTOR_MODE_P (mode)
8031 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8032 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8034 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8035 0, speed);
8036 return true;
8039 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8040 return true;
8042 case ABS:
8043 if (VECTOR_MODE_P (mode))
8045 /* ABS (vector). */
8046 if (speed)
8047 *cost += extra_cost->vect.alu;
8049 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8051 op0 = XEXP (x, 0);
8053 /* FABD, which is analogous to FADD. */
8054 if (GET_CODE (op0) == MINUS)
8056 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8057 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8058 if (speed)
8059 *cost += extra_cost->fp[mode == DFmode].addsub;
8061 return true;
8063 /* Simple FABS is analogous to FNEG. */
8064 if (speed)
8065 *cost += extra_cost->fp[mode == DFmode].neg;
8067 else
8069 /* Integer ABS will either be split into
8070 two arithmetic instructions, or will be an ABS
8071 (scalar), which we don't model. */
8072 *cost = COSTS_N_INSNS (2);
8073 if (speed)
8074 *cost += 2 * extra_cost->alu.arith;
8076 return false;
8078 case SMAX:
8079 case SMIN:
8080 if (speed)
8082 if (VECTOR_MODE_P (mode))
8083 *cost += extra_cost->vect.alu;
8084 else
8086 /* FMAXNM/FMINNM/FMAX/FMIN.
8087 TODO: This may not be accurate for all implementations, but
8088 we do not model this in the cost tables. */
8089 *cost += extra_cost->fp[mode == DFmode].addsub;
8092 return false;
8094 case UNSPEC:
8095 /* The floating point round to integer frint* instructions. */
8096 if (aarch64_frint_unspec_p (XINT (x, 1)))
8098 if (speed)
8099 *cost += extra_cost->fp[mode == DFmode].roundint;
8101 return false;
8104 if (XINT (x, 1) == UNSPEC_RBIT)
8106 if (speed)
8107 *cost += extra_cost->alu.rev;
8109 return false;
8111 break;
8113 case TRUNCATE:
8115 /* Decompose <su>muldi3_highpart. */
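/* Roughly speaking, this is the RTL for the high half of a widening
   64x64->128-bit multiply, e.g. something along the lines of
       (long long) (((__int128) a * (__int128) b) >> 64),
   which is implemented with a single UMULH/SMULH instruction. */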
8116 if (/* (truncate:DI */
8117 mode == DImode
8118 /* (lshiftrt:TI */
8119 && GET_MODE (XEXP (x, 0)) == TImode
8120 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8121 /* (mult:TI */
8122 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8123 /* (ANY_EXTEND:TI (reg:DI))
8124 (ANY_EXTEND:TI (reg:DI))) */
8125 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8126 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8127 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8128 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8129 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8130 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8131 /* (const_int 64) */
8132 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8133 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8135 /* UMULH/SMULH. */
8136 if (speed)
8137 *cost += extra_cost->mult[mode == DImode].extend;
8138 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8139 mode, MULT, 0, speed);
8140 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8141 mode, MULT, 1, speed);
8142 return true;
8145 /* Fall through. */
8146 default:
8147 break;
8150 if (dump_file
8151 && flag_aarch64_verbose_cost)
8152 fprintf (dump_file,
8153 "\nFailed to cost RTX. Assuming default cost.\n");
8155 return true;
8158 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
8159 calculated for X. This cost is stored in *COST. Returns true
8160 if the total cost of X was calculated. */
8161 static bool
8162 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8163 int param, int *cost, bool speed)
8165 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8167 if (dump_file
8168 && flag_aarch64_verbose_cost)
8170 print_rtl_single (dump_file, x);
8171 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8172 speed ? "Hot" : "Cold",
8173 *cost, result ? "final" : "partial");
8176 return result;
8179 static int
8180 aarch64_register_move_cost (machine_mode mode,
8181 reg_class_t from_i, reg_class_t to_i)
8183 enum reg_class from = (enum reg_class) from_i;
8184 enum reg_class to = (enum reg_class) to_i;
8185 const struct cpu_regmove_cost *regmove_cost
8186 = aarch64_tune_params.regmove_cost;
8188 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8189 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8190 to = GENERAL_REGS;
8192 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8193 from = GENERAL_REGS;
8195 /* Moving between GPRs and the stack register costs the same as GP2GP. */
8196 if ((from == GENERAL_REGS && to == STACK_REG)
8197 || (to == GENERAL_REGS && from == STACK_REG))
8198 return regmove_cost->GP2GP;
8200 /* To/From the stack register, we move via the gprs. */
8201 if (to == STACK_REG || from == STACK_REG)
8202 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8203 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8205 if (GET_MODE_SIZE (mode) == 16)
8207 /* 128-bit operations on general registers require 2 instructions. */
8208 if (from == GENERAL_REGS && to == GENERAL_REGS)
8209 return regmove_cost->GP2GP * 2;
8210 else if (from == GENERAL_REGS)
8211 return regmove_cost->GP2FP * 2;
8212 else if (to == GENERAL_REGS)
8213 return regmove_cost->FP2GP * 2;
8215 /* When AdvSIMD instructions are disabled it is not possible to move
8216 a 128-bit value directly between Q registers. This is handled in
8217 secondary reload. A general register is used as a scratch to move
8218 the upper DI value and the lower DI value is moved directly,
8219 hence the cost is the sum of three moves. */
8220 if (! TARGET_SIMD)
8221 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8223 return regmove_cost->FP2FP;
8226 if (from == GENERAL_REGS && to == GENERAL_REGS)
8227 return regmove_cost->GP2GP;
8228 else if (from == GENERAL_REGS)
8229 return regmove_cost->GP2FP;
8230 else if (to == GENERAL_REGS)
8231 return regmove_cost->FP2GP;
8233 return regmove_cost->FP2FP;
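/* Two worked examples of the logic above (the actual numbers come from the
   tuning structs): a 128-bit move between two general registers is costed
   as 2 * GP2GP, while a 64-bit move from an FP register to the stack
   register class goes through the recursion and costs FP2GP + GP2GP. */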
8236 static int
8237 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8238 reg_class_t rclass ATTRIBUTE_UNUSED,
8239 bool in ATTRIBUTE_UNUSED)
8241 return aarch64_tune_params.memmov_cost;
8244 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8245 to optimize 1.0/sqrt. */
8247 static bool
8248 use_rsqrt_p (machine_mode mode)
8250 return (!flag_trapping_math
8251 && flag_unsafe_math_optimizations
8252 && ((aarch64_tune_params.approx_modes->recip_sqrt
8253 & AARCH64_APPROX_MODE (mode))
8254 || flag_mrecip_low_precision_sqrt));
8257 /* Function to decide when to use the approximate reciprocal square root
8258 builtin. */
8260 static tree
8261 aarch64_builtin_reciprocal (tree fndecl)
8263 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8265 if (!use_rsqrt_p (mode))
8266 return NULL_TREE;
8267 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8270 typedef rtx (*rsqrte_type) (rtx, rtx);
8272 /* Select reciprocal square root initial estimate insn depending on machine
8273 mode. */
8275 static rsqrte_type
8276 get_rsqrte_type (machine_mode mode)
8278 switch (mode)
8280 case E_DFmode: return gen_aarch64_rsqrtedf;
8281 case E_SFmode: return gen_aarch64_rsqrtesf;
8282 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8283 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8284 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8285 default: gcc_unreachable ();
8289 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8291 /* Select reciprocal square root series step insn depending on machine mode. */
8293 static rsqrts_type
8294 get_rsqrts_type (machine_mode mode)
8296 switch (mode)
8298 case E_DFmode: return gen_aarch64_rsqrtsdf;
8299 case E_SFmode: return gen_aarch64_rsqrtssf;
8300 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8301 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8302 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8303 default: gcc_unreachable ();
8307 /* Emit instruction sequence to compute either the approximate square root
8308 or its approximate reciprocal, depending on the flag RECP, and return
8309 whether the sequence was emitted or not. */
8311 bool
8312 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8314 machine_mode mode = GET_MODE (dst);
8316 if (GET_MODE_INNER (mode) == HFmode)
8318 gcc_assert (!recp);
8319 return false;
8322 if (!recp)
8324 if (!(flag_mlow_precision_sqrt
8325 || (aarch64_tune_params.approx_modes->sqrt
8326 & AARCH64_APPROX_MODE (mode))))
8327 return false;
8329 if (flag_finite_math_only
8330 || flag_trapping_math
8331 || !flag_unsafe_math_optimizations
8332 || optimize_function_for_size_p (cfun))
8333 return false;
8335 else
8336 /* Caller assumes we cannot fail. */
8337 gcc_assert (use_rsqrt_p (mode));
8339 machine_mode mmsk = mode_for_int_vector (mode).require ();
8340 rtx xmsk = gen_reg_rtx (mmsk);
8341 if (!recp)
8342 /* When calculating the approximate square root, compare the
8343 argument with 0.0 and create a mask. */
8344 emit_insn (gen_rtx_SET (xmsk,
8345 gen_rtx_NEG (mmsk,
8346 gen_rtx_EQ (mmsk, src,
8347 CONST0_RTX (mode)))));
8349 /* Estimate the approximate reciprocal square root. */
8350 rtx xdst = gen_reg_rtx (mode);
8351 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8353 /* Iterate over the series twice for SF and thrice for DF. */
8354 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8356 /* Optionally iterate over the series once less for faster performance
8357 at the cost of some accuracy. */
8358 if ((recp && flag_mrecip_low_precision_sqrt)
8359 || (!recp && flag_mlow_precision_sqrt))
8360 iterations--;
8362 /* Iterate over the series to calculate the approximate reciprocal square
8363 root. */
8364 rtx x1 = gen_reg_rtx (mode);
8365 while (iterations--)
8367 rtx x2 = gen_reg_rtx (mode);
8368 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8370 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8372 if (iterations > 0)
8373 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8376 if (!recp)
8378 /* Qualify the approximate reciprocal square root when the argument is
8379 0.0 by squashing the intermediary result to 0.0. */
8380 rtx xtmp = gen_reg_rtx (mmsk);
8381 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8382 gen_rtx_SUBREG (mmsk, xdst, 0)));
8383 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8385 /* Calculate the approximate square root. */
8386 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8389 /* Finalize the approximation. */
8390 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8392 return true;
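/* A sketch of the math behind the sequence above: starting from the
   FRSQRTE estimate x0 ~= 1/sqrt(d), each FRSQRTS step performs the
   Newton-Raphson refinement
       x' = x * (3 - d * x * x) / 2
   and, in the non-reciprocal case, sqrt(d) is then recovered by
   multiplying the refined estimate by d (with the zero-input case
   masked off as above). */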
8395 typedef rtx (*recpe_type) (rtx, rtx);
8397 /* Select reciprocal initial estimate insn depending on machine mode. */
8399 static recpe_type
8400 get_recpe_type (machine_mode mode)
8402 switch (mode)
8404 case E_SFmode: return (gen_aarch64_frecpesf);
8405 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8406 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8407 case E_DFmode: return (gen_aarch64_frecpedf);
8408 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8409 default: gcc_unreachable ();
8413 typedef rtx (*recps_type) (rtx, rtx, rtx);
8415 /* Select reciprocal series step insn depending on machine mode. */
8417 static recps_type
8418 get_recps_type (machine_mode mode)
8420 switch (mode)
8422 case E_SFmode: return (gen_aarch64_frecpssf);
8423 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8424 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8425 case E_DFmode: return (gen_aarch64_frecpsdf);
8426 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8427 default: gcc_unreachable ();
8431 /* Emit the instruction sequence to compute the approximation for the division
8432 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8434 bool
8435 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8437 machine_mode mode = GET_MODE (quo);
8439 if (GET_MODE_INNER (mode) == HFmode)
8440 return false;
8442 bool use_approx_division_p = (flag_mlow_precision_div
8443 || (aarch64_tune_params.approx_modes->division
8444 & AARCH64_APPROX_MODE (mode)));
8446 if (!flag_finite_math_only
8447 || flag_trapping_math
8448 || !flag_unsafe_math_optimizations
8449 || optimize_function_for_size_p (cfun)
8450 || !use_approx_division_p)
8451 return false;
8453 /* Estimate the approximate reciprocal. */
8454 rtx xrcp = gen_reg_rtx (mode);
8455 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8457 /* Iterate over the series twice for SF and thrice for DF. */
8458 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8460 /* Optionally iterate over the series once less for faster performance,
8461 at the cost of some accuracy. */
8462 if (flag_mlow_precision_div)
8463 iterations--;
8465 /* Iterate over the series to calculate the approximate reciprocal. */
8466 rtx xtmp = gen_reg_rtx (mode);
8467 while (iterations--)
8469 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8471 if (iterations > 0)
8472 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8475 if (num != CONST1_RTX (mode))
8477 /* As the approximate reciprocal of DEN is already calculated, only
8478 calculate the approximate division when NUM is not 1.0. */
8479 rtx xnum = force_reg (mode, num);
8480 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8483 /* Finalize the approximation. */
8484 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8485 return true;
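/* Likewise, a sketch of the series used above: starting from the FRECPE
   estimate x0 ~= 1/d, each FRECPS step applies the Newton-Raphson
   refinement
       x' = x * (2 - d * x)
   and the quotient is then formed as NUM * (1/DEN), with the multiply by
   NUM skipped when it is 1.0. */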
8488 /* Return the number of instructions that can be issued per cycle. */
8489 static int
8490 aarch64_sched_issue_rate (void)
8492 return aarch64_tune_params.issue_rate;
8495 static int
8496 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8498 int issue_rate = aarch64_sched_issue_rate ();
8500 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8504 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8505 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8506 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8508 static int
8509 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8510 int ready_index)
8512 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8516 /* Vectorizer cost model target hooks. */
8518 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8519 static int
8520 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8521 tree vectype,
8522 int misalign ATTRIBUTE_UNUSED)
8524 unsigned elements;
8525 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8526 bool fp = false;
8528 if (vectype != NULL)
8529 fp = FLOAT_TYPE_P (vectype);
8531 switch (type_of_cost)
8533 case scalar_stmt:
8534 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8536 case scalar_load:
8537 return costs->scalar_load_cost;
8539 case scalar_store:
8540 return costs->scalar_store_cost;
8542 case vector_stmt:
8543 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8545 case vector_load:
8546 return costs->vec_align_load_cost;
8548 case vector_store:
8549 return costs->vec_store_cost;
8551 case vec_to_scalar:
8552 return costs->vec_to_scalar_cost;
8554 case scalar_to_vec:
8555 return costs->scalar_to_vec_cost;
8557 case unaligned_load:
8558 return costs->vec_unalign_load_cost;
8560 case unaligned_store:
8561 return costs->vec_unalign_store_cost;
8563 case cond_branch_taken:
8564 return costs->cond_taken_branch_cost;
8566 case cond_branch_not_taken:
8567 return costs->cond_not_taken_branch_cost;
8569 case vec_perm:
8570 return costs->vec_permute_cost;
8572 case vec_promote_demote:
8573 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8575 case vec_construct:
8576 elements = TYPE_VECTOR_SUBPARTS (vectype);
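/* E.g. constructing a 4-element vector is costed as 4 / 2 + 1 = 3. */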
8577 return elements / 2 + 1;
8579 default:
8580 gcc_unreachable ();
8584 /* Implement targetm.vectorize.add_stmt_cost. */
8585 static unsigned
8586 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8587 struct _stmt_vec_info *stmt_info, int misalign,
8588 enum vect_cost_model_location where)
8590 unsigned *cost = (unsigned *) data;
8591 unsigned retval = 0;
8593 if (flag_vect_cost_model)
8595 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8596 int stmt_cost =
8597 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8599 /* Statements in an inner loop relative to the loop being
8600 vectorized are weighted more heavily. The value here is
8601 arbitrary and could potentially be improved with analysis. */
8602 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8603 count *= 50; /* FIXME */
8605 retval = (unsigned) (count * stmt_cost);
8606 cost[where] += retval;
8609 return retval;
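/* As a rough example of the accounting above: a vector_load statement in
   the inner loop of the loop being vectorized, with COUNT == 1, adds
   50 * vec_align_load_cost to cost[vect_body]. */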
8612 static void initialize_aarch64_code_model (struct gcc_options *);
8614 /* Parse the TO_PARSE string and put the architecture struct that it
8615 selects into RES and the architectural features into ISA_FLAGS.
8616 Return an aarch64_parse_opt_result describing the parse result.
8617 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8619 static enum aarch64_parse_opt_result
8620 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8621 unsigned long *isa_flags)
8623 char *ext;
8624 const struct processor *arch;
8625 char *str = (char *) alloca (strlen (to_parse) + 1);
8626 size_t len;
8628 strcpy (str, to_parse);
8630 ext = strchr (str, '+');
8632 if (ext != NULL)
8633 len = ext - str;
8634 else
8635 len = strlen (str);
8637 if (len == 0)
8638 return AARCH64_PARSE_MISSING_ARG;
8641 /* Loop through the list of supported ARCHes to find a match. */
8642 for (arch = all_architectures; arch->name != NULL; arch++)
8644 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8646 unsigned long isa_temp = arch->flags;
8648 if (ext != NULL)
8650 /* TO_PARSE string contains at least one extension. */
8651 enum aarch64_parse_opt_result ext_res
8652 = aarch64_parse_extension (ext, &isa_temp);
8654 if (ext_res != AARCH64_PARSE_OK)
8655 return ext_res;
8657 /* Extension parsing was successful. Confirm the result
8658 arch and ISA flags. */
8659 *res = arch;
8660 *isa_flags = isa_temp;
8661 return AARCH64_PARSE_OK;
8665 /* ARCH name not found in list. */
8666 return AARCH64_PARSE_INVALID_ARG;
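/* To illustrate (the valid names live in the all_architectures and
   extension tables): a string such as "armv8-a+crc" is split at the
   first '+'; "armv8-a" selects the architecture and "+crc" is handed
   to aarch64_parse_extension to adjust the ISA flags. */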
8669 /* Parse the TO_PARSE string and put the result tuning in RES and the
8670 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8671 describing the parse result. If there is an error parsing, RES and
8672 ISA_FLAGS are left unchanged. */
8674 static enum aarch64_parse_opt_result
8675 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8676 unsigned long *isa_flags)
8678 char *ext;
8679 const struct processor *cpu;
8680 char *str = (char *) alloca (strlen (to_parse) + 1);
8681 size_t len;
8683 strcpy (str, to_parse);
8685 ext = strchr (str, '+');
8687 if (ext != NULL)
8688 len = ext - str;
8689 else
8690 len = strlen (str);
8692 if (len == 0)
8693 return AARCH64_PARSE_MISSING_ARG;
8696 /* Loop through the list of supported CPUs to find a match. */
8697 for (cpu = all_cores; cpu->name != NULL; cpu++)
8699 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8701 unsigned long isa_temp = cpu->flags;
8704 if (ext != NULL)
8706 /* TO_PARSE string contains at least one extension. */
8707 enum aarch64_parse_opt_result ext_res
8708 = aarch64_parse_extension (ext, &isa_temp);
8710 if (ext_res != AARCH64_PARSE_OK)
8711 return ext_res;
8713 /* Extension parsing was successful. Confirm the result
8714 cpu and ISA flags. */
8715 *res = cpu;
8716 *isa_flags = isa_temp;
8717 return AARCH64_PARSE_OK;
8721 /* CPU name not found in list. */
8722 return AARCH64_PARSE_INVALID_ARG;
8725 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8726 Return an aarch64_parse_opt_result describing the parse result.
8727 If the parsing fails the RES does not change. */
8729 static enum aarch64_parse_opt_result
8730 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8732 const struct processor *cpu;
8733 char *str = (char *) alloca (strlen (to_parse) + 1);
8735 strcpy (str, to_parse);
8737 /* Loop through the list of supported CPUs to find a match. */
8738 for (cpu = all_cores; cpu->name != NULL; cpu++)
8740 if (strcmp (cpu->name, str) == 0)
8742 *res = cpu;
8743 return AARCH64_PARSE_OK;
8747 /* CPU name not found in list. */
8748 return AARCH64_PARSE_INVALID_ARG;
8751 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8752 described in FLAG. If it is, return the index bit for that fusion type.
8753 If not, error (printing OPTION_NAME) and return zero. */
8755 static unsigned int
8756 aarch64_parse_one_option_token (const char *token,
8757 size_t length,
8758 const struct aarch64_flag_desc *flag,
8759 const char *option_name)
8761 for (; flag->name != NULL; flag++)
8763 if (length == strlen (flag->name)
8764 && !strncmp (flag->name, token, length))
8765 return flag->flag;
8768 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8769 return 0;
8772 /* Parse OPTION which is a comma-separated list of flags to enable.
8773 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8774 default state we inherit from the CPU tuning structures. OPTION_NAME
8775 gives the top-level option we are parsing in the -moverride string,
8776 for use in error messages. */
8778 static unsigned int
8779 aarch64_parse_boolean_options (const char *option,
8780 const struct aarch64_flag_desc *flags,
8781 unsigned int initial_state,
8782 const char *option_name)
8784 const char separator = '.';
8785 const char* specs = option;
8786 const char* ntoken = option;
8787 unsigned int found_flags = initial_state;
8789 while ((ntoken = strchr (specs, separator)))
8791 size_t token_length = ntoken - specs;
8792 unsigned token_ops = aarch64_parse_one_option_token (specs,
8793 token_length,
8794 flags,
8795 option_name);
8796 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8797 in the token stream, reset the supported operations. So:
8799 adrp+add.cmp+branch.none.adrp+add
8801 would have the result of turning on only adrp+add fusion. */
8802 if (!token_ops)
8803 found_flags = 0;
8805 found_flags |= token_ops;
8806 specs = ++ntoken;
8809 /* We ended with a separator; print an error. */
8810 if (!(*specs))
8812 error ("%s string ill-formed\n", option_name);
8813 return 0;
8816 /* We still have one more token to parse. */
8817 size_t token_length = strlen (specs);
8818 unsigned token_ops = aarch64_parse_one_option_token (specs,
8819 token_length,
8820 flags,
8821 option_name);
8822 if (!token_ops)
8823 found_flags = 0;
8825 found_flags |= token_ops;
8826 return found_flags;
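/* For example, with the fusion names used in the comment above, the
   string "adrp+add.cmp+branch" ORs those two fusion bits on top of the
   CPU's defaults, while "none.adrp+add" discards the defaults and
   enables adrp+add fusion only. */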
8829 /* Support for overriding instruction fusion. */
8831 static void
8832 aarch64_parse_fuse_string (const char *fuse_string,
8833 struct tune_params *tune)
8835 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8836 aarch64_fusible_pairs,
8837 tune->fusible_ops,
8838 "fuse=");
8841 /* Support for overriding other tuning flags. */
8843 static void
8844 aarch64_parse_tune_string (const char *tune_string,
8845 struct tune_params *tune)
8847 tune->extra_tuning_flags
8848 = aarch64_parse_boolean_options (tune_string,
8849 aarch64_tuning_flags,
8850 tune->extra_tuning_flags,
8851 "tune=");
8854 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8855 we understand. If it is, extract the option string and hand it off to
8856 the appropriate function. */
8858 void
8859 aarch64_parse_one_override_token (const char* token,
8860 size_t length,
8861 struct tune_params *tune)
8863 const struct aarch64_tuning_override_function *fn
8864 = aarch64_tuning_override_functions;
8866 const char *option_part = strchr (token, '=');
8867 if (!option_part)
8869 error ("tuning string missing in option (%s)", token);
8870 return;
8873 /* Get the length of the option name. */
8874 length = option_part - token;
8875 /* Skip the '=' to get to the option string. */
8876 option_part++;
8878 for (; fn->name != NULL; fn++)
8880 if (!strncmp (fn->name, token, length))
8882 fn->parse_override (option_part, tune);
8883 return;
8887 error ("unknown tuning option (%s)", token);
8888 return;
8891 /* Validate the TLS size and clamp it to what the code model allows. */
8893 static void
8894 initialize_aarch64_tls_size (struct gcc_options *opts)
8896 if (aarch64_tls_size == 0)
8897 aarch64_tls_size = 24;
8899 switch (opts->x_aarch64_cmodel_var)
8901 case AARCH64_CMODEL_TINY:
8902 /* Both the default and the maximum TLS size allowed under tiny are 1M,
8903 which needs two instructions to address, so we clamp the size to 24. */
8904 if (aarch64_tls_size > 24)
8905 aarch64_tls_size = 24;
8906 break;
8907 case AARCH64_CMODEL_SMALL:
8908 /* The maximum TLS size allowed under small is 4G. */
8909 if (aarch64_tls_size > 32)
8910 aarch64_tls_size = 32;
8911 break;
8912 case AARCH64_CMODEL_LARGE:
8913 /* The maximum TLS size allowed under large is 16E.
8914 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
8915 if (aarch64_tls_size > 48)
8916 aarch64_tls_size = 48;
8917 break;
8918 default:
8919 gcc_unreachable ();
8922 return;
8925 /* Parse STRING looking for options in the format:
8926 string :: option:string
8927 option :: name=substring
8928 name :: {a-z}
8929 substring :: defined by option. */
8931 static void
8932 aarch64_parse_override_string (const char* input_string,
8933 struct tune_params* tune)
8935 const char separator = ':';
8936 size_t string_length = strlen (input_string) + 1;
8937 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8938 char *string = string_root;
8939 strncpy (string, input_string, string_length);
8940 string[string_length - 1] = '\0';
8942 char* ntoken = string;
8944 while ((ntoken = strchr (string, separator)))
8946 size_t token_length = ntoken - string;
8947 /* Make this substring look like a string. */
8948 *ntoken = '\0';
8949 aarch64_parse_one_override_token (string, token_length, tune);
8950 string = ++ntoken;
8953 /* One last option to parse. */
8954 aarch64_parse_one_override_token (string, strlen (string), tune);
8955 free (string_root);
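/* Putting the pieces together, an override string is a ':'-separated list
   of name=value options, so (using the fusion names from the comments
   above) something like
       -moverride=fuse=adrp+add.cmp+branch
   ends up in aarch64_parse_fuse_string via
   aarch64_parse_one_override_token. */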
8959 static void
8960 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8962 /* The logic here is that if we are disabling all frame pointer generation
8963 then we do not need to disable leaf frame pointer generation as a
8964 separate operation. But if we are *only* disabling leaf frame pointer
8965 generation then we set flag_omit_frame_pointer to true, but in
8966 aarch64_frame_pointer_required we return false only for leaf functions.
8968 PR 70044: We have to be careful about being called multiple times for the
8969 same function. Once we have decided to set flag_omit_frame_pointer just
8970 so that we can omit leaf frame pointers, we must then not interpret a
8971 second call as meaning that all frame pointer generation should be
8972 omitted. We do this by setting flag_omit_frame_pointer to a special,
8973 non-zero value. */
8974 if (opts->x_flag_omit_frame_pointer == 2)
8975 opts->x_flag_omit_frame_pointer = 0;
8977 if (opts->x_flag_omit_frame_pointer)
8978 opts->x_flag_omit_leaf_frame_pointer = false;
8979 else if (opts->x_flag_omit_leaf_frame_pointer)
8980 opts->x_flag_omit_frame_pointer = 2;
8982 /* If not optimizing for size, set the default
8983 alignment to what the target wants. */
8984 if (!opts->x_optimize_size)
8986 if (opts->x_align_loops <= 0)
8987 opts->x_align_loops = aarch64_tune_params.loop_align;
8988 if (opts->x_align_jumps <= 0)
8989 opts->x_align_jumps = aarch64_tune_params.jump_align;
8990 if (opts->x_align_functions <= 0)
8991 opts->x_align_functions = aarch64_tune_params.function_align;
8994 /* We default to no pc-relative literal loads. */
8996 aarch64_pcrelative_literal_loads = false;
8998 /* If -mpc-relative-literal-loads is set on the command line, this
8999 implies that the user asked for PC relative literal loads. */
9000 if (opts->x_pcrelative_literal_loads == 1)
9001 aarch64_pcrelative_literal_loads = true;
9003 /* In the tiny memory model it makes no sense to disallow PC relative
9004 literal pool loads. */
9005 if (aarch64_cmodel == AARCH64_CMODEL_TINY
9006 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9007 aarch64_pcrelative_literal_loads = true;
9009 /* When enabling the lower precision Newton series for the square root, also
9010 enable it for the reciprocal square root, since the latter is an
9011 intermediary step for the former. */
9012 if (flag_mlow_precision_sqrt)
9013 flag_mrecip_low_precision_sqrt = true;
9016 /* 'Unpack' the internal tuning structs and update the options
9017 in OPTS. The caller must have set up selected_tune and selected_arch
9018 as all the other target-specific codegen decisions are
9019 derived from them. */
9021 void
9022 aarch64_override_options_internal (struct gcc_options *opts)
9024 aarch64_tune_flags = selected_tune->flags;
9025 aarch64_tune = selected_tune->sched_core;
9026 /* Make a copy of the tuning parameters attached to the core, which
9027 we may later overwrite. */
9028 aarch64_tune_params = *(selected_tune->tune);
9029 aarch64_architecture_version = selected_arch->architecture_version;
9031 if (opts->x_aarch64_override_tune_string)
9032 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9033 &aarch64_tune_params);
9035 /* This target defaults to strict volatile bitfields. */
9036 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9037 opts->x_flag_strict_volatile_bitfields = 1;
9039 initialize_aarch64_code_model (opts);
9040 initialize_aarch64_tls_size (opts);
9042 int queue_depth = 0;
9043 switch (aarch64_tune_params.autoprefetcher_model)
9045 case tune_params::AUTOPREFETCHER_OFF:
9046 queue_depth = -1;
9047 break;
9048 case tune_params::AUTOPREFETCHER_WEAK:
9049 queue_depth = 0;
9050 break;
9051 case tune_params::AUTOPREFETCHER_STRONG:
9052 queue_depth = max_insn_queue_index + 1;
9053 break;
9054 default:
9055 gcc_unreachable ();
9058 /* We don't mind passing in global_options_set here as we don't use
9059 the *options_set structs anyway. */
9060 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9061 queue_depth,
9062 opts->x_param_values,
9063 global_options_set.x_param_values);
9065 /* Set up parameters to be used in prefetching algorithm. Do not
9066 override the defaults unless we are tuning for a core we have
9067 researched values for. */
9068 if (aarch64_tune_params.prefetch->num_slots > 0)
9069 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9070 aarch64_tune_params.prefetch->num_slots,
9071 opts->x_param_values,
9072 global_options_set.x_param_values);
9073 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9074 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9075 aarch64_tune_params.prefetch->l1_cache_size,
9076 opts->x_param_values,
9077 global_options_set.x_param_values);
9078 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9079 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9080 aarch64_tune_params.prefetch->l1_cache_line_size,
9081 opts->x_param_values,
9082 global_options_set.x_param_values);
9083 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9084 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9085 aarch64_tune_params.prefetch->l2_cache_size,
9086 opts->x_param_values,
9087 global_options_set.x_param_values);
9089 /* Enable software prefetching at the specified optimization level for
9090 CPUs that have prefetch. Lower the optimization level threshold by 1
9091 when profiling is enabled. */
9092 if (opts->x_flag_prefetch_loop_arrays < 0
9093 && !opts->x_optimize_size
9094 && aarch64_tune_params.prefetch->default_opt_level >= 0
9095 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9096 opts->x_flag_prefetch_loop_arrays = 1;
9098 aarch64_override_options_after_change_1 (opts);
9101 /* Print a hint with a suggestion for a core or architecture name that
9102 most closely resembles what the user passed in STR. ARCH is true if
9103 the user is asking for an architecture name. ARCH is false if the user
9104 is asking for a core name. */
9106 static void
9107 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9109 auto_vec<const char *> candidates;
9110 const struct processor *entry = arch ? all_architectures : all_cores;
9111 for (; entry->name != NULL; entry++)
9112 candidates.safe_push (entry->name);
9113 char *s;
9114 const char *hint = candidates_list_and_hint (str, s, candidates);
9115 if (hint)
9116 inform (input_location, "valid arguments are: %s;"
9117 " did you mean %qs?", s, hint);
9118 XDELETEVEC (s);
9121 /* Print a hint with a suggestion for a core name that most closely resembles
9122 what the user passed in STR. */
9124 inline static void
9125 aarch64_print_hint_for_core (const char *str)
9127 aarch64_print_hint_for_core_or_arch (str, false);
9130 /* Print a hint with a suggestion for an architecture name that most closely
9131 resembles what the user passed in STR. */
9133 inline static void
9134 aarch64_print_hint_for_arch (const char *str)
9136 aarch64_print_hint_for_core_or_arch (str, true);
9139 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9140 specified in STR and throw errors if appropriate. Put the results if
9141 they are valid in RES and ISA_FLAGS. Return whether the option is
9142 valid. */
9144 static bool
9145 aarch64_validate_mcpu (const char *str, const struct processor **res,
9146 unsigned long *isa_flags)
9148 enum aarch64_parse_opt_result parse_res
9149 = aarch64_parse_cpu (str, res, isa_flags);
9151 if (parse_res == AARCH64_PARSE_OK)
9152 return true;
9154 switch (parse_res)
9156 case AARCH64_PARSE_MISSING_ARG:
9157 error ("missing cpu name in %<-mcpu=%s%>", str);
9158 break;
9159 case AARCH64_PARSE_INVALID_ARG:
9160 error ("unknown value %qs for -mcpu", str);
9161 aarch64_print_hint_for_core (str);
9162 break;
9163 case AARCH64_PARSE_INVALID_FEATURE:
9164 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9165 break;
9166 default:
9167 gcc_unreachable ();
9170 return false;
9173 /* Validate a command-line -march option. Parse the arch and extensions
9174 (if any) specified in STR and throw errors if appropriate. Put the
9175 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9176 option is valid. */
9178 static bool
9179 aarch64_validate_march (const char *str, const struct processor **res,
9180 unsigned long *isa_flags)
9182 enum aarch64_parse_opt_result parse_res
9183 = aarch64_parse_arch (str, res, isa_flags);
9185 if (parse_res == AARCH64_PARSE_OK)
9186 return true;
9188 switch (parse_res)
9190 case AARCH64_PARSE_MISSING_ARG:
9191 error ("missing arch name in %<-march=%s%>", str);
9192 break;
9193 case AARCH64_PARSE_INVALID_ARG:
9194 error ("unknown value %qs for -march", str);
9195 aarch64_print_hint_for_arch (str);
9196 break;
9197 case AARCH64_PARSE_INVALID_FEATURE:
9198 error ("invalid feature modifier in %<-march=%s%>", str);
9199 break;
9200 default:
9201 gcc_unreachable ();
9204 return false;
9207 /* Validate a command-line -mtune option. Parse the cpu
9208 specified in STR and throw errors if appropriate. Put the
9209 result, if it is valid, in RES. Return whether the option is
9210 valid. */
9212 static bool
9213 aarch64_validate_mtune (const char *str, const struct processor **res)
9215 enum aarch64_parse_opt_result parse_res
9216 = aarch64_parse_tune (str, res);
9218 if (parse_res == AARCH64_PARSE_OK)
9219 return true;
9221 switch (parse_res)
9223 case AARCH64_PARSE_MISSING_ARG:
9224 error ("missing cpu name in %<-mtune=%s%>", str);
9225 break;
9226 case AARCH64_PARSE_INVALID_ARG:
9227 error ("unknown value %qs for -mtune", str);
9228 aarch64_print_hint_for_core (str);
9229 break;
9230 default:
9231 gcc_unreachable ();
9233 return false;
9236 /* Return the CPU corresponding to the enum CPU.
9237 If it doesn't specify a cpu, return the default. */
9239 static const struct processor *
9240 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9242 if (cpu != aarch64_none)
9243 return &all_cores[cpu];
9245 /* The & 0x3f is to extract the bottom 6 bits that encode the
9246 default cpu as selected by the --with-cpu GCC configure option
9247 in config.gcc.
9248 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9249 flags mechanism should be reworked to make it more sane. */
9250 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
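/* The companion encoding is used in aarch64_override_options, where the
   configure-time ISA flags are recovered as TARGET_CPU_DEFAULT >> 6. */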
9253 /* Return the architecture corresponding to the enum ARCH.
9254 If it doesn't specify a valid architecture, return the default. */
9256 static const struct processor *
9257 aarch64_get_arch (enum aarch64_arch arch)
9259 if (arch != aarch64_no_arch)
9260 return &all_architectures[arch];
9262 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9264 return &all_architectures[cpu->arch];
9267 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9268 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9269 tuning structs. In particular it must set selected_tune and
9270 aarch64_isa_flags that define the available ISA features and tuning
9271 decisions. It must also set selected_arch as this will be used to
9272 output the .arch asm tags for each function. */
9274 static void
9275 aarch64_override_options (void)
9277 unsigned long cpu_isa = 0;
9278 unsigned long arch_isa = 0;
9279 aarch64_isa_flags = 0;
9281 bool valid_cpu = true;
9282 bool valid_tune = true;
9283 bool valid_arch = true;
9285 selected_cpu = NULL;
9286 selected_arch = NULL;
9287 selected_tune = NULL;
9289 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9290 If either of -march or -mtune is given, they override their
9291 respective component of -mcpu. */
9292 if (aarch64_cpu_string)
9293 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9294 &cpu_isa);
9296 if (aarch64_arch_string)
9297 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9298 &arch_isa);
9300 if (aarch64_tune_string)
9301 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9303 /* If the user did not specify a processor, choose the default
9304 one for them. This will be the CPU set during configuration using
9305 --with-cpu, otherwise it is "generic". */
9306 if (!selected_cpu)
9308 if (selected_arch)
9310 selected_cpu = &all_cores[selected_arch->ident];
9311 aarch64_isa_flags = arch_isa;
9312 explicit_arch = selected_arch->arch;
9314 else
9316 /* Get default configure-time CPU. */
9317 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9318 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9321 if (selected_tune)
9322 explicit_tune_core = selected_tune->ident;
9324 /* If both -mcpu and -march are specified check that they are architecturally
9325 compatible, warn if they're not and prefer the -march ISA flags. */
9326 else if (selected_arch)
9328 if (selected_arch->arch != selected_cpu->arch)
9330 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9331 all_architectures[selected_cpu->arch].name,
9332 selected_arch->name);
9334 aarch64_isa_flags = arch_isa;
9335 explicit_arch = selected_arch->arch;
9336 explicit_tune_core = selected_tune ? selected_tune->ident
9337 : selected_cpu->ident;
9339 else
9341 /* -mcpu but no -march. */
9342 aarch64_isa_flags = cpu_isa;
9343 explicit_tune_core = selected_tune ? selected_tune->ident
9344 : selected_cpu->ident;
9345 gcc_assert (selected_cpu);
9346 selected_arch = &all_architectures[selected_cpu->arch];
9347 explicit_arch = selected_arch->arch;
9350 /* Set the arch as well, as we will need it when outputting
9351 the .arch directive in assembly. */
9352 if (!selected_arch)
9354 gcc_assert (selected_cpu);
9355 selected_arch = &all_architectures[selected_cpu->arch];
9358 if (!selected_tune)
9359 selected_tune = selected_cpu;
9361 #ifndef HAVE_AS_MABI_OPTION
9362 /* The compiler may have been configured with 2.23.* binutils, which does
9363 not have support for ILP32. */
9364 if (TARGET_ILP32)
9365 error ("Assembler does not support -mabi=ilp32");
9366 #endif
9368 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9369 sorry ("Return address signing is only supported for -mabi=lp64");
9371 /* Make sure we properly set up the explicit options. */
9372 if ((aarch64_cpu_string && valid_cpu)
9373 || (aarch64_tune_string && valid_tune))
9374 gcc_assert (explicit_tune_core != aarch64_none);
9376 if ((aarch64_cpu_string && valid_cpu)
9377 || (aarch64_arch_string && valid_arch))
9378 gcc_assert (explicit_arch != aarch64_no_arch);
9380 aarch64_override_options_internal (&global_options);
9382 /* Save these options as the default ones in case we push and pop them later
9383 while processing functions with potential target attributes. */
9384 target_option_default_node = target_option_current_node
9385 = build_target_option_node (&global_options);
9388 /* Implement targetm.override_options_after_change. */
9390 static void
9391 aarch64_override_options_after_change (void)
9393 aarch64_override_options_after_change_1 (&global_options);
9396 static struct machine_function *
9397 aarch64_init_machine_status (void)
9399 struct machine_function *machine;
9400 machine = ggc_cleared_alloc<machine_function> ();
9401 return machine;
9404 void
9405 aarch64_init_expanders (void)
9407 init_machine_status = aarch64_init_machine_status;
9410 /* Select the code model to use from -mcmodel= and the PIC options. */
9411 static void
9412 initialize_aarch64_code_model (struct gcc_options *opts)
9414 if (opts->x_flag_pic)
9416 switch (opts->x_aarch64_cmodel_var)
9418 case AARCH64_CMODEL_TINY:
9419 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9420 break;
9421 case AARCH64_CMODEL_SMALL:
9422 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9423 aarch64_cmodel = (flag_pic == 2
9424 ? AARCH64_CMODEL_SMALL_PIC
9425 : AARCH64_CMODEL_SMALL_SPIC);
9426 #else
9427 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9428 #endif
9429 break;
9430 case AARCH64_CMODEL_LARGE:
9431 sorry ("code model %qs with -f%s", "large",
9432 opts->x_flag_pic > 1 ? "PIC" : "pic");
9433 break;
9434 default:
9435 gcc_unreachable ();
9438 else
9439 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9442 /* Implement TARGET_OPTION_SAVE. */
9444 static void
9445 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9447 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9450 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9451 using the information saved in PTR. */
9453 static void
9454 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9456 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9457 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9458 opts->x_explicit_arch = ptr->x_explicit_arch;
9459 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9460 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9462 aarch64_override_options_internal (opts);
9465 /* Implement TARGET_OPTION_PRINT. */
9467 static void
9468 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9470 const struct processor *cpu
9471 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9472 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9473 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9474 std::string extension
9475 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9477 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9478 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9479 arch->name, extension.c_str ());
9482 static GTY(()) tree aarch64_previous_fndecl;
9484 void
9485 aarch64_reset_previous_fndecl (void)
9487 aarch64_previous_fndecl = NULL;
9490 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9491 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9492 make sure optab availability predicates are recomputed when necessary. */
9494 void
9495 aarch64_save_restore_target_globals (tree new_tree)
9497 if (TREE_TARGET_GLOBALS (new_tree))
9498 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9499 else if (new_tree == target_option_default_node)
9500 restore_target_globals (&default_target_globals);
9501 else
9502 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9505 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9506 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9507 of the function, if such exists. This function may be called multiple
9508 times on a single function so use aarch64_previous_fndecl to avoid
9509 setting up identical state. */
9511 static void
9512 aarch64_set_current_function (tree fndecl)
9514 if (!fndecl || fndecl == aarch64_previous_fndecl)
9515 return;
9517 tree old_tree = (aarch64_previous_fndecl
9518 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9519 : NULL_TREE);
9521 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9523 /* If current function has no attributes but the previous one did,
9524 use the default node. */
9525 if (!new_tree && old_tree)
9526 new_tree = target_option_default_node;
9528 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9529 the default have been handled by aarch64_save_restore_target_globals from
9530 aarch64_pragma_target_parse. */
9531 if (old_tree == new_tree)
9532 return;
9534 aarch64_previous_fndecl = fndecl;
9536 /* First set the target options. */
9537 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9539 aarch64_save_restore_target_globals (new_tree);
9542 /* Enum describing the various ways we can handle attributes.
9543 In many cases we can reuse the generic option handling machinery. */
9545 enum aarch64_attr_opt_type
9547 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9548 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9549 aarch64_attr_enum, /* Attribute sets an enum variable. */
9550 aarch64_attr_custom /* Attribute requires a custom handling function. */
9553 /* All the information needed to handle a target attribute.
9554 NAME is the name of the attribute.
9555 ATTR_TYPE specifies the type of behavior of the attribute as described
9556 in the definition of enum aarch64_attr_opt_type.
9557 ALLOW_NEG is true if the attribute supports a "no-" form.
9558 HANDLER is the function that takes the attribute string and whether
9559 it is a pragma or attribute and handles the option. It is needed only
9560 when the ATTR_TYPE is aarch64_attr_custom.
9561 OPT_NUM is the enum specifying the option that the attribute modifies.
9562 This is needed for attributes that mirror the behavior of a command-line
9563 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9564 aarch64_attr_enum. */
9566 struct aarch64_attribute_info
9568 const char *name;
9569 enum aarch64_attr_opt_type attr_type;
9570 bool allow_neg;
9571 bool (*handler) (const char *, const char *);
9572 enum opt_code opt_num;
9575 /* Handle the ARCH_STR argument to the arch= target attribute.
9576 PRAGMA_OR_ATTR is used in potential error messages. */
9578 static bool
9579 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9581 const struct processor *tmp_arch = NULL;
9582 enum aarch64_parse_opt_result parse_res
9583 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9585 if (parse_res == AARCH64_PARSE_OK)
9587 gcc_assert (tmp_arch);
9588 selected_arch = tmp_arch;
9589 explicit_arch = selected_arch->arch;
9590 return true;
9593 switch (parse_res)
9595 case AARCH64_PARSE_MISSING_ARG:
9596 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9597 break;
9598 case AARCH64_PARSE_INVALID_ARG:
9599 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9600 aarch64_print_hint_for_arch (str);
9601 break;
9602 case AARCH64_PARSE_INVALID_FEATURE:
9603 error ("invalid feature modifier %qs for 'arch' target %s",
9604 str, pragma_or_attr);
9605 break;
9606 default:
9607 gcc_unreachable ();
9610 return false;
9613 /* Handle the argument CPU_STR to the cpu= target attribute.
9614 PRAGMA_OR_ATTR is used in potential error messages. */
9616 static bool
9617 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9619 const struct processor *tmp_cpu = NULL;
9620 enum aarch64_parse_opt_result parse_res
9621 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9623 if (parse_res == AARCH64_PARSE_OK)
9625 gcc_assert (tmp_cpu);
9626 selected_tune = tmp_cpu;
9627 explicit_tune_core = selected_tune->ident;
9629 selected_arch = &all_architectures[tmp_cpu->arch];
9630 explicit_arch = selected_arch->arch;
9631 return true;
9634 switch (parse_res)
9636 case AARCH64_PARSE_MISSING_ARG:
9637 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9638 break;
9639 case AARCH64_PARSE_INVALID_ARG:
9640 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9641 aarch64_print_hint_for_core (str);
9642 break;
9643 case AARCH64_PARSE_INVALID_FEATURE:
9644 error ("invalid feature modifier %qs for 'cpu' target %s",
9645 str, pragma_or_attr);
9646 break;
9647 default:
9648 gcc_unreachable ();
9651 return false;
9654 /* Handle the argument STR to the tune= target attribute.
9655 PRAGMA_OR_ATTR is used in potential error messages. */
9657 static bool
9658 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9660 const struct processor *tmp_tune = NULL;
9661 enum aarch64_parse_opt_result parse_res
9662 = aarch64_parse_tune (str, &tmp_tune);
9664 if (parse_res == AARCH64_PARSE_OK)
9666 gcc_assert (tmp_tune);
9667 selected_tune = tmp_tune;
9668 explicit_tune_core = selected_tune->ident;
9669 return true;
9672 switch (parse_res)
9674 case AARCH64_PARSE_INVALID_ARG:
9675 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9676 aarch64_print_hint_for_core (str);
9677 break;
9678 default:
9679 gcc_unreachable ();
9682 return false;
9685 /* Parse an architecture extensions target attribute string specified in STR.
9686 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9687 if successful. Update aarch64_isa_flags to reflect the ISA features
9688 modified.
9689 PRAGMA_OR_ATTR is used in potential error messages. */
9691 static bool
9692 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9694 enum aarch64_parse_opt_result parse_res;
9695 unsigned long isa_flags = aarch64_isa_flags;
9697 /* We allow "+nothing" in the beginning to clear out all architectural
9698 features if the user wants to handpick specific features. */
9699 if (strncmp ("+nothing", str, 8) == 0)
9701 isa_flags = 0;
9702 str += 8;
9705 parse_res = aarch64_parse_extension (str, &isa_flags);
9707 if (parse_res == AARCH64_PARSE_OK)
9709 aarch64_isa_flags = isa_flags;
9710 return true;
9713 switch (parse_res)
9715 case AARCH64_PARSE_MISSING_ARG:
9716 error ("missing feature modifier in target %s %qs",
9717 pragma_or_attr, str);
9718 break;
9720 case AARCH64_PARSE_INVALID_FEATURE:
9721 error ("invalid feature modifier in target %s %qs",
9722 pragma_or_attr, str);
9723 break;
9725 default:
9726 gcc_unreachable ();
9729 return false;
9732 /* The target attributes that we support. On top of these we also support just
9733 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9734 handled explicitly in aarch64_process_one_target_attr. */
9736 static const struct aarch64_attribute_info aarch64_attributes[] =
9738 { "general-regs-only", aarch64_attr_mask, false, NULL,
9739 OPT_mgeneral_regs_only },
9740 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9741 OPT_mfix_cortex_a53_835769 },
9742 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9743 OPT_mfix_cortex_a53_843419 },
9744 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9745 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9746 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9747 OPT_momit_leaf_frame_pointer },
9748 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9749 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9750 OPT_march_ },
9751 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9752 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9753 OPT_mtune_ },
9754 { "sign-return-address", aarch64_attr_enum, false, NULL,
9755 OPT_msign_return_address_ },
9756 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9759 /* Parse ARG_STR which contains the definition of one target attribute.
9760 Show appropriate errors if any or return true if the attribute is valid.
9761 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9762 we're processing a target attribute or pragma. */
9764 static bool
9765 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9767 bool invert = false;
9769 size_t len = strlen (arg_str);
9771 if (len == 0)
9773 error ("malformed target %s", pragma_or_attr);
9774 return false;
9777 char *str_to_check = (char *) alloca (len + 1);
9778 strcpy (str_to_check, arg_str);
9780 /* Skip leading whitespace. */
9781 while (*str_to_check == ' ' || *str_to_check == '\t')
9782 str_to_check++;
9784 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9785 It is easier to detect and handle it explicitly here rather than going
9786 through the machinery for the rest of the target attributes in this
9787 function. */
9788 if (*str_to_check == '+')
9789 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9791 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9793 invert = true;
9794 str_to_check += 3;
9796 char *arg = strchr (str_to_check, '=');
9798 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9799 and point ARG to "foo". */
9800 if (arg)
9802 *arg = '\0';
9803 arg++;
9805 const struct aarch64_attribute_info *p_attr;
9806 bool found = false;
9807 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9809 /* If the names don't match up, or the user has given an argument
9810 to an attribute that doesn't accept one, or didn't give an argument
9811 to an attribute that expects one, fail to match. */
9812 if (strcmp (str_to_check, p_attr->name) != 0)
9813 continue;
9815 found = true;
9816 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9817 || p_attr->attr_type == aarch64_attr_enum;
9819 if (attr_need_arg_p ^ (arg != NULL))
9821 error ("target %s %qs does not accept an argument",
9822 pragma_or_attr, str_to_check);
9823 return false;
9826 /* If the name matches but the attribute does not allow "no-" versions
9827 then we can't match. */
9828 if (invert && !p_attr->allow_neg)
9830 error ("target %s %qs does not allow a negated form",
9831 pragma_or_attr, str_to_check);
9832 return false;
9835 switch (p_attr->attr_type)
9837 /* Has a custom handler registered.
9838 For example, cpu=, arch=, tune=. */
9839 case aarch64_attr_custom:
9840 gcc_assert (p_attr->handler);
9841 if (!p_attr->handler (arg, pragma_or_attr))
9842 return false;
9843 break;
9845 /* Either set or unset a boolean option. */
9846 case aarch64_attr_bool:
9848 struct cl_decoded_option decoded;
9850 generate_option (p_attr->opt_num, NULL, !invert,
9851 CL_TARGET, &decoded);
9852 aarch64_handle_option (&global_options, &global_options_set,
9853 &decoded, input_location);
9854 break;
9856 /* Set or unset a bit in the target_flags. aarch64_handle_option
9857 should know what mask to apply given the option number. */
9858 case aarch64_attr_mask:
9860 struct cl_decoded_option decoded;
9861 /* We only need to specify the option number.
9862 aarch64_handle_option will know which mask to apply. */
9863 decoded.opt_index = p_attr->opt_num;
9864 decoded.value = !invert;
9865 aarch64_handle_option (&global_options, &global_options_set,
9866 &decoded, input_location);
9867 break;
9869 /* Use the option setting machinery to set an option to an enum. */
9870 case aarch64_attr_enum:
9872 gcc_assert (arg);
9873 bool valid;
9874 int value;
9875 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9876 &value, CL_TARGET);
9877 if (valid)
9879 set_option (&global_options, NULL, p_attr->opt_num, value,
9880 NULL, DK_UNSPECIFIED, input_location,
9881 global_dc);
9883 else
9885 error ("target %s %s=%s is not valid",
9886 pragma_or_attr, str_to_check, arg);
9888 break;
9890 default:
9891 gcc_unreachable ();
9895 /* If we reached here we either have found an attribute and validated
9896 it or didn't match any. If we matched an attribute but its arguments
9897 were malformed we will have returned false already. */
9898 return found;
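/* To illustrate the forms accepted above: "arch=armv8-a",
   "no-omit-leaf-frame-pointer" and a bare ISA-flags string such as
   "+fp+nosimd" are all valid single attributes, the last one being
   handled by aarch64_handle_attr_isa_flags. */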
9901 /* Count how many times the character C appears in
9902 NULL-terminated string STR. */
9904 static unsigned int
9905 num_occurrences_in_str (char c, char *str)
9907 unsigned int res = 0;
9908 while (*str != '\0')
9910 if (*str == c)
9911 res++;
9913 str++;
9916 return res;
9919 /* Parse the tree in ARGS that contains the target attribute information
9920 and update the global target options space. PRAGMA_OR_ATTR is a string
9921 to be used in error messages, specifying whether this is processing
9922 a target attribute or a target pragma. */
9924 bool
9925 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9927 if (TREE_CODE (args) == TREE_LIST)
9931 tree head = TREE_VALUE (args);
9932 if (head)
9934 if (!aarch64_process_target_attr (head, pragma_or_attr))
9935 return false;
9937 args = TREE_CHAIN (args);
9938 } while (args);
9940 return true;
9943 if (TREE_CODE (args) != STRING_CST)
9945 error ("attribute %<target%> argument not a string");
9946 return false;
9949 size_t len = strlen (TREE_STRING_POINTER (args));
9950 char *str_to_check = (char *) alloca (len + 1);
9951 strcpy (str_to_check, TREE_STRING_POINTER (args));
9953 if (len == 0)
9955 error ("malformed target %s value", pragma_or_attr);
9956 return false;
9959 /* Used to catch empty spaces between commas i.e.
9960 attribute ((target ("attr1,,attr2"))). */
9961 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9963 /* Handle multiple target attributes separated by ','. */
9964 char *token = strtok (str_to_check, ",");
9966 unsigned int num_attrs = 0;
9967 while (token)
9969 num_attrs++;
9970 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9972 error ("target %s %qs is invalid", pragma_or_attr, token);
9973 return false;
9976 token = strtok (NULL, ",");
9979 if (num_attrs != num_commas + 1)
9981 error ("malformed target %s list %qs",
9982 pragma_or_attr, TREE_STRING_POINTER (args));
9983 return false;
9986 return true;
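/* As an illustrative sketch (the declaration and its name are hypothetical,
   not taken from this file): a function declared as

     __attribute__ ((target ("arch=armv8-a+crc,tune=cortex-a57")))
     int hypothetical_checksum (const unsigned char *buf, int len);

   reaches aarch64_process_target_attr with the string
   "arch=armv8-a+crc,tune=cortex-a57", which is split on ',' and each
   token is handed to aarch64_process_one_target_attr.  */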
9989 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9990 process attribute ((target ("..."))). */
9992 static bool
9993 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9995 struct cl_target_option cur_target;
9996 bool ret;
9997 tree old_optimize;
9998 tree new_target, new_optimize;
9999 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10001 /* If what we're processing is the current pragma string then the
10002 target option node is already stored in target_option_current_node
10003 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
10004 having to re-parse the string. This is especially useful to keep
10005 arm_neon.h compile times down since that header contains a lot
10006 of intrinsics enclosed in pragmas. */
10007 if (!existing_target && args == current_target_pragma)
10009 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10010 return true;
10012 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10014 old_optimize = build_optimization_node (&global_options);
10015 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10017 /* If the function changed the optimization levels as well as setting
10018 target options, start with the optimizations specified. */
10019 if (func_optimize && func_optimize != old_optimize)
10020 cl_optimization_restore (&global_options,
10021 TREE_OPTIMIZATION (func_optimize));
10023 /* Save the current target options to restore at the end. */
10024 cl_target_option_save (&cur_target, &global_options);
10026 /* If fndecl already has some target attributes applied to it, unpack
10027 them so that we add this attribute on top of them, rather than
10028 overwriting them. */
10029 if (existing_target)
10031 struct cl_target_option *existing_options
10032 = TREE_TARGET_OPTION (existing_target);
10034 if (existing_options)
10035 cl_target_option_restore (&global_options, existing_options);
10037 else
10038 cl_target_option_restore (&global_options,
10039 TREE_TARGET_OPTION (target_option_current_node));
10042 ret = aarch64_process_target_attr (args, "attribute");
10044 /* Set up any additional state. */
10045 if (ret)
10047 aarch64_override_options_internal (&global_options);
10048 /* Initialize SIMD builtins if we haven't already.
10049 Set current_target_pragma to NULL for the duration so that
10050 the builtin initialization code doesn't try to tag the functions
10051 being built with the attributes specified by any current pragma, thus
10052 going into an infinite recursion. */
10053 if (TARGET_SIMD)
10055 tree saved_current_target_pragma = current_target_pragma;
10056 current_target_pragma = NULL;
10057 aarch64_init_simd_builtins ();
10058 current_target_pragma = saved_current_target_pragma;
10060 new_target = build_target_option_node (&global_options);
10062 else
10063 new_target = NULL;
10065 new_optimize = build_optimization_node (&global_options);
10067 if (fndecl && ret)
10069 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10071 if (old_optimize != new_optimize)
10072 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10075 cl_target_option_restore (&global_options, &cur_target);
10077 if (old_optimize != new_optimize)
10078 cl_optimization_restore (&global_options,
10079 TREE_OPTIMIZATION (old_optimize));
10080 return ret;
10083 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10084 tri-bool options (yes, no, don't care) and the default value is
10085 DEF, determine whether to reject inlining. */
10087 static bool
10088 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10089 int dont_care, int def)
10091 /* If the callee doesn't care, always allow inlining. */
10092 if (callee == dont_care)
10093 return true;
10095 /* If the caller doesn't care, always allow inlining. */
10096 if (caller == dont_care)
10097 return true;
10099 /* Otherwise, allow inlining if either the callee and caller values
10100 agree, or if the callee is using the default value. */
10101 return (callee == caller || callee == def);
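/* A worked example of the helper above, in the shape of the errata checks
   further down (the concrete values here are illustrative): with
   DONT_CARE == 2 and DEF == 0, a caller value of 0 (explicitly disabled)
   and a callee value of 1 (explicitly enabled) fails every test and so
   blocks inlining, whereas a callee value of 2 (unspecified) or 0
   (matching the default) allows it.  */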
10104 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10105 to inline CALLEE into CALLER based on target-specific info.
10106 Make sure that the caller and callee have compatible architectural
10107 features. Then go through the other possible target attributes
10108 and see if they can block inlining. Try not to reject always_inline
10109 callees unless they are incompatible architecturally. */
10111 static bool
10112 aarch64_can_inline_p (tree caller, tree callee)
10114 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10115 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10117 /* If callee has no option attributes, then it is ok to inline. */
10118 if (!callee_tree)
10119 return true;
10121 struct cl_target_option *caller_opts
10122 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10123 : target_option_default_node);
10125 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10128 /* Callee's ISA flags should be a subset of the caller's. */
10129 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10130 != callee_opts->x_aarch64_isa_flags)
10131 return false;
10133 /* Allow non-strict aligned functions to be inlined into strict
10134 aligned ones, but not the other way around. */
10135 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10136 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10137 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10138 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10139 return false;
10141 bool always_inline = lookup_attribute ("always_inline",
10142 DECL_ATTRIBUTES (callee));
10144 /* If the architectural features match up and the callee is always_inline
10145 then the other attributes don't matter. */
10146 if (always_inline)
10147 return true;
10149 if (caller_opts->x_aarch64_cmodel_var
10150 != callee_opts->x_aarch64_cmodel_var)
10151 return false;
10153 if (caller_opts->x_aarch64_tls_dialect
10154 != callee_opts->x_aarch64_tls_dialect)
10155 return false;
10157 /* Honour explicit requests to work around errata. */
10158 if (!aarch64_tribools_ok_for_inlining_p (
10159 caller_opts->x_aarch64_fix_a53_err835769,
10160 callee_opts->x_aarch64_fix_a53_err835769,
10161 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10162 return false;
10164 if (!aarch64_tribools_ok_for_inlining_p (
10165 caller_opts->x_aarch64_fix_a53_err843419,
10166 callee_opts->x_aarch64_fix_a53_err843419,
10167 2, TARGET_FIX_ERR_A53_843419))
10168 return false;
10170 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10171 caller and callee and they don't match up, reject inlining. */
10172 if (!aarch64_tribools_ok_for_inlining_p (
10173 caller_opts->x_flag_omit_leaf_frame_pointer,
10174 callee_opts->x_flag_omit_leaf_frame_pointer,
10175 2, 1))
10176 return false;
10178 /* If the callee has specific tuning overrides, respect them. */
10179 if (callee_opts->x_aarch64_override_tune_string != NULL
10180 && caller_opts->x_aarch64_override_tune_string == NULL)
10181 return false;
10183 /* If the user specified tuning override strings for the
10184 caller and callee and they don't match up, reject inlining.
10185 We just do a string compare here, we don't analyze the meaning
10186 of the string, as it would be too costly for little gain. */
10187 if (callee_opts->x_aarch64_override_tune_string
10188 && caller_opts->x_aarch64_override_tune_string
10189 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10190 caller_opts->x_aarch64_override_tune_string) != 0))
10191 return false;
10193 return true;
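/* An illustrative sketch of the ISA-subset rule above (the functions and
   the chosen extension are hypothetical, assuming a compilation whose
   -march does not already include CRC): a callee carrying extra ISA bits
   cannot be inlined into a plain caller, because its ISA flags are not a
   subset of the caller's.

     __attribute__ ((target ("arch=armv8-a+crc")))
     static inline unsigned hypothetical_step (unsigned x) { return x; }

     unsigned
     hypothetical_caller (unsigned x)
     {
       return hypothetical_step (x);   // not inlined: +crc missing in caller
     }
*/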
10196 /* Return true if SYMBOL_REF X binds locally. */
10198 static bool
10199 aarch64_symbol_binds_local_p (const_rtx x)
10201 return (SYMBOL_REF_DECL (x)
10202 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10203 : SYMBOL_REF_LOCAL_P (x));
10206 /* Return true if SYMBOL_REF X is thread local. */
10207 static bool
10208 aarch64_tls_symbol_p (rtx x)
10210 if (! TARGET_HAVE_TLS)
10211 return false;
10213 if (GET_CODE (x) != SYMBOL_REF)
10214 return false;
10216 return SYMBOL_REF_TLS_MODEL (x) != 0;
10219 /* Classify a TLS symbol into one of the TLS kinds. */
10220 enum aarch64_symbol_type
10221 aarch64_classify_tls_symbol (rtx x)
10223 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10225 switch (tls_kind)
10227 case TLS_MODEL_GLOBAL_DYNAMIC:
10228 case TLS_MODEL_LOCAL_DYNAMIC:
10229 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10231 case TLS_MODEL_INITIAL_EXEC:
10232 switch (aarch64_cmodel)
10234 case AARCH64_CMODEL_TINY:
10235 case AARCH64_CMODEL_TINY_PIC:
10236 return SYMBOL_TINY_TLSIE;
10237 default:
10238 return SYMBOL_SMALL_TLSIE;
10241 case TLS_MODEL_LOCAL_EXEC:
10242 if (aarch64_tls_size == 12)
10243 return SYMBOL_TLSLE12;
10244 else if (aarch64_tls_size == 24)
10245 return SYMBOL_TLSLE24;
10246 else if (aarch64_tls_size == 32)
10247 return SYMBOL_TLSLE32;
10248 else if (aarch64_tls_size == 48)
10249 return SYMBOL_TLSLE48;
10250 else
10251 gcc_unreachable ();
10253 case TLS_MODEL_EMULATED:
10254 case TLS_MODEL_NONE:
10255 return SYMBOL_FORCE_TO_MEM;
10257 default:
10258 gcc_unreachable ();
10262 /* Return the method that should be used to access SYMBOL_REF or
10263 LABEL_REF X. */
10265 enum aarch64_symbol_type
10266 aarch64_classify_symbol (rtx x, rtx offset)
10268 if (GET_CODE (x) == LABEL_REF)
10270 switch (aarch64_cmodel)
10272 case AARCH64_CMODEL_LARGE:
10273 return SYMBOL_FORCE_TO_MEM;
10275 case AARCH64_CMODEL_TINY_PIC:
10276 case AARCH64_CMODEL_TINY:
10277 return SYMBOL_TINY_ABSOLUTE;
10279 case AARCH64_CMODEL_SMALL_SPIC:
10280 case AARCH64_CMODEL_SMALL_PIC:
10281 case AARCH64_CMODEL_SMALL:
10282 return SYMBOL_SMALL_ABSOLUTE;
10284 default:
10285 gcc_unreachable ();
10289 if (GET_CODE (x) == SYMBOL_REF)
10291 if (aarch64_tls_symbol_p (x))
10292 return aarch64_classify_tls_symbol (x);
10294 switch (aarch64_cmodel)
10296 case AARCH64_CMODEL_TINY:
10297 /* When we retrieve symbol + offset address, we have to make sure
10298 the offset does not cause overflow of the final address. But
10299 we have no way of knowing the address of symbol at compile time
10300 so we can't accurately say if the distance between the PC and
10301 symbol + offset is outside the addressable range of +/-1M in the
10302 TINY code model. So we rely on images not being greater than
10303 1M and cap the offset at 1M; anything beyond 1M will have to
10304 be loaded using an alternative mechanism. Furthermore if the
10305 symbol is a weak reference to something that isn't known to
10306 resolve to a symbol in this module, then force to memory. */
10307 if ((SYMBOL_REF_WEAK (x)
10308 && !aarch64_symbol_binds_local_p (x))
10309 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10310 return SYMBOL_FORCE_TO_MEM;
10311 return SYMBOL_TINY_ABSOLUTE;
10313 case AARCH64_CMODEL_SMALL:
10314 /* Same reasoning as the tiny code model, but the offset cap here is
10315 4G. */
10316 if ((SYMBOL_REF_WEAK (x)
10317 && !aarch64_symbol_binds_local_p (x))
10318 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10319 HOST_WIDE_INT_C (4294967264)))
10320 return SYMBOL_FORCE_TO_MEM;
10321 return SYMBOL_SMALL_ABSOLUTE;
10323 case AARCH64_CMODEL_TINY_PIC:
10324 if (!aarch64_symbol_binds_local_p (x))
10325 return SYMBOL_TINY_GOT;
10326 return SYMBOL_TINY_ABSOLUTE;
10328 case AARCH64_CMODEL_SMALL_SPIC:
10329 case AARCH64_CMODEL_SMALL_PIC:
10330 if (!aarch64_symbol_binds_local_p (x))
10331 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10332 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10333 return SYMBOL_SMALL_ABSOLUTE;
10335 case AARCH64_CMODEL_LARGE:
10336 /* This is alright even in PIC code as the constant
10337 pool reference is always PC relative and within
10338 the same translation unit. */
10339 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10340 return SYMBOL_SMALL_ABSOLUTE;
10341 else
10342 return SYMBOL_FORCE_TO_MEM;
10344 default:
10345 gcc_unreachable ();
10349 /* By default push everything into the constant pool. */
10350 return SYMBOL_FORCE_TO_MEM;
10353 bool
10354 aarch64_constant_address_p (rtx x)
10356 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10359 bool
10360 aarch64_legitimate_pic_operand_p (rtx x)
10362 if (GET_CODE (x) == SYMBOL_REF
10363 || (GET_CODE (x) == CONST
10364 && GET_CODE (XEXP (x, 0)) == PLUS
10365 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10366 return false;
10368 return true;
10371 /* Return true if X holds either a quarter-precision floating-point
10372 constant or the floating-point constant +0.0. */
10373 static bool
10374 aarch64_valid_floating_const (rtx x)
10376 if (!CONST_DOUBLE_P (x))
10377 return false;
10379 /* This call determines which constants can be used in mov<mode>
10380 as integer moves instead of constant loads. */
10381 if (aarch64_float_const_rtx_p (x))
10382 return true;
10384 return aarch64_float_const_representable_p (x);
10387 static bool
10388 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10390 /* Do not allow vector struct mode constants. We could support
10391 0 and -1 easily, but they need support in aarch64-simd.md. */
10392 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10393 return false;
10395 /* For these cases we never want to use a literal load.
10396 As such we have to prevent the compiler from forcing these
10397 to memory. */
10398 if ((GET_CODE (x) == CONST_VECTOR
10399 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10400 || CONST_INT_P (x)
10401 || aarch64_valid_floating_const (x)
10402 || aarch64_can_const_movi_rtx_p (x, mode)
10403 || aarch64_float_const_rtx_p (x))
10404 return !targetm.cannot_force_const_mem (mode, x);
10406 if (GET_CODE (x) == HIGH
10407 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10408 return true;
10410 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10411 so spilling them is better than rematerialization. */
10412 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10413 return true;
10415 return aarch64_constant_address_p (x);
10419 aarch64_load_tp (rtx target)
10421 if (!target
10422 || GET_MODE (target) != Pmode
10423 || !register_operand (target, Pmode))
10424 target = gen_reg_rtx (Pmode);
10426 /* Can return in any reg. */
10427 emit_insn (gen_aarch64_load_tp_hard (target));
10428 return target;
10431 /* On AAPCS systems, this is the "struct __va_list". */
10432 static GTY(()) tree va_list_type;
10434 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10435 Return the type to use as __builtin_va_list.
10437 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10439 struct __va_list
10441 void *__stack;
10442 void *__gr_top;
10443 void *__vr_top;
10444 int __gr_offs;
10445 int __vr_offs;
10446 }; */
10448 static tree
10449 aarch64_build_builtin_va_list (void)
10451 tree va_list_name;
10452 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10454 /* Create the type. */
10455 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10456 /* Give it the required name. */
10457 va_list_name = build_decl (BUILTINS_LOCATION,
10458 TYPE_DECL,
10459 get_identifier ("__va_list"),
10460 va_list_type);
10461 DECL_ARTIFICIAL (va_list_name) = 1;
10462 TYPE_NAME (va_list_type) = va_list_name;
10463 TYPE_STUB_DECL (va_list_type) = va_list_name;
10465 /* Create the fields. */
10466 f_stack = build_decl (BUILTINS_LOCATION,
10467 FIELD_DECL, get_identifier ("__stack"),
10468 ptr_type_node);
10469 f_grtop = build_decl (BUILTINS_LOCATION,
10470 FIELD_DECL, get_identifier ("__gr_top"),
10471 ptr_type_node);
10472 f_vrtop = build_decl (BUILTINS_LOCATION,
10473 FIELD_DECL, get_identifier ("__vr_top"),
10474 ptr_type_node);
10475 f_groff = build_decl (BUILTINS_LOCATION,
10476 FIELD_DECL, get_identifier ("__gr_offs"),
10477 integer_type_node);
10478 f_vroff = build_decl (BUILTINS_LOCATION,
10479 FIELD_DECL, get_identifier ("__vr_offs"),
10480 integer_type_node);
10482 /* Tell tree-stdarg pass about our internal offset fields.
10483 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10484 purposes, to identify whether the code is updating the va_list internal
10485 offset fields in an irregular way. */
10486 va_list_gpr_counter_field = f_groff;
10487 va_list_fpr_counter_field = f_vroff;
10489 DECL_ARTIFICIAL (f_stack) = 1;
10490 DECL_ARTIFICIAL (f_grtop) = 1;
10491 DECL_ARTIFICIAL (f_vrtop) = 1;
10492 DECL_ARTIFICIAL (f_groff) = 1;
10493 DECL_ARTIFICIAL (f_vroff) = 1;
10495 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10496 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10497 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10498 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10499 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10501 TYPE_FIELDS (va_list_type) = f_stack;
10502 DECL_CHAIN (f_stack) = f_grtop;
10503 DECL_CHAIN (f_grtop) = f_vrtop;
10504 DECL_CHAIN (f_vrtop) = f_groff;
10505 DECL_CHAIN (f_groff) = f_vroff;
10507 /* Compute its layout. */
10508 layout_type (va_list_type);
10510 return va_list_type;
10513 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10514 static void
10515 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10517 const CUMULATIVE_ARGS *cum;
10518 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10519 tree stack, grtop, vrtop, groff, vroff;
10520 tree t;
10521 int gr_save_area_size = cfun->va_list_gpr_size;
10522 int vr_save_area_size = cfun->va_list_fpr_size;
10523 int vr_offset;
10525 cum = &crtl->args.info;
10526 if (cfun->va_list_gpr_size)
10527 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10528 cfun->va_list_gpr_size);
10529 if (cfun->va_list_fpr_size)
10530 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10531 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10533 if (!TARGET_FLOAT)
10535 gcc_assert (cum->aapcs_nvrn == 0);
10536 vr_save_area_size = 0;
10539 f_stack = TYPE_FIELDS (va_list_type_node);
10540 f_grtop = DECL_CHAIN (f_stack);
10541 f_vrtop = DECL_CHAIN (f_grtop);
10542 f_groff = DECL_CHAIN (f_vrtop);
10543 f_vroff = DECL_CHAIN (f_groff);
10545 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10546 NULL_TREE);
10547 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10548 NULL_TREE);
10549 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10550 NULL_TREE);
10551 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10552 NULL_TREE);
10553 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10554 NULL_TREE);
10556 /* Emit code to initialize STACK, which points to the next varargs stack
10557 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10558 by named arguments. STACK is 8-byte aligned. */
10559 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10560 if (cum->aapcs_stack_size > 0)
10561 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10562 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10563 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10565 /* Emit code to initialize GRTOP, the top of the GR save area.
10566 virtual_incoming_args_rtx should have been 16 byte aligned. */
10567 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10568 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10569 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10571 /* Emit code to initialize VRTOP, the top of the VR save area.
10572 This address is gr_save_area_bytes below GRTOP, rounded
10573 down to the next 16-byte boundary. */
10574 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10575 vr_offset = ROUND_UP (gr_save_area_size,
10576 STACK_BOUNDARY / BITS_PER_UNIT);
10578 if (vr_offset)
10579 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10580 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10581 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10583 /* Emit code to initialize GROFF, the offset from GRTOP of the
10584 next GPR argument. */
10585 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10586 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10587 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10589 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10590 of the next VR argument. */
10591 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10592 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10593 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
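/* A sketch of the result, assuming AAPCS64 with hard float, a prototype
   such as "int hypothetical_sum (int count, ...)" (so only w0 is used by
   named arguments and nothing named is passed on the stack or in vector
   registers), and the stdarg analysis not shrinking the save areas:

     ap.__stack   = <address of the first stack-passed vararg>;
     ap.__gr_top  = <top of the general-register save area>;
     ap.__vr_top  = ap.__gr_top - 64;   // 7*8 = 56, rounded up to 16
     ap.__gr_offs = -56;                // 7 unused GP argument registers
     ap.__vr_offs = -128;               // 8 unused vector registers
*/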
10596 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10598 static tree
10599 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10600 gimple_seq *post_p ATTRIBUTE_UNUSED)
10602 tree addr;
10603 bool indirect_p;
10604 bool is_ha; /* is HFA or HVA. */
10605 bool dw_align; /* double-word align. */
10606 machine_mode ag_mode = VOIDmode;
10607 int nregs;
10608 machine_mode mode;
10610 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10611 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10612 HOST_WIDE_INT size, rsize, adjust, align;
10613 tree t, u, cond1, cond2;
10615 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10616 if (indirect_p)
10617 type = build_pointer_type (type);
10619 mode = TYPE_MODE (type);
10621 f_stack = TYPE_FIELDS (va_list_type_node);
10622 f_grtop = DECL_CHAIN (f_stack);
10623 f_vrtop = DECL_CHAIN (f_grtop);
10624 f_groff = DECL_CHAIN (f_vrtop);
10625 f_vroff = DECL_CHAIN (f_groff);
10627 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10628 f_stack, NULL_TREE);
10629 size = int_size_in_bytes (type);
10630 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10632 dw_align = false;
10633 adjust = 0;
10634 if (aarch64_vfp_is_call_or_return_candidate (mode,
10635 type,
10636 &ag_mode,
10637 &nregs,
10638 &is_ha))
10640 /* TYPE passed in fp/simd registers. */
10641 if (!TARGET_FLOAT)
10642 aarch64_err_no_fpadvsimd (mode, "varargs");
10644 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10645 unshare_expr (valist), f_vrtop, NULL_TREE);
10646 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10647 unshare_expr (valist), f_vroff, NULL_TREE);
10649 rsize = nregs * UNITS_PER_VREG;
10651 if (is_ha)
10653 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10654 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10656 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10657 && size < UNITS_PER_VREG)
10659 adjust = UNITS_PER_VREG - size;
10662 else
10664 /* TYPE passed in general registers. */
10665 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10666 unshare_expr (valist), f_grtop, NULL_TREE);
10667 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10668 unshare_expr (valist), f_groff, NULL_TREE);
10669 rsize = ROUND_UP (size, UNITS_PER_WORD);
10670 nregs = rsize / UNITS_PER_WORD;
10672 if (align > 8)
10673 dw_align = true;
10675 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10676 && size < UNITS_PER_WORD)
10678 adjust = UNITS_PER_WORD - size;
10682 /* Get a local temporary for the field value. */
10683 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10685 /* Emit code to branch if off >= 0. */
10686 t = build2 (GE_EXPR, boolean_type_node, off,
10687 build_int_cst (TREE_TYPE (off), 0));
10688 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10690 if (dw_align)
10692 /* Emit: offs = (offs + 15) & -16. */
10693 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10694 build_int_cst (TREE_TYPE (off), 15));
10695 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10696 build_int_cst (TREE_TYPE (off), -16));
10697 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10699 else
10700 roundup = NULL;
10702 /* Update ap.__[g|v]r_offs */
10703 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10704 build_int_cst (TREE_TYPE (off), rsize));
10705 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10707 /* String up. */
10708 if (roundup)
10709 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10711 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10712 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10713 build_int_cst (TREE_TYPE (f_off), 0));
10714 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10716 /* String up: make sure the assignment happens before the use. */
10717 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10718 COND_EXPR_ELSE (cond1) = t;
10720 /* Prepare the trees handling the argument that is passed on the stack;
10721 the top-level node will be stored in ON_STACK. */
10722 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10723 if (align > 8)
10725 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10726 t = fold_convert (intDI_type_node, arg);
10727 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10728 build_int_cst (TREE_TYPE (t), 15));
10729 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10730 build_int_cst (TREE_TYPE (t), -16));
10731 t = fold_convert (TREE_TYPE (arg), t);
10732 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10734 else
10735 roundup = NULL;
10736 /* Advance ap.__stack */
10737 t = fold_convert (intDI_type_node, arg);
10738 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10739 build_int_cst (TREE_TYPE (t), size + 7));
10740 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10741 build_int_cst (TREE_TYPE (t), -8));
10742 t = fold_convert (TREE_TYPE (arg), t);
10743 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10744 /* String up roundup and advance. */
10745 if (roundup)
10746 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10747 /* String up with arg */
10748 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10749 /* Big-endianness related address adjustment. */
10750 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10751 && size < UNITS_PER_WORD)
10753 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10754 size_int (UNITS_PER_WORD - size));
10755 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10758 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10759 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10761 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10762 t = off;
10763 if (adjust)
10764 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10765 build_int_cst (TREE_TYPE (off), adjust));
10767 t = fold_convert (sizetype, t);
10768 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10770 if (is_ha)
10772 /* type ha; // treat as "struct {ftype field[n];}"
10773 ... [computing offs]
10774 for (i = 0; i <nregs; ++i, offs += 16)
10775 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10776 return ha; */
10777 int i;
10778 tree tmp_ha, field_t, field_ptr_t;
10780 /* Declare a local variable. */
10781 tmp_ha = create_tmp_var_raw (type, "ha");
10782 gimple_add_tmp_var (tmp_ha);
10784 /* Establish the base type. */
10785 switch (ag_mode)
10787 case E_SFmode:
10788 field_t = float_type_node;
10789 field_ptr_t = float_ptr_type_node;
10790 break;
10791 case E_DFmode:
10792 field_t = double_type_node;
10793 field_ptr_t = double_ptr_type_node;
10794 break;
10795 case E_TFmode:
10796 field_t = long_double_type_node;
10797 field_ptr_t = long_double_ptr_type_node;
10798 break;
10799 case E_HFmode:
10800 field_t = aarch64_fp16_type_node;
10801 field_ptr_t = aarch64_fp16_ptr_type_node;
10802 break;
10803 case E_V2SImode:
10804 case E_V4SImode:
10806 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10807 field_t = build_vector_type_for_mode (innertype, ag_mode);
10808 field_ptr_t = build_pointer_type (field_t);
10810 break;
10811 default:
10812 gcc_assert (0);
10815 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area); */
10816 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10817 addr = t;
10818 t = fold_convert (field_ptr_t, addr);
10819 t = build2 (MODIFY_EXPR, field_t,
10820 build1 (INDIRECT_REF, field_t, tmp_ha),
10821 build1 (INDIRECT_REF, field_t, t));
10823 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10824 for (i = 1; i < nregs; ++i)
10826 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10827 u = fold_convert (field_ptr_t, addr);
10828 u = build2 (MODIFY_EXPR, field_t,
10829 build2 (MEM_REF, field_t, tmp_ha,
10830 build_int_cst (field_ptr_t,
10831 (i *
10832 int_size_in_bytes (field_t)))),
10833 build1 (INDIRECT_REF, field_t, u));
10834 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10837 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10838 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10841 COND_EXPR_ELSE (cond2) = t;
10842 addr = fold_convert (build_pointer_type (type), cond1);
10843 addr = build_va_arg_indirect_ref (addr);
10845 if (indirect_p)
10846 addr = build_va_arg_indirect_ref (addr);
10848 return addr;
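/* A rough little-endian sketch of the trees built above for
   "va_arg (ap, int)" (pseudo-C, ignoring the big-endian adjustments and
   the homogeneous-aggregate copy loop):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + 8;            // rsize rounded up to 8
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;          // passed in a general register
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (ap.__stack + sizeof (int) + 7) & -8;
   done:
     result = *(int *) addr;
*/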
10851 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10853 static void
10854 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10855 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10856 int no_rtl)
10858 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10859 CUMULATIVE_ARGS local_cum;
10860 int gr_saved = cfun->va_list_gpr_size;
10861 int vr_saved = cfun->va_list_fpr_size;
10863 /* The caller has advanced CUM up to, but not beyond, the last named
10864 argument. Advance a local copy of CUM past the last "real" named
10865 argument, to find out how many registers are left over. */
10866 local_cum = *cum;
10867 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10869 /* Find out how many registers we need to save.
10870 Honor the tree-stdarg analysis results. */
10871 if (cfun->va_list_gpr_size)
10872 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10873 cfun->va_list_gpr_size / UNITS_PER_WORD);
10874 if (cfun->va_list_fpr_size)
10875 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10876 cfun->va_list_fpr_size / UNITS_PER_VREG);
10878 if (!TARGET_FLOAT)
10880 gcc_assert (local_cum.aapcs_nvrn == 0);
10881 vr_saved = 0;
10884 if (!no_rtl)
10886 if (gr_saved > 0)
10888 rtx ptr, mem;
10890 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10891 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10892 - gr_saved * UNITS_PER_WORD);
10893 mem = gen_frame_mem (BLKmode, ptr);
10894 set_mem_alias_set (mem, get_varargs_alias_set ());
10896 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10897 mem, gr_saved);
10899 if (vr_saved > 0)
10901 /* We can't use move_block_from_reg, because it will use
10902 the wrong mode, storing D regs only. */
10903 machine_mode mode = TImode;
10904 int off, i, vr_start;
10906 /* Set OFF to the offset from virtual_incoming_args_rtx of
10907 the first vector register. The VR save area lies below
10908 the GR one, and is aligned to 16 bytes. */
10909 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10910 STACK_BOUNDARY / BITS_PER_UNIT);
10911 off -= vr_saved * UNITS_PER_VREG;
10913 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10914 for (i = 0; i < vr_saved; ++i)
10916 rtx ptr, mem;
10918 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10919 mem = gen_frame_mem (mode, ptr);
10920 set_mem_alias_set (mem, get_varargs_alias_set ());
10921 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10922 off += UNITS_PER_VREG;
10927 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10928 any complication of having crtl->args.pretend_args_size changed. */
10929 cfun->machine->frame.saved_varargs_size
10930 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10931 STACK_BOUNDARY / BITS_PER_UNIT)
10932 + vr_saved * UNITS_PER_VREG);
10935 static void
10936 aarch64_conditional_register_usage (void)
10938 int i;
10939 if (!TARGET_FLOAT)
10941 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10943 fixed_regs[i] = 1;
10944 call_used_regs[i] = 1;
10949 /* Walk down the type tree of TYPE counting consecutive base elements.
10950 If *MODEP is VOIDmode, then set it to the first valid floating point
10951 type. If a non-floating point type is found, or if a floating point
10952 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10953 otherwise return the count in the sub-tree. */
10954 static int
10955 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10957 machine_mode mode;
10958 HOST_WIDE_INT size;
10960 switch (TREE_CODE (type))
10962 case REAL_TYPE:
10963 mode = TYPE_MODE (type);
10964 if (mode != DFmode && mode != SFmode
10965 && mode != TFmode && mode != HFmode)
10966 return -1;
10968 if (*modep == VOIDmode)
10969 *modep = mode;
10971 if (*modep == mode)
10972 return 1;
10974 break;
10976 case COMPLEX_TYPE:
10977 mode = TYPE_MODE (TREE_TYPE (type));
10978 if (mode != DFmode && mode != SFmode
10979 && mode != TFmode && mode != HFmode)
10980 return -1;
10982 if (*modep == VOIDmode)
10983 *modep = mode;
10985 if (*modep == mode)
10986 return 2;
10988 break;
10990 case VECTOR_TYPE:
10991 /* Use V2SImode and V4SImode as representatives of all 64-bit
10992 and 128-bit vector types. */
10993 size = int_size_in_bytes (type);
10994 switch (size)
10996 case 8:
10997 mode = V2SImode;
10998 break;
10999 case 16:
11000 mode = V4SImode;
11001 break;
11002 default:
11003 return -1;
11006 if (*modep == VOIDmode)
11007 *modep = mode;
11009 /* Vector modes are considered to be opaque: two vectors are
11010 equivalent for the purposes of being homogeneous aggregates
11011 if they are the same size. */
11012 if (*modep == mode)
11013 return 1;
11015 break;
11017 case ARRAY_TYPE:
11019 int count;
11020 tree index = TYPE_DOMAIN (type);
11022 /* Can't handle incomplete types nor sizes that are not
11023 fixed. */
11024 if (!COMPLETE_TYPE_P (type)
11025 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11026 return -1;
11028 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11029 if (count == -1
11030 || !index
11031 || !TYPE_MAX_VALUE (index)
11032 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11033 || !TYPE_MIN_VALUE (index)
11034 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11035 || count < 0)
11036 return -1;
11038 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11039 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11041 /* There must be no padding. */
11042 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11043 return -1;
11045 return count;
11048 case RECORD_TYPE:
11050 int count = 0;
11051 int sub_count;
11052 tree field;
11054 /* Can't handle incomplete types nor sizes that are not
11055 fixed. */
11056 if (!COMPLETE_TYPE_P (type)
11057 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11058 return -1;
11060 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11062 if (TREE_CODE (field) != FIELD_DECL)
11063 continue;
11065 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11066 if (sub_count < 0)
11067 return -1;
11068 count += sub_count;
11071 /* There must be no padding. */
11072 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11073 return -1;
11075 return count;
11078 case UNION_TYPE:
11079 case QUAL_UNION_TYPE:
11081 /* These aren't very interesting except in a degenerate case. */
11082 int count = 0;
11083 int sub_count;
11084 tree field;
11086 /* Can't handle incomplete types nor sizes that are not
11087 fixed. */
11088 if (!COMPLETE_TYPE_P (type)
11089 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11090 return -1;
11092 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11094 if (TREE_CODE (field) != FIELD_DECL)
11095 continue;
11097 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11098 if (sub_count < 0)
11099 return -1;
11100 count = count > sub_count ? count : sub_count;
11103 /* There must be no padding. */
11104 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11105 return -1;
11107 return count;
11110 default:
11111 break;
11114 return -1;
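/* For example (an illustrative type, not one used by GCC itself):

     struct hypothetical_quad { float x, y, z, w; };

   the walk above returns 4 with *MODEP set to SFmode, so the struct can be
   treated as a homogeneous floating-point aggregate of four single-precision
   values; adding an "int" member would make the corresponding sub-candidate
   call return -1 and disqualify the type.  */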
11117 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11118 type as described in AAPCS64 \S 4.1.2.
11120 See the comment above aarch64_composite_type_p for the notes on MODE. */
11122 static bool
11123 aarch64_short_vector_p (const_tree type,
11124 machine_mode mode)
11126 HOST_WIDE_INT size = -1;
11128 if (type && TREE_CODE (type) == VECTOR_TYPE)
11129 size = int_size_in_bytes (type);
11130 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11131 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11132 size = GET_MODE_SIZE (mode);
11134 return (size == 8 || size == 16);
11137 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11138 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11139 array types. The C99 floating-point complex types are also considered
11140 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11141 types, which are GCC extensions and out of the scope of AAPCS64, are
11142 treated as composite types here as well.
11144 Note that MODE itself is not sufficient in determining whether a type
11145 is such a composite type or not. This is because
11146 stor-layout.c:compute_record_mode may have already changed the MODE
11147 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11148 structure with only one field may have its MODE set to the mode of the
11149 field. Also an integer mode whose size matches the size of the
11150 RECORD_TYPE type may be used to substitute the original mode
11151 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11152 solely relied on. */
11154 static bool
11155 aarch64_composite_type_p (const_tree type,
11156 machine_mode mode)
11158 if (aarch64_short_vector_p (type, mode))
11159 return false;
11161 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11162 return true;
11164 if (mode == BLKmode
11165 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11166 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11167 return true;
11169 return false;
11172 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11173 shall be passed or returned in simd/fp register(s) (providing these
11174 parameter passing registers are available).
11176 Upon successful return, *COUNT returns the number of needed registers,
11177 *BASE_MODE returns the mode of the individual register and when IS_HA
11178 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11179 floating-point aggregate or a homogeneous short-vector aggregate. */
11181 static bool
11182 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11183 const_tree type,
11184 machine_mode *base_mode,
11185 int *count,
11186 bool *is_ha)
11188 machine_mode new_mode = VOIDmode;
11189 bool composite_p = aarch64_composite_type_p (type, mode);
11191 if (is_ha != NULL) *is_ha = false;
11193 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11194 || aarch64_short_vector_p (type, mode))
11196 *count = 1;
11197 new_mode = mode;
11199 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11201 if (is_ha != NULL) *is_ha = true;
11202 *count = 2;
11203 new_mode = GET_MODE_INNER (mode);
11205 else if (type && composite_p)
11207 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11209 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11211 if (is_ha != NULL) *is_ha = true;
11212 *count = ag_count;
11214 else
11215 return false;
11217 else
11218 return false;
11220 *base_mode = new_mode;
11221 return true;
11224 /* Implement TARGET_STRUCT_VALUE_RTX. */
11226 static rtx
11227 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11228 int incoming ATTRIBUTE_UNUSED)
11230 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11233 /* Implements target hook vector_mode_supported_p. */
11234 static bool
11235 aarch64_vector_mode_supported_p (machine_mode mode)
11237 if (TARGET_SIMD
11238 && (mode == V4SImode || mode == V8HImode
11239 || mode == V16QImode || mode == V2DImode
11240 || mode == V2SImode || mode == V4HImode
11241 || mode == V8QImode || mode == V2SFmode
11242 || mode == V4SFmode || mode == V2DFmode
11243 || mode == V4HFmode || mode == V8HFmode
11244 || mode == V1DFmode))
11245 return true;
11247 return false;
11250 /* Return appropriate SIMD container
11251 for MODE within a vector of WIDTH bits. */
11252 static machine_mode
11253 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11255 gcc_assert (width == 64 || width == 128);
11256 if (TARGET_SIMD)
11258 if (width == 128)
11259 switch (mode)
11261 case E_DFmode:
11262 return V2DFmode;
11263 case E_SFmode:
11264 return V4SFmode;
11265 case E_HFmode:
11266 return V8HFmode;
11267 case E_SImode:
11268 return V4SImode;
11269 case E_HImode:
11270 return V8HImode;
11271 case E_QImode:
11272 return V16QImode;
11273 case E_DImode:
11274 return V2DImode;
11275 default:
11276 break;
11278 else
11279 switch (mode)
11281 case E_SFmode:
11282 return V2SFmode;
11283 case E_HFmode:
11284 return V4HFmode;
11285 case E_SImode:
11286 return V2SImode;
11287 case E_HImode:
11288 return V4HImode;
11289 case E_QImode:
11290 return V8QImode;
11291 default:
11292 break;
11295 return word_mode;
11298 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11299 static machine_mode
11300 aarch64_preferred_simd_mode (scalar_mode mode)
11302 return aarch64_simd_container_mode (mode, 128);
11305 /* Return the bitmask of possible vector sizes for the vectorizer
11306 to iterate over. */
11307 static unsigned int
11308 aarch64_autovectorize_vector_sizes (void)
11310 return (16 | 8);
11313 /* Implement TARGET_MANGLE_TYPE. */
11315 static const char *
11316 aarch64_mangle_type (const_tree type)
11318 /* The AArch64 ABI documents say that "__va_list" has to be
11319 mangled as if it is in the "std" namespace. */
11320 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11321 return "St9__va_list";
11323 /* Half-precision float. */
11324 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11325 return "Dh";
11327 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11328 builtin types. */
11329 if (TYPE_NAME (type) != NULL)
11330 return aarch64_mangle_builtin_type (type);
11332 /* Use the default mangling. */
11333 return NULL;
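/* As a rough illustration (hypothetical C++ declaration): a function

     void hypothetical_print (__builtin_va_list ap);

   is mangled as _Z18hypothetical_printSt9__va_list, using the
   "St9__va_list" string returned above, and an __fp16 parameter would be
   mangled as "Dh".  */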
11336 /* Find the first rtx_insn before insn that will generate an assembly
11337 instruction. */
11339 static rtx_insn *
11340 aarch64_prev_real_insn (rtx_insn *insn)
11342 if (!insn)
11343 return NULL;
11347 insn = prev_real_insn (insn);
11349 while (insn && recog_memoized (insn) < 0);
11351 return insn;
11354 static bool
11355 is_madd_op (enum attr_type t1)
11357 unsigned int i;
11358 /* A number of these may be AArch32 only. */
11359 enum attr_type mlatypes[] = {
11360 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11361 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11362 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11365 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11367 if (t1 == mlatypes[i])
11368 return true;
11371 return false;
11374 /* Check if there is a register dependency between a load and the insn
11375 for which we hold recog_data. */
11377 static bool
11378 dep_between_memop_and_curr (rtx memop)
11380 rtx load_reg;
11381 int opno;
11383 gcc_assert (GET_CODE (memop) == SET);
11385 if (!REG_P (SET_DEST (memop)))
11386 return false;
11388 load_reg = SET_DEST (memop);
11389 for (opno = 1; opno < recog_data.n_operands; opno++)
11391 rtx operand = recog_data.operand[opno];
11392 if (REG_P (operand)
11393 && reg_overlap_mentioned_p (load_reg, operand))
11394 return true;
11397 return false;
11401 /* When working around the Cortex-A53 erratum 835769,
11402 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11403 instruction and has a preceding memory instruction such that a NOP
11404 should be inserted between them. */
11406 bool
11407 aarch64_madd_needs_nop (rtx_insn* insn)
11409 enum attr_type attr_type;
11410 rtx_insn *prev;
11411 rtx body;
11413 if (!TARGET_FIX_ERR_A53_835769)
11414 return false;
11416 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11417 return false;
11419 attr_type = get_attr_type (insn);
11420 if (!is_madd_op (attr_type))
11421 return false;
11423 prev = aarch64_prev_real_insn (insn);
11424 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11425 Restore recog state to INSN to avoid state corruption. */
11426 extract_constrain_insn_cached (insn);
11428 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11429 return false;
11431 body = single_set (prev);
11433 /* If the previous insn is a memory op and there is no dependency between
11434 it and the DImode madd, emit a NOP between them. If body is NULL then we
11435 have a complex memory operation, probably a load/store pair.
11436 Be conservative for now and emit a NOP. */
11437 if (GET_MODE (recog_data.operand[0]) == DImode
11438 && (!body || !dep_between_memop_and_curr (body)))
11439 return true;
11441 return false;
11446 /* Implement FINAL_PRESCAN_INSN. */
11448 void
11449 aarch64_final_prescan_insn (rtx_insn *insn)
11451 if (aarch64_madd_needs_nop (insn))
11452 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
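/* Schematically, with -mfix-cortex-a53-835769 the effect is output such as
   the following (illustrative operands):

     ldr  x2, [x0]
     nop  // between mem op and mult-accumulate
     madd x3, x4, x5, x3

   whenever the memory operation and the 64-bit multiply-accumulate would
   otherwise be adjacent.  */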
11456 /* Return the equivalent letter for size. */
11457 static char
11458 sizetochar (int size)
11460 switch (size)
11462 case 64: return 'd';
11463 case 32: return 's';
11464 case 16: return 'h';
11465 case 8 : return 'b';
11466 default: gcc_unreachable ();
11470 /* Return true iff x is a uniform vector of floating-point
11471 constants, and the constant can be represented in
11472 quarter-precision form. Note, as aarch64_float_const_representable_p
11473 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11474 static bool
11475 aarch64_vect_float_const_representable_p (rtx x)
11477 rtx elt;
11478 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11479 && const_vec_duplicate_p (x, &elt)
11480 && aarch64_float_const_representable_p (elt));
11483 /* Return true for valid and false for invalid. */
11484 bool
11485 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11486 struct simd_immediate_info *info)
11488 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11489 matches = 1; \
11490 for (i = 0; i < idx; i += (STRIDE)) \
11491 if (!(TEST)) \
11492 matches = 0; \
11493 if (matches) \
11495 immtype = (CLASS); \
11496 elsize = (ELSIZE); \
11497 eshift = (SHIFT); \
11498 emvn = (NEG); \
11499 break; \
11502 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11503 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11504 unsigned char bytes[16];
11505 int immtype = -1, matches;
11506 unsigned int invmask = inverse ? 0xff : 0;
11507 int eshift, emvn;
11509 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11511 if (! (aarch64_simd_imm_zero_p (op, mode)
11512 || aarch64_vect_float_const_representable_p (op)))
11513 return false;
11515 if (info)
11517 rtx elt = CONST_VECTOR_ELT (op, 0);
11518 scalar_float_mode elt_mode
11519 = as_a <scalar_float_mode> (GET_MODE (elt));
11521 info->value = elt;
11522 info->element_width = GET_MODE_BITSIZE (elt_mode);
11523 info->mvn = false;
11524 info->shift = 0;
11527 return true;
11530 /* Splat vector constant out into a byte vector. */
11531 for (i = 0; i < n_elts; i++)
11533 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11534 it must be laid out in the vector register in reverse order. */
11535 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11536 unsigned HOST_WIDE_INT elpart;
11538 gcc_assert (CONST_INT_P (el));
11539 elpart = INTVAL (el);
11541 for (unsigned int byte = 0; byte < innersize; byte++)
11543 bytes[idx++] = (elpart & 0xff) ^ invmask;
11544 elpart >>= BITS_PER_UNIT;
11549 /* Sanity check. */
11550 gcc_assert (idx == GET_MODE_SIZE (mode));
11554 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11555 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11557 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11558 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11560 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11561 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11563 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11564 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11566 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11568 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11570 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11571 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11573 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11574 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11576 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11577 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11579 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11580 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11582 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11584 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11586 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11587 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11589 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11590 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11592 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11593 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11595 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11596 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11598 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11600 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11601 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11603 while (0);
11605 if (immtype == -1)
11606 return false;
11608 if (info)
11610 info->element_width = elsize;
11611 info->mvn = emvn != 0;
11612 info->shift = eshift;
11614 unsigned HOST_WIDE_INT imm = 0;
11616 if (immtype >= 12 && immtype <= 15)
11617 info->msl = true;
11619 /* Un-invert bytes of recognized vector, if necessary. */
11620 if (invmask != 0)
11621 for (i = 0; i < idx; i++)
11622 bytes[i] ^= invmask;
11624 if (immtype == 17)
11626 /* FIXME: Broken on 32-bit H_W_I hosts. */
11627 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11629 for (i = 0; i < 8; i++)
11630 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11631 << (i * BITS_PER_UNIT);
11634 info->value = GEN_INT (imm);
11636 else
11638 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11639 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11641 /* Construct 'abcdefgh' because the assembler cannot handle
11642 generic constants. */
11643 if (info->mvn)
11644 imm = ~imm;
11645 imm = (imm >> info->shift) & 0xff;
11646 info->value = GEN_INT (imm);
11650 return true;
11651 #undef CHECK
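/* A worked example of the matching above (illustrative constant): a V4SI
   vector whose every element is 0x00ab0000 hits the CHECK (4, 32, 2, ...)
   case, yielding element_width 32, shift 16, mvn false and value 0xab,
   i.e. something like "movi v0.4s, 0xab, lsl 16".  */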
11654 /* Check if immediate shift constants are within range. */
11655 bool
11656 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11658 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11659 if (left)
11660 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11661 else
11662 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11665 /* Return true if X is a uniform vector where all elements
11666 are either the floating-point constant 0.0 or the
11667 integer constant 0. */
11668 bool
11669 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11671 return x == CONST0_RTX (mode);
11675 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11676 operation of width WIDTH at bit position POS. */
11679 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11681 gcc_assert (CONST_INT_P (width));
11682 gcc_assert (CONST_INT_P (pos));
11684 unsigned HOST_WIDE_INT mask
11685 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11686 return GEN_INT (mask << UINTVAL (pos));
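/* For instance, WIDTH == 8 and POS == 16 give
   ((1 << 8) - 1) << 16 == 0x00ff0000, the mask selecting bits 16..23.  */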
11689 bool
11690 aarch64_mov_operand_p (rtx x, machine_mode mode)
11692 if (GET_CODE (x) == HIGH
11693 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11694 return true;
11696 if (CONST_INT_P (x))
11697 return true;
11699 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11700 return true;
11702 return aarch64_classify_symbolic_expression (x)
11703 == SYMBOL_TINY_ABSOLUTE;
11706 /* Return a const_int vector of VAL. */
11708 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11710 int nunits = GET_MODE_NUNITS (mode);
11711 rtvec v = rtvec_alloc (nunits);
11712 int i;
11714 rtx cache = GEN_INT (val);
11716 for (i=0; i < nunits; i++)
11717 RTVEC_ELT (v, i) = cache;
11719 return gen_rtx_CONST_VECTOR (mode, v);
11722 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11724 bool
11725 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11727 machine_mode vmode;
11729 vmode = aarch64_preferred_simd_mode (mode);
11730 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11731 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11734 /* Construct and return a PARALLEL RTX vector with elements numbering the
11735 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11736 the vector - from the perspective of the architecture. This does not
11737 line up with GCC's perspective on lane numbers, so we end up with
11738 different masks depending on our target endian-ness. The diagram
11739 below may help. We must draw the distinction when building masks
11740 which select one half of the vector. An instruction selecting
11741 architectural low-lanes for a big-endian target must be described using
11742 a mask selecting GCC high-lanes.
11744 Big-Endian Little-Endian
11746 GCC 0 1 2 3 3 2 1 0
11747 | x | x | x | x | | x | x | x | x |
11748 Architecture 3 2 1 0 3 2 1 0
11750 Low Mask: { 2, 3 } { 0, 1 }
11751 High Mask: { 0, 1 } { 2, 3 }
11755 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11757 int nunits = GET_MODE_NUNITS (mode);
11758 rtvec v = rtvec_alloc (nunits / 2);
11759 int high_base = nunits / 2;
11760 int low_base = 0;
11761 int base;
11762 rtx t1;
11763 int i;
11765 if (BYTES_BIG_ENDIAN)
11766 base = high ? low_base : high_base;
11767 else
11768 base = high ? high_base : low_base;
11770 for (i = 0; i < nunits / 2; i++)
11771 RTVEC_ELT (v, i) = GEN_INT (base + i);
11773 t1 = gen_rtx_PARALLEL (mode, v);
11774 return t1;
11777 /* Check OP for validity as a PARALLEL RTX vector with elements
11778 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11779 from the perspective of the architecture. See the diagram above
11780 aarch64_simd_vect_par_cnst_half for more details. */
11782 bool
11783 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11784 bool high)
11786 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11787 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11788 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11789 int i = 0;
11791 if (!VECTOR_MODE_P (mode))
11792 return false;
11794 if (count_op != count_ideal)
11795 return false;
11797 for (i = 0; i < count_ideal; i++)
11799 rtx elt_op = XVECEXP (op, 0, i);
11800 rtx elt_ideal = XVECEXP (ideal, 0, i);
11802 if (!CONST_INT_P (elt_op)
11803 || INTVAL (elt_ideal) != INTVAL (elt_op))
11804 return false;
11806 return true;
11809 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11810 HIGH (exclusive). */
11811 void
11812 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11813 const_tree exp)
11815 HOST_WIDE_INT lane;
11816 gcc_assert (CONST_INT_P (operand));
11817 lane = INTVAL (operand);
11819 if (lane < low || lane >= high)
11821 if (exp)
11822 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11823 else
11824 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11828 /* Return TRUE if OP is a valid vector addressing mode. */
11829 bool
11830 aarch64_simd_mem_operand_p (rtx op)
11832 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11833 || REG_P (XEXP (op, 0)));
11836 /* Emit a register copy from operand to operand, taking care not to
11837 early-clobber source registers in the process.
11839 COUNT is the number of components into which the copy needs to be
11840 decomposed. */
11841 void
11842 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11843 unsigned int count)
11845 unsigned int i;
11846 int rdest = REGNO (operands[0]);
11847 int rsrc = REGNO (operands[1]);
11849 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11850 || rdest < rsrc)
11851 for (i = 0; i < count; i++)
11852 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11853 gen_rtx_REG (mode, rsrc + i));
11854 else
11855 for (i = 0; i < count; i++)
11856 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11857 gen_rtx_REG (mode, rsrc + count - i - 1));
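/* Worked example, added for illustration only: copying a two-register value
   from V1..V2 into V2..V3 overlaps and has RDEST > RSRC, so the second loop
   above moves V2->V3 before V1->V2; copying low-to-high would clobber V2
   before it had been read.  */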
11860 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11861 one of the VSTRUCT modes: OI, CI, or XI. */
11863 aarch64_simd_attr_length_rglist (machine_mode mode)
11865 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11868 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11869 alignment of a vector to 128 bits. */
11870 static HOST_WIDE_INT
11871 aarch64_simd_vector_alignment (const_tree type)
11873 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11874 return MIN (align, 128);
11877 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11878 static bool
11879 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11881 if (is_packed)
11882 return false;
11884 /* We guarantee alignment for vectors up to 128-bits. */
11885 if (tree_int_cst_compare (TYPE_SIZE (type),
11886 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11887 return false;
11889 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11890 return true;
11893 /* Return true if the vector misalignment factor is supported by the
11894 target. */
11895 static bool
11896 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11897 const_tree type, int misalignment,
11898 bool is_packed)
11900 if (TARGET_SIMD && STRICT_ALIGNMENT)
11902 /* Return false if the movmisalign pattern is not supported for this mode. */
11903 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11904 return false;
11906 /* Misalignment factor is unknown at compile time. */
11907 if (misalignment == -1)
11908 return false;
11910 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11911 is_packed);
11914 /* If VALS is a vector constant that can be loaded into a register
11915 using DUP, generate instructions to do so and return an RTX to
11916 assign to the register. Otherwise return NULL_RTX. */
11917 static rtx
11918 aarch64_simd_dup_constant (rtx vals)
11920 machine_mode mode = GET_MODE (vals);
11921 machine_mode inner_mode = GET_MODE_INNER (mode);
11922 rtx x;
11924 if (!const_vec_duplicate_p (vals, &x))
11925 return NULL_RTX;
11927 /* We can load this constant by using DUP and a constant in a
11928 single ARM register. This will be cheaper than a vector
11929 load. */
11930 x = copy_to_mode_reg (inner_mode, x);
11931 return gen_rtx_VEC_DUPLICATE (mode, x);
11935 /* Generate code to load VALS, which is a PARALLEL containing only
11936 constants (for vec_init) or CONST_VECTOR, efficiently into a
11937 register. Returns an RTX to copy into the register, or NULL_RTX
11938 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11939 static rtx
11940 aarch64_simd_make_constant (rtx vals)
11942 machine_mode mode = GET_MODE (vals);
11943 rtx const_dup;
11944 rtx const_vec = NULL_RTX;
11945 int n_elts = GET_MODE_NUNITS (mode);
11946 int n_const = 0;
11947 int i;
11949 if (GET_CODE (vals) == CONST_VECTOR)
11950 const_vec = vals;
11951 else if (GET_CODE (vals) == PARALLEL)
11953 /* A CONST_VECTOR must contain only CONST_INTs and
11954 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11955 Only store valid constants in a CONST_VECTOR. */
11956 for (i = 0; i < n_elts; ++i)
11958 rtx x = XVECEXP (vals, 0, i);
11959 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11960 n_const++;
11962 if (n_const == n_elts)
11963 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11965 else
11966 gcc_unreachable ();
11968 if (const_vec != NULL_RTX
11969 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11970 /* Load using MOVI/MVNI. */
11971 return const_vec;
11972 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11973 /* Loaded using DUP. */
11974 return const_dup;
11975 else if (const_vec != NULL_RTX)
11976 /* Load from constant pool. We cannot take advantage of single-cycle
11977 LD1 because we need a PC-relative addressing mode. */
11978 return const_vec;
11979 else
11980 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11981 We cannot construct an initializer. */
11982 return NULL_RTX;
11985 /* Expand a vector initialisation sequence, such that TARGET is
11986 initialised to contain VALS. */
11988 void
11989 aarch64_expand_vector_init (rtx target, rtx vals)
11991 machine_mode mode = GET_MODE (target);
11992 scalar_mode inner_mode = GET_MODE_INNER (mode);
11993 /* The number of vector elements. */
11994 int n_elts = GET_MODE_NUNITS (mode);
11995 /* The number of vector elements which are not constant. */
11996 int n_var = 0;
11997 rtx any_const = NULL_RTX;
11998 /* The first element of vals. */
11999 rtx v0 = XVECEXP (vals, 0, 0);
12000 bool all_same = true;
12002 /* Count the number of variable elements to initialise. */
12003 for (int i = 0; i < n_elts; ++i)
12005 rtx x = XVECEXP (vals, 0, i);
12006 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12007 ++n_var;
12008 else
12009 any_const = x;
12011 all_same &= rtx_equal_p (x, v0);
12014 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12015 how best to handle this. */
12016 if (n_var == 0)
12018 rtx constant = aarch64_simd_make_constant (vals);
12019 if (constant != NULL_RTX)
12021 emit_move_insn (target, constant);
12022 return;
12026 /* Splat a single non-constant element if we can. */
12027 if (all_same)
12029 rtx x = copy_to_mode_reg (inner_mode, v0);
12030 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12031 return;
12034 enum insn_code icode = optab_handler (vec_set_optab, mode);
12035 gcc_assert (icode != CODE_FOR_nothing);
12037 /* If there are only variable elements, try to optimize
12038 the insertion using dup for the most common element
12039 followed by insertions. */
12041 /* The algorithm will fill matches[*][0] with the earliest matching element,
12042 and matches[X][1] with the count of duplicate elements (if X is the
12043 earliest element which has duplicates). */
12045 if (n_var == n_elts && n_elts <= 16)
12047 int matches[16][2] = {0};
12048 for (int i = 0; i < n_elts; i++)
12050 for (int j = 0; j <= i; j++)
12052 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12054 matches[i][0] = j;
12055 matches[j][1]++;
12056 break;
12060 int maxelement = 0;
12061 int maxv = 0;
12062 for (int i = 0; i < n_elts; i++)
12063 if (matches[i][1] > maxv)
12065 maxelement = i;
12066 maxv = matches[i][1];
12069 /* Create a duplicate of the most common element. */
12070 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12071 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12073 /* Insert the rest. */
12074 for (int i = 0; i < n_elts; i++)
12076 rtx x = XVECEXP (vals, 0, i);
12077 if (matches[i][0] == maxelement)
12078 continue;
12079 x = copy_to_mode_reg (inner_mode, x);
12080 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12082 return;
12085 /* Initialise a vector which is part-variable. We want to first try
12086 to build those lanes which are constant in the most efficient way we
12087 can. */
12088 if (n_var != n_elts)
12090 rtx copy = copy_rtx (vals);
12092 /* Load constant part of vector. We really don't care what goes into the
12093 parts we will overwrite, but we're more likely to be able to load the
12094 constant efficiently if it has fewer, larger, repeating parts
12095 (see aarch64_simd_valid_immediate). */
12096 for (int i = 0; i < n_elts; i++)
12098 rtx x = XVECEXP (vals, 0, i);
12099 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12100 continue;
12101 rtx subst = any_const;
12102 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12104 /* Look in the copied vector, as more elements are const. */
12105 rtx test = XVECEXP (copy, 0, i ^ bit);
12106 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12108 subst = test;
12109 break;
12112 XVECEXP (copy, 0, i) = subst;
12114 aarch64_expand_vector_init (target, copy);
12117 /* Insert the variable lanes directly. */
12118 for (int i = 0; i < n_elts; i++)
12120 rtx x = XVECEXP (vals, 0, i);
12121 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12122 continue;
12123 x = copy_to_mode_reg (inner_mode, x);
12124 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
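/* Illustrative example of the all-variable path above: for V4SImode lanes
   { a, a, b, a }, element A is the most common, so we emit one DUP of A into
   the whole vector followed by a single insert of B into lane 2, instead of
   four separate lane inserts.  */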
12128 static unsigned HOST_WIDE_INT
12129 aarch64_shift_truncation_mask (machine_mode mode)
12131 return
12132 (!SHIFT_COUNT_TRUNCATED
12133 || aarch64_vector_mode_supported_p (mode)
12134 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12137 /* Select a format to encode pointers in exception handling data. */
12139 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12141 int type;
12142 switch (aarch64_cmodel)
12144 case AARCH64_CMODEL_TINY:
12145 case AARCH64_CMODEL_TINY_PIC:
12146 case AARCH64_CMODEL_SMALL:
12147 case AARCH64_CMODEL_SMALL_PIC:
12148 case AARCH64_CMODEL_SMALL_SPIC:
12149 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12150 for everything. */
12151 type = DW_EH_PE_sdata4;
12152 break;
12153 default:
12154 /* No assumptions here. 8-byte relocs required. */
12155 type = DW_EH_PE_sdata8;
12156 break;
12158 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12161 /* The last .arch and .tune assembly strings that we printed. */
12162 static std::string aarch64_last_printed_arch_string;
12163 static std::string aarch64_last_printed_tune_string;
12165 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12166 by the function fndecl. */
12168 void
12169 aarch64_declare_function_name (FILE *stream, const char* name,
12170 tree fndecl)
12172 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12174 struct cl_target_option *targ_options;
12175 if (target_parts)
12176 targ_options = TREE_TARGET_OPTION (target_parts);
12177 else
12178 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12179 gcc_assert (targ_options);
12181 const struct processor *this_arch
12182 = aarch64_get_arch (targ_options->x_explicit_arch);
12184 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12185 std::string extension
12186 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12187 this_arch->flags);
12188 /* Only update the assembler .arch string if it is distinct from the last
12189 such string we printed. */
12190 std::string to_print = this_arch->name + extension;
12191 if (to_print != aarch64_last_printed_arch_string)
12193 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12194 aarch64_last_printed_arch_string = to_print;
12197 /* Print the cpu name we're tuning for in the comments; it might be
12198 useful to readers of the generated asm. Do it only when it changes
12199 from function to function and verbose assembly is requested. */
12200 const struct processor *this_tune
12201 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12203 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12205 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12206 this_tune->name);
12207 aarch64_last_printed_tune_string = this_tune->name;
12210 /* Don't forget the type directive for ELF. */
12211 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12212 ASM_OUTPUT_LABEL (stream, name);
12215 /* Implement TARGET_ASM_FILE_START. Output the assembly header. */
12217 static void
12218 aarch64_start_file (void)
12220 struct cl_target_option *default_options
12221 = TREE_TARGET_OPTION (target_option_default_node);
12223 const struct processor *default_arch
12224 = aarch64_get_arch (default_options->x_explicit_arch);
12225 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12226 std::string extension
12227 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12228 default_arch->flags);
12230 aarch64_last_printed_arch_string = default_arch->name + extension;
12231 aarch64_last_printed_tune_string = "";
12232 asm_fprintf (asm_out_file, "\t.arch %s\n",
12233 aarch64_last_printed_arch_string.c_str ());
12235 default_file_start ();
12238 /* Emit load exclusive. */
12240 static void
12241 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12242 rtx mem, rtx model_rtx)
12244 rtx (*gen) (rtx, rtx, rtx);
12246 switch (mode)
12248 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12249 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12250 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12251 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12252 default:
12253 gcc_unreachable ();
12256 emit_insn (gen (rval, mem, model_rtx));
12259 /* Emit store exclusive. */
12261 static void
12262 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12263 rtx rval, rtx mem, rtx model_rtx)
12265 rtx (*gen) (rtx, rtx, rtx, rtx);
12267 switch (mode)
12269 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12270 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12271 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12272 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12273 default:
12274 gcc_unreachable ();
12277 emit_insn (gen (bval, rval, mem, model_rtx));
12280 /* Mark the previous jump instruction as unlikely. */
12282 static void
12283 aarch64_emit_unlikely_jump (rtx insn)
12285 rtx_insn *jump = emit_jump_insn (insn);
12286 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12289 /* Expand a compare and swap pattern. */
12291 void
12292 aarch64_expand_compare_and_swap (rtx operands[])
12294 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12295 machine_mode mode, cmp_mode;
12296 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12297 int idx;
12298 gen_cas_fn gen;
12299 const gen_cas_fn split_cas[] =
12301 gen_aarch64_compare_and_swapqi,
12302 gen_aarch64_compare_and_swaphi,
12303 gen_aarch64_compare_and_swapsi,
12304 gen_aarch64_compare_and_swapdi
12306 const gen_cas_fn atomic_cas[] =
12308 gen_aarch64_compare_and_swapqi_lse,
12309 gen_aarch64_compare_and_swaphi_lse,
12310 gen_aarch64_compare_and_swapsi_lse,
12311 gen_aarch64_compare_and_swapdi_lse
12314 bval = operands[0];
12315 rval = operands[1];
12316 mem = operands[2];
12317 oldval = operands[3];
12318 newval = operands[4];
12319 is_weak = operands[5];
12320 mod_s = operands[6];
12321 mod_f = operands[7];
12322 mode = GET_MODE (mem);
12323 cmp_mode = mode;
12325 /* Normally the succ memory model must be stronger than fail, but in the
12326 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12327 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12329 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12330 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12331 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12333 switch (mode)
12335 case E_QImode:
12336 case E_HImode:
12337 /* For short modes, we're going to perform the comparison in SImode,
12338 so do the zero-extension now. */
12339 cmp_mode = SImode;
12340 rval = gen_reg_rtx (SImode);
12341 oldval = convert_modes (SImode, mode, oldval, true);
12342 /* Fall through. */
12344 case E_SImode:
12345 case E_DImode:
12346 /* Force the value into a register if needed. */
12347 if (!aarch64_plus_operand (oldval, mode))
12348 oldval = force_reg (cmp_mode, oldval);
12349 break;
12351 default:
12352 gcc_unreachable ();
12355 switch (mode)
12357 case E_QImode: idx = 0; break;
12358 case E_HImode: idx = 1; break;
12359 case E_SImode: idx = 2; break;
12360 case E_DImode: idx = 3; break;
12361 default:
12362 gcc_unreachable ();
12364 if (TARGET_LSE)
12365 gen = atomic_cas[idx];
12366 else
12367 gen = split_cas[idx];
12369 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12371 if (mode == QImode || mode == HImode)
12372 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12374 x = gen_rtx_REG (CCmode, CC_REGNUM);
12375 x = gen_rtx_EQ (SImode, x, const0_rtx);
12376 emit_insn (gen_rtx_SET (bval, x));
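/* Usage sketch, for illustration only: a source-level call such as
   __atomic_compare_exchange_n (&x, &expected, desired, 0,
   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED) ultimately reaches this expander;
   with the LSE atomics it maps onto the single-instruction CAS patterns,
   otherwise onto the load/store-exclusive form that is later split by
   aarch64_split_compare_and_swap below.  */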
12379 /* Test whether the target supports using an atomic load-operate instruction.
12380 CODE is the operation. Returns FALSE if the operation isn't supported by
12381 the architecture. */
12385 bool
12386 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12388 if (!TARGET_LSE)
12389 return false;
12391 switch (code)
12393 case SET:
12394 case AND:
12395 case IOR:
12396 case XOR:
12397 case MINUS:
12398 case PLUS:
12399 return true;
12400 default:
12401 return false;
12405 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12406 sequence implementing an atomic operation. */
12408 static void
12409 aarch64_emit_post_barrier (enum memmodel model)
12411 const enum memmodel base_model = memmodel_base (model);
12413 if (is_mm_sync (model)
12414 && (base_model == MEMMODEL_ACQUIRE
12415 || base_model == MEMMODEL_ACQ_REL
12416 || base_model == MEMMODEL_SEQ_CST))
12418 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12422 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12423 for the data in memory. EXPECTED is the value expected to be in memory.
12424 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12425 is the memory ordering to use. */
12427 void
12428 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12429 rtx expected, rtx desired,
12430 rtx model)
12432 rtx (*gen) (rtx, rtx, rtx, rtx);
12433 machine_mode mode;
12435 mode = GET_MODE (mem);
12437 switch (mode)
12439 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12440 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12441 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12442 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12443 default:
12444 gcc_unreachable ();
12447 /* Move the expected value into the CAS destination register. */
12448 emit_insn (gen_rtx_SET (rval, expected));
12450 /* Emit the CAS. */
12451 emit_insn (gen (rval, mem, desired, model));
12453 /* Compare the expected value with the value loaded by the CAS, to establish
12454 whether the swap was made. */
12455 aarch64_gen_compare_reg (EQ, rval, expected);
12458 /* Split a compare and swap pattern. */
12460 void
12461 aarch64_split_compare_and_swap (rtx operands[])
12463 rtx rval, mem, oldval, newval, scratch;
12464 machine_mode mode;
12465 bool is_weak;
12466 rtx_code_label *label1, *label2;
12467 rtx x, cond;
12468 enum memmodel model;
12469 rtx model_rtx;
12471 rval = operands[0];
12472 mem = operands[1];
12473 oldval = operands[2];
12474 newval = operands[3];
12475 is_weak = (operands[4] != const0_rtx);
12476 model_rtx = operands[5];
12477 scratch = operands[7];
12478 mode = GET_MODE (mem);
12479 model = memmodel_from_int (INTVAL (model_rtx));
12481 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12482 loop:
12483 .label1:
12484 LD[A]XR rval, [mem]
12485 CBNZ rval, .label2
12486 ST[L]XR scratch, newval, [mem]
12487 CBNZ scratch, .label1
12488 .label2:
12489 CMP rval, 0. */
12490 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12492 label1 = NULL;
12493 if (!is_weak)
12495 label1 = gen_label_rtx ();
12496 emit_label (label1);
12498 label2 = gen_label_rtx ();
12500 /* The initial load can be relaxed for a __sync operation since a final
12501 barrier will be emitted to stop code hoisting. */
12502 if (is_mm_sync (model))
12503 aarch64_emit_load_exclusive (mode, rval, mem,
12504 GEN_INT (MEMMODEL_RELAXED));
12505 else
12506 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12508 if (strong_zero_p)
12510 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12511 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12512 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12513 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12515 else
12517 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12518 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12519 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12520 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12521 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12524 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12526 if (!is_weak)
12528 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12529 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12530 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12531 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12533 else
12535 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12536 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12537 emit_insn (gen_rtx_SET (cond, x));
12540 emit_label (label2);
12541 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12542 to set the condition flags. If this is not used it will be removed by
12543 later passes. */
12544 if (strong_zero_p)
12546 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12547 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12548 emit_insn (gen_rtx_SET (cond, x));
12550 /* Emit any final barrier needed for a __sync operation. */
12551 if (is_mm_sync (model))
12552 aarch64_emit_post_barrier (model);
12555 /* Emit a BIC instruction. */
12557 static void
12558 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12560 rtx shift_rtx = GEN_INT (shift);
12561 rtx (*gen) (rtx, rtx, rtx, rtx);
12563 switch (mode)
12565 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12566 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12567 default:
12568 gcc_unreachable ();
12571 emit_insn (gen (dst, s2, shift_rtx, s1));
12574 /* Emit an atomic swap. */
12576 static void
12577 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12578 rtx mem, rtx model)
12580 rtx (*gen) (rtx, rtx, rtx, rtx);
12582 switch (mode)
12584 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12585 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12586 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12587 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12588 default:
12589 gcc_unreachable ();
12592 emit_insn (gen (dst, mem, value, model));
12595 /* Operations supported by aarch64_emit_atomic_load_op. */
12597 enum aarch64_atomic_load_op_code
12599 AARCH64_LDOP_PLUS, /* A + B */
12600 AARCH64_LDOP_XOR, /* A ^ B */
12601 AARCH64_LDOP_OR, /* A | B */
12602 AARCH64_LDOP_BIC /* A & ~B */
12605 /* Emit an atomic load-operate. */
12607 static void
12608 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12609 machine_mode mode, rtx dst, rtx src,
12610 rtx mem, rtx model)
12612 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12613 const aarch64_atomic_load_op_fn plus[] =
12615 gen_aarch64_atomic_loadaddqi,
12616 gen_aarch64_atomic_loadaddhi,
12617 gen_aarch64_atomic_loadaddsi,
12618 gen_aarch64_atomic_loadadddi
12620 const aarch64_atomic_load_op_fn eor[] =
12622 gen_aarch64_atomic_loadeorqi,
12623 gen_aarch64_atomic_loadeorhi,
12624 gen_aarch64_atomic_loadeorsi,
12625 gen_aarch64_atomic_loadeordi
12627 const aarch64_atomic_load_op_fn ior[] =
12629 gen_aarch64_atomic_loadsetqi,
12630 gen_aarch64_atomic_loadsethi,
12631 gen_aarch64_atomic_loadsetsi,
12632 gen_aarch64_atomic_loadsetdi
12634 const aarch64_atomic_load_op_fn bic[] =
12636 gen_aarch64_atomic_loadclrqi,
12637 gen_aarch64_atomic_loadclrhi,
12638 gen_aarch64_atomic_loadclrsi,
12639 gen_aarch64_atomic_loadclrdi
12641 aarch64_atomic_load_op_fn gen;
12642 int idx = 0;
12644 switch (mode)
12646 case E_QImode: idx = 0; break;
12647 case E_HImode: idx = 1; break;
12648 case E_SImode: idx = 2; break;
12649 case E_DImode: idx = 3; break;
12650 default:
12651 gcc_unreachable ();
12654 switch (code)
12656 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12657 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12658 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12659 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12660 default:
12661 gcc_unreachable ();
12664 emit_insn (gen (dst, mem, src, model));
12667 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12668 location to store the data read from memory. OUT_RESULT is the location to
12669 store the result of the operation. MEM is the memory location to read and
12670 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12671 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12672 be NULL. */
12674 void
12675 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12676 rtx mem, rtx value, rtx model_rtx)
12678 machine_mode mode = GET_MODE (mem);
12679 machine_mode wmode = (mode == DImode ? DImode : SImode);
12680 const bool short_mode = (mode < SImode);
12681 aarch64_atomic_load_op_code ldop_code;
12682 rtx src;
12683 rtx x;
12685 if (out_data)
12686 out_data = gen_lowpart (mode, out_data);
12688 if (out_result)
12689 out_result = gen_lowpart (mode, out_result);
12691 /* Make sure the value is in a register, putting it into a destination
12692 register if it needs to be manipulated. */
12693 if (!register_operand (value, mode)
12694 || code == AND || code == MINUS)
12696 src = out_result ? out_result : out_data;
12697 emit_move_insn (src, gen_lowpart (mode, value));
12699 else
12700 src = value;
12701 gcc_assert (register_operand (src, mode));
12703 /* Preprocess the data for the operation as necessary. If the operation is
12704 a SET then emit a swap instruction and finish. */
12705 switch (code)
12707 case SET:
12708 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12709 return;
12711 case MINUS:
12712 /* Negate the value and treat it as a PLUS. */
12714 rtx neg_src;
12716 /* Resize the value if necessary. */
12717 if (short_mode)
12718 src = gen_lowpart (wmode, src);
12720 neg_src = gen_rtx_NEG (wmode, src);
12721 emit_insn (gen_rtx_SET (src, neg_src));
12723 if (short_mode)
12724 src = gen_lowpart (mode, src);
12726 /* Fall-through. */
12727 case PLUS:
12728 ldop_code = AARCH64_LDOP_PLUS;
12729 break;
12731 case IOR:
12732 ldop_code = AARCH64_LDOP_OR;
12733 break;
12735 case XOR:
12736 ldop_code = AARCH64_LDOP_XOR;
12737 break;
12739 case AND:
12741 rtx not_src;
12743 /* Resize the value if necessary. */
12744 if (short_mode)
12745 src = gen_lowpart (wmode, src);
12747 not_src = gen_rtx_NOT (wmode, src);
12748 emit_insn (gen_rtx_SET (src, not_src));
12750 if (short_mode)
12751 src = gen_lowpart (mode, src);
12753 ldop_code = AARCH64_LDOP_BIC;
12754 break;
12756 default:
12757 /* The operation can't be done with atomic instructions. */
12758 gcc_unreachable ();
12761 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12763 /* If necessary, calculate the data in memory after the update by redoing the
12764 operation from values in registers. */
12765 if (!out_result)
12766 return;
12768 if (short_mode)
12770 src = gen_lowpart (wmode, src);
12771 out_data = gen_lowpart (wmode, out_data);
12772 out_result = gen_lowpart (wmode, out_result);
12775 x = NULL_RTX;
12777 switch (code)
12779 case MINUS:
12780 case PLUS:
12781 x = gen_rtx_PLUS (wmode, out_data, src);
12782 break;
12783 case IOR:
12784 x = gen_rtx_IOR (wmode, out_data, src);
12785 break;
12786 case XOR:
12787 x = gen_rtx_XOR (wmode, out_data, src);
12788 break;
12789 case AND:
12790 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12791 return;
12792 default:
12793 gcc_unreachable ();
12796 emit_set_insn (out_result, x);
12798 return;
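/* Illustrative mapping, not part of the original comments: with LSE enabled
   an atomic fetch-and-sub becomes an LDADD of the negated value and an
   atomic fetch-and-and becomes an LDCLR of the complemented value, which is
   why MINUS and AND are rewritten above before the load-operate is
   emitted.  */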
12801 /* Split an atomic operation. */
12803 void
12804 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12805 rtx value, rtx model_rtx, rtx cond)
12807 machine_mode mode = GET_MODE (mem);
12808 machine_mode wmode = (mode == DImode ? DImode : SImode);
12809 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12810 const bool is_sync = is_mm_sync (model);
12811 rtx_code_label *label;
12812 rtx x;
12814 /* Split the atomic operation into a sequence. */
12815 label = gen_label_rtx ();
12816 emit_label (label);
12818 if (new_out)
12819 new_out = gen_lowpart (wmode, new_out);
12820 if (old_out)
12821 old_out = gen_lowpart (wmode, old_out);
12822 else
12823 old_out = new_out;
12824 value = simplify_gen_subreg (wmode, value, mode, 0);
12826 /* The initial load can be relaxed for a __sync operation since a final
12827 barrier will be emitted to stop code hoisting. */
12828 if (is_sync)
12829 aarch64_emit_load_exclusive (mode, old_out, mem,
12830 GEN_INT (MEMMODEL_RELAXED));
12831 else
12832 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12834 switch (code)
12836 case SET:
12837 new_out = value;
12838 break;
12840 case NOT:
12841 x = gen_rtx_AND (wmode, old_out, value);
12842 emit_insn (gen_rtx_SET (new_out, x));
12843 x = gen_rtx_NOT (wmode, new_out);
12844 emit_insn (gen_rtx_SET (new_out, x));
12845 break;
12847 case MINUS:
12848 if (CONST_INT_P (value))
12850 value = GEN_INT (-INTVAL (value));
12851 code = PLUS;
12853 /* Fall through. */
12855 default:
12856 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12857 emit_insn (gen_rtx_SET (new_out, x));
12858 break;
12861 aarch64_emit_store_exclusive (mode, cond, mem,
12862 gen_lowpart (mode, new_out), model_rtx);
12864 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12865 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12866 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12867 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12869 /* Emit any final barrier needed for a __sync operation. */
12870 if (is_sync)
12871 aarch64_emit_post_barrier (model);
12874 static void
12875 aarch64_init_libfuncs (void)
12877 /* Half-precision float operations. The compiler handles all operations
12878 with NULL libfuncs by converting to SFmode. */
12880 /* Conversions. */
12881 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12882 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12884 /* Arithmetic. */
12885 set_optab_libfunc (add_optab, HFmode, NULL);
12886 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12887 set_optab_libfunc (smul_optab, HFmode, NULL);
12888 set_optab_libfunc (neg_optab, HFmode, NULL);
12889 set_optab_libfunc (sub_optab, HFmode, NULL);
12891 /* Comparisons. */
12892 set_optab_libfunc (eq_optab, HFmode, NULL);
12893 set_optab_libfunc (ne_optab, HFmode, NULL);
12894 set_optab_libfunc (lt_optab, HFmode, NULL);
12895 set_optab_libfunc (le_optab, HFmode, NULL);
12896 set_optab_libfunc (ge_optab, HFmode, NULL);
12897 set_optab_libfunc (gt_optab, HFmode, NULL);
12898 set_optab_libfunc (unord_optab, HFmode, NULL);
12901 /* Target hook for c_mode_for_suffix. */
12902 static machine_mode
12903 aarch64_c_mode_for_suffix (char suffix)
12905 if (suffix == 'q')
12906 return TFmode;
12908 return VOIDmode;
12911 /* We can only represent floating point constants which will fit in
12912 "quarter-precision" values. These values are characterised by
12913 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12916 (-1)^s * (n/16) * 2^r
12918 Where:
12919 's' is the sign bit.
12920 'n' is an integer in the range 16 <= n <= 31.
12921 'r' is an integer in the range -3 <= r <= 4. */
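/* Worked example, for illustration: 0.25 = (-1)^0 * (16/16) * 2^-2 is
   representable (s = 0, n = 16, r = -2), as is 1.5 = (-1)^0 * (24/16) * 2^0,
   whereas 0.2 admits no such (n, r) pair and must be materialised some other
   way.  */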
12923 /* Return true iff X can be represented by a quarter-precision
12924 floating point immediate operand. Note, we cannot represent 0.0. */
12925 bool
12926 aarch64_float_const_representable_p (rtx x)
12928 /* This represents our current view of how many bits
12929 make up the mantissa. */
12930 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12931 int exponent;
12932 unsigned HOST_WIDE_INT mantissa, mask;
12933 REAL_VALUE_TYPE r, m;
12934 bool fail;
12936 if (!CONST_DOUBLE_P (x))
12937 return false;
12939 /* We don't support HFmode constants yet. */
12940 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12941 return false;
12943 r = *CONST_DOUBLE_REAL_VALUE (x);
12945 /* We cannot represent infinities, NaNs or +/-zero. We won't
12946 know if we have +zero until we analyse the mantissa, but we
12947 can reject the other invalid values. */
12948 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12949 || REAL_VALUE_MINUS_ZERO (r))
12950 return false;
12952 /* Extract exponent. */
12953 r = real_value_abs (&r);
12954 exponent = REAL_EXP (&r);
12956 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12957 highest (sign) bit, with a fixed binary point at bit point_pos.
12958 m1 holds the low part of the mantissa, m2 the high part.
12959 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12960 bits for the mantissa, this can fail (low bits will be lost). */
12961 real_ldexp (&m, &r, point_pos - exponent);
12962 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12964 /* If the low part of the mantissa has bits set we cannot represent
12965 the value. */
12966 if (w.ulow () != 0)
12967 return false;
12968 /* We have rejected the lower HOST_WIDE_INT, so update our
12969 understanding of how many bits lie in the mantissa and
12970 look only at the high HOST_WIDE_INT. */
12971 mantissa = w.elt (1);
12972 point_pos -= HOST_BITS_PER_WIDE_INT;
12974 /* We can only represent values with a mantissa of the form 1.xxxx. */
12975 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12976 if ((mantissa & mask) != 0)
12977 return false;
12979 /* Having filtered unrepresentable values, we may now remove all
12980 but the highest 5 bits. */
12981 mantissa >>= point_pos - 5;
12983 /* We cannot represent the value 0.0, so reject it. This is handled
12984 elsewhere. */
12985 if (mantissa == 0)
12986 return false;
12988 /* Then, as bit 4 is always set, we can mask it off, leaving
12989 the mantissa in the range [0, 15]. */
12990 mantissa &= ~(1 << 4);
12991 gcc_assert (mantissa <= 15);
12993 /* GCC internally does not use IEEE754-like encoding (where normalized
12994 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12995 Our mantissa values are shifted 4 places to the left relative to
12996 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12997 by 5 places to correct for GCC's representation. */
12998 exponent = 5 - exponent;
13000 return (exponent >= 0 && exponent <= 7);
13003 char*
13004 aarch64_output_simd_mov_immediate (rtx const_vector,
13005 machine_mode mode,
13006 unsigned width)
13008 bool is_valid;
13009 static char templ[40];
13010 const char *mnemonic;
13011 const char *shift_op;
13012 unsigned int lane_count = 0;
13013 char element_char;
13015 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13017 /* This will return true to show const_vector is legal for use as
13018 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
13019 also update INFO to show how the immediate should be generated. */
13020 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13021 gcc_assert (is_valid);
13023 element_char = sizetochar (info.element_width);
13024 lane_count = width / info.element_width;
13026 mode = GET_MODE_INNER (mode);
13027 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13029 gcc_assert (info.shift == 0 && ! info.mvn);
13030 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13031 move immediate path. */
13032 if (aarch64_float_const_zero_rtx_p (info.value))
13033 info.value = GEN_INT (0);
13034 else
13036 const unsigned int buf_size = 20;
13037 char float_buf[buf_size] = {'\0'};
13038 real_to_decimal_for_mode (float_buf,
13039 CONST_DOUBLE_REAL_VALUE (info.value),
13040 buf_size, buf_size, 1, mode);
13042 if (lane_count == 1)
13043 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13044 else
13045 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13046 lane_count, element_char, float_buf);
13047 return templ;
13051 mnemonic = info.mvn ? "mvni" : "movi";
13052 shift_op = info.msl ? "msl" : "lsl";
13054 gcc_assert (CONST_INT_P (info.value));
13055 if (lane_count == 1)
13056 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13057 mnemonic, UINTVAL (info.value));
13058 else if (info.shift)
13059 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13060 ", %s %d", mnemonic, lane_count, element_char,
13061 UINTVAL (info.value), shift_op, info.shift);
13062 else
13063 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13064 mnemonic, lane_count, element_char, UINTVAL (info.value));
13065 return templ;
13068 char*
13069 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13072 /* If a floating point number was passed and we desire to use it in an
13073 integer mode, do the conversion to integer. */
13074 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13076 unsigned HOST_WIDE_INT ival;
13077 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13078 gcc_unreachable ();
13079 immediate = gen_int_mode (ival, mode);
13082 machine_mode vmode;
13083 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we use
13084 a 128-bit vector mode. */
13085 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13087 vmode = aarch64_simd_container_mode (mode, width);
13088 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13089 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13092 /* Split operands into moves from op[1] + op[2] into op[0]. */
13094 void
13095 aarch64_split_combinev16qi (rtx operands[3])
13097 unsigned int dest = REGNO (operands[0]);
13098 unsigned int src1 = REGNO (operands[1]);
13099 unsigned int src2 = REGNO (operands[2]);
13100 machine_mode halfmode = GET_MODE (operands[1]);
13101 unsigned int halfregs = REG_NREGS (operands[1]);
13102 rtx destlo, desthi;
13104 gcc_assert (halfmode == V16QImode);
13106 if (src1 == dest && src2 == dest + halfregs)
13108 /* No-op move. Can't split to nothing; emit something. */
13109 emit_note (NOTE_INSN_DELETED);
13110 return;
13113 /* Preserve register attributes for variable tracking. */
13114 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13115 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13116 GET_MODE_SIZE (halfmode));
13118 /* Special case of reversed high/low parts. */
13119 if (reg_overlap_mentioned_p (operands[2], destlo)
13120 && reg_overlap_mentioned_p (operands[1], desthi))
13122 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13123 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13124 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13126 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13128 /* Try to avoid unnecessary moves if part of the result
13129 is in the right place already. */
13130 if (src1 != dest)
13131 emit_move_insn (destlo, operands[1]);
13132 if (src2 != dest + halfregs)
13133 emit_move_insn (desthi, operands[2]);
13135 else
13137 if (src2 != dest + halfregs)
13138 emit_move_insn (desthi, operands[2]);
13139 if (src1 != dest)
13140 emit_move_insn (destlo, operands[1]);
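/* Note added for clarity: the three XORs in the reversed-halves case above
   are the classic in-place swap of operands[1] and operands[2] without a
   scratch register, needed because each destination half would otherwise be
   clobbered before the other source half had been copied.  */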
13144 /* vec_perm support. */
13146 #define MAX_VECT_LEN 16
13148 struct expand_vec_perm_d
13150 rtx target, op0, op1;
13151 auto_vec_perm_indices perm;
13152 machine_mode vmode;
13153 bool one_vector_p;
13154 bool testing_p;
13157 /* Generate a variable permutation. */
13159 static void
13160 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13162 machine_mode vmode = GET_MODE (target);
13163 bool one_vector_p = rtx_equal_p (op0, op1);
13165 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13166 gcc_checking_assert (GET_MODE (op0) == vmode);
13167 gcc_checking_assert (GET_MODE (op1) == vmode);
13168 gcc_checking_assert (GET_MODE (sel) == vmode);
13169 gcc_checking_assert (TARGET_SIMD);
13171 if (one_vector_p)
13173 if (vmode == V8QImode)
13175 /* Expand the argument to a V16QI mode by duplicating it. */
13176 rtx pair = gen_reg_rtx (V16QImode);
13177 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13178 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13180 else
13182 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13185 else
13187 rtx pair;
13189 if (vmode == V8QImode)
13191 pair = gen_reg_rtx (V16QImode);
13192 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13193 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13195 else
13197 pair = gen_reg_rtx (OImode);
13198 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13199 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13204 void
13205 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13207 machine_mode vmode = GET_MODE (target);
13208 unsigned int nelt = GET_MODE_NUNITS (vmode);
13209 bool one_vector_p = rtx_equal_p (op0, op1);
13210 rtx mask;
13212 /* The TBL instruction does not use a modulo index, so we must take care
13213 of that ourselves. */
13214 mask = aarch64_simd_gen_const_vector_dup (vmode,
13215 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13216 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13218 /* For big-endian, we also need to reverse the index within the vector
13219 (but not which vector). */
13220 if (BYTES_BIG_ENDIAN)
13222 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13223 if (!one_vector_p)
13224 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13225 sel = expand_simple_binop (vmode, XOR, sel, mask,
13226 NULL, 0, OPTAB_LIB_WIDEN);
13228 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
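/* Illustrative example: for a single-vector V8QImode permute the mask built
   above is a vector of 7s, so a selector value of 9 is reduced to lane 1
   before the TBL instruction sees it.  */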
13231 /* Recognize patterns suitable for the TRN instructions. */
13232 static bool
13233 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13235 unsigned int i, odd, mask, nelt = d->perm.length ();
13236 rtx out, in0, in1, x;
13237 rtx (*gen) (rtx, rtx, rtx);
13238 machine_mode vmode = d->vmode;
13240 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13241 return false;
13243 /* Note that these are little-endian tests.
13244 We correct for big-endian later. */
13245 if (d->perm[0] == 0)
13246 odd = 0;
13247 else if (d->perm[0] == 1)
13248 odd = 1;
13249 else
13250 return false;
13251 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13253 for (i = 0; i < nelt; i += 2)
13255 if (d->perm[i] != i + odd)
13256 return false;
13257 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13258 return false;
13261 /* Success! */
13262 if (d->testing_p)
13263 return true;
13265 in0 = d->op0;
13266 in1 = d->op1;
13267 if (BYTES_BIG_ENDIAN)
13269 x = in0, in0 = in1, in1 = x;
13270 odd = !odd;
13272 out = d->target;
13274 if (odd)
13276 switch (vmode)
13278 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13279 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13280 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13281 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13282 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13283 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13284 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13285 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13286 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13287 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13288 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13289 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13290 default:
13291 return false;
13294 else
13296 switch (vmode)
13298 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13299 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13300 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13301 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13302 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13303 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13304 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13305 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13306 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13307 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13308 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13309 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13310 default:
13311 return false;
13315 emit_insn (gen (out, in0, in1));
13316 return true;
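/* Example, in the little-endian view used by the tests above: for V4SImode
   with two operands, the index vector { 0, 4, 2, 6 } is matched as TRN1 and
   { 1, 5, 3, 7 } as TRN2.  */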
13319 /* Recognize patterns suitable for the UZP instructions. */
13320 static bool
13321 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13323 unsigned int i, odd, mask, nelt = d->perm.length ();
13324 rtx out, in0, in1, x;
13325 rtx (*gen) (rtx, rtx, rtx);
13326 machine_mode vmode = d->vmode;
13328 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13329 return false;
13331 /* Note that these are little-endian tests.
13332 We correct for big-endian later. */
13333 if (d->perm[0] == 0)
13334 odd = 0;
13335 else if (d->perm[0] == 1)
13336 odd = 1;
13337 else
13338 return false;
13339 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13341 for (i = 0; i < nelt; i++)
13343 unsigned elt = (i * 2 + odd) & mask;
13344 if (d->perm[i] != elt)
13345 return false;
13348 /* Success! */
13349 if (d->testing_p)
13350 return true;
13352 in0 = d->op0;
13353 in1 = d->op1;
13354 if (BYTES_BIG_ENDIAN)
13356 x = in0, in0 = in1, in1 = x;
13357 odd = !odd;
13359 out = d->target;
13361 if (odd)
13363 switch (vmode)
13365 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13366 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13367 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13368 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13369 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13370 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13371 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13372 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13373 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13374 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13375 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13376 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13377 default:
13378 return false;
13381 else
13383 switch (vmode)
13385 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13386 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13387 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13388 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13389 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13390 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13391 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13392 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13393 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13394 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13395 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13396 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13397 default:
13398 return false;
13402 emit_insn (gen (out, in0, in1));
13403 return true;
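/* Example, in the little-endian view used by the tests above: for V4SImode
   with two operands, the index vector { 0, 2, 4, 6 } is matched as UZP1 and
   { 1, 3, 5, 7 } as UZP2.  */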
13406 /* Recognize patterns suitable for the ZIP instructions. */
13407 static bool
13408 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13410 unsigned int i, high, mask, nelt = d->perm.length ();
13411 rtx out, in0, in1, x;
13412 rtx (*gen) (rtx, rtx, rtx);
13413 machine_mode vmode = d->vmode;
13415 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13416 return false;
13418 /* Note that these are little-endian tests.
13419 We correct for big-endian later. */
13420 high = nelt / 2;
13421 if (d->perm[0] == high)
13422 /* Do Nothing. */
13424 else if (d->perm[0] == 0)
13425 high = 0;
13426 else
13427 return false;
13428 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13430 for (i = 0; i < nelt / 2; i++)
13432 unsigned elt = (i + high) & mask;
13433 if (d->perm[i * 2] != elt)
13434 return false;
13435 elt = (elt + nelt) & mask;
13436 if (d->perm[i * 2 + 1] != elt)
13437 return false;
13440 /* Success! */
13441 if (d->testing_p)
13442 return true;
13444 in0 = d->op0;
13445 in1 = d->op1;
13446 if (BYTES_BIG_ENDIAN)
13448 x = in0, in0 = in1, in1 = x;
13449 high = !high;
13451 out = d->target;
13453 if (high)
13455 switch (vmode)
13457 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13458 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13459 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13460 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13461 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13462 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13463 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13464 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13465 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13466 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13467 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13468 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13469 default:
13470 return false;
13473 else
13475 switch (vmode)
13477 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13478 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13479 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13480 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13481 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13482 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13483 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13484 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13485 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13486 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13487 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13488 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13489 default:
13490 return false;
13494 emit_insn (gen (out, in0, in1));
13495 return true;
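/* Example, in the little-endian view used by the tests above: for V4SImode
   with two operands, the index vector { 0, 4, 1, 5 } is matched as ZIP1 and
   { 2, 6, 3, 7 } as ZIP2.  */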
13498 /* Recognize patterns for the EXT insn. */
13500 static bool
13501 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13503 unsigned int i, nelt = d->perm.length ();
13504 rtx (*gen) (rtx, rtx, rtx, rtx);
13505 rtx offset;
13507 unsigned int location = d->perm[0]; /* Always < nelt. */
13509 /* Check if the extracted indices are increasing by one. */
13510 for (i = 1; i < nelt; i++)
13512 unsigned int required = location + i;
13513 if (d->one_vector_p)
13515 /* We'll pass the same vector in twice, so allow indices to wrap. */
13516 required &= (nelt - 1);
13518 if (d->perm[i] != required)
13519 return false;
13522 switch (d->vmode)
13524 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13525 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13526 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13527 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13528 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13529 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13530 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13531 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13532 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13533 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13534 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13535 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13536 default:
13537 return false;
13540 /* Success! */
13541 if (d->testing_p)
13542 return true;
13544 /* The case where (location == 0) is a no-op for both big- and little-endian,
13545 and is removed by the mid-end at optimization levels -O1 and higher. */
13547 if (BYTES_BIG_ENDIAN && (location != 0))
13549 /* After setup, we want the high elements of the first vector (stored
13550 at the LSB end of the register), and the low elements of the second
13551 vector (stored at the MSB end of the register). So swap. */
13552 std::swap (d->op0, d->op1);
13553 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13554 location = nelt - location;
13557 offset = GEN_INT (location);
13558 emit_insn (gen (d->target, d->op0, d->op1, offset));
13559 return true;
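/* Illustrative example: for V4SImode with two operands, the index vector
   { 1, 2, 3, 4 } is matched here and emitted as a single EXT with an offset
   of one element.  */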
13562 /* Recognize patterns for the REV insns. */
13564 static bool
13565 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13567 unsigned int i, j, diff, nelt = d->perm.length ();
13568 rtx (*gen) (rtx, rtx);
13570 if (!d->one_vector_p)
13571 return false;
13573 diff = d->perm[0];
13574 switch (diff)
13576 case 7:
13577 switch (d->vmode)
13579 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13580 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13581 default:
13582 return false;
13584 break;
13585 case 3:
13586 switch (d->vmode)
13588 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13589 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13590 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13591 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13592 default:
13593 return false;
13595 break;
13596 case 1:
13597 switch (d->vmode)
13599 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13600 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13601 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13602 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13603 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13604 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13605 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13606 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13607 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13608 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13609 default:
13610 return false;
13612 break;
13613 default:
13614 return false;
13617 for (i = 0; i < nelt ; i += diff + 1)
13618 for (j = 0; j <= diff; j += 1)
13620 /* This is guaranteed to be true as the value of diff
13621 is 7, 3, 1 and we should have enough elements in the
13622 queue to generate this. Getting a vector mask with a
13623 value of diff other than these values implies that
13624 something is wrong by the time we get here. */
13625 gcc_assert (i + j < nelt);
13626 if (d->perm[i + j] != i + diff - j)
13627 return false;
13630 /* Success! */
13631 if (d->testing_p)
13632 return true;
13634 emit_insn (gen (d->target, d->op0));
13635 return true;
13638 static bool
13639 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13641 rtx (*gen) (rtx, rtx, rtx);
13642 rtx out = d->target;
13643 rtx in0;
13644 machine_mode vmode = d->vmode;
13645 unsigned int i, elt, nelt = d->perm.length ();
13646 rtx lane;
13648 elt = d->perm[0];
13649 for (i = 1; i < nelt; i++)
13651 if (elt != d->perm[i])
13652 return false;
13655 /* The generic preparation in aarch64_expand_vec_perm_const_1
13656 swaps the operand order and the permute indices if it finds
13657 d->perm[0] to be in the second operand. Thus, we can always
13658 use d->op0 and need not do any extra arithmetic to get the
13659 correct lane number. */
13660 in0 = d->op0;
13661 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13663 switch (vmode)
13665 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13666 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13667 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13668 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13669 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13670 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13671 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13672 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13673 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13674 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13675 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13676 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13677 default:
13678 return false;
13681 emit_insn (gen (out, in0, lane));
13682 return true;
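/* Illustrative example: a broadcast permutation such as { 2, 2, 2, 2 } on
   V4SImode is matched here and becomes a single DUP from lane 2.  */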
13685 static bool
13686 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13688 rtx rperm[MAX_VECT_LEN], sel;
13689 machine_mode vmode = d->vmode;
13690 unsigned int i, nelt = d->perm.length ();
13692 if (d->testing_p)
13693 return true;
13695 /* Generic code will try constant permutation twice. Once with the
13696 original mode and again with the elements lowered to QImode.
13697 So wait and don't do the selector expansion ourselves. */
13698 if (vmode != V8QImode && vmode != V16QImode)
13699 return false;
13701 for (i = 0; i < nelt; ++i)
13703 int nunits = GET_MODE_NUNITS (vmode);
13705 /* If big-endian and two vectors we end up with a weird mixed-endian
13706 mode on NEON. Reverse the index within each word but not the word
13707 itself. */
13708 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13709 : d->perm[i]);
13711 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13712 sel = force_reg (vmode, sel);
13714 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13715 return true;
13718 static bool
13719 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13721 /* The pattern matching functions above are written to look for a small
13722 number to begin the sequence (0, 1, N/2). If we begin with an index
13723 from the second operand, we can swap the operands. */
13724 unsigned int nelt = d->perm.length ();
13725 if (d->perm[0] >= nelt)
13727 gcc_assert (nelt == (nelt & -nelt));
13728 for (unsigned int i = 0; i < nelt; ++i)
13729 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13731 std::swap (d->op0, d->op1);
13734 if (TARGET_SIMD)
13736 if (aarch64_evpc_rev (d))
13737 return true;
13738 else if (aarch64_evpc_ext (d))
13739 return true;
13740 else if (aarch64_evpc_dup (d))
13741 return true;
13742 else if (aarch64_evpc_zip (d))
13743 return true;
13744 else if (aarch64_evpc_uzp (d))
13745 return true;
13746 else if (aarch64_evpc_trn (d))
13747 return true;
13748 return aarch64_evpc_tbl (d);
13750 return false;
13753 /* Expand a vec_perm_const pattern. */
13755 bool
13756 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13758 struct expand_vec_perm_d d;
13759 int i, nelt, which;
13761 d.target = target;
13762 d.op0 = op0;
13763 d.op1 = op1;
13765 d.vmode = GET_MODE (target);
13766 gcc_assert (VECTOR_MODE_P (d.vmode));
13767 d.testing_p = false;
13769 nelt = GET_MODE_NUNITS (d.vmode);
13770 d.perm.reserve (nelt);
13771 for (i = which = 0; i < nelt; ++i)
13773 rtx e = XVECEXP (sel, 0, i);
13774 int ei = INTVAL (e) & (2 * nelt - 1);
13775 which |= (ei < nelt ? 1 : 2);
13776 d.perm.quick_push (ei);
13779 switch (which)
13781 default:
13782 gcc_unreachable ();
13784 case 3:
13785 d.one_vector_p = false;
13786 if (!rtx_equal_p (op0, op1))
13787 break;
13789 /* The elements of PERM do not suggest that only the first operand
13790 is used, but both operands are identical. Allow easier matching
13791 of the permutation by folding the permutation into the single
13792 input vector. */
13793 /* Fall Through. */
13794 case 2:
13795 for (i = 0; i < nelt; ++i)
13796 d.perm[i] &= nelt - 1;
13797 d.op0 = op1;
13798 d.one_vector_p = true;
13799 break;
13801 case 1:
13802 d.op1 = op0;
13803 d.one_vector_p = true;
13804 break;
13807 return aarch64_expand_vec_perm_const_1 (&d);
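/* As a concrete illustration of the WHICH classification above, for
   V4SImode (nelt == 4):
     sel {0, 1, 4, 5} uses both inputs, so which == 3;
     sel {4, 5, 6, 7} uses only the second input, so which == 2, the
       indices are reduced to {0, 1, 2, 3} and d.op0 becomes op1;
     sel {0, 3, 2, 1} uses only the first input, so which == 1.  */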
13810 static bool
13811 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
13813 struct expand_vec_perm_d d;
13814 unsigned int i, nelt, which;
13815 bool ret;
13817 d.vmode = vmode;
13818 d.testing_p = true;
13819 d.perm.safe_splice (sel);
13821 /* Calculate whether all elements are in one vector. */
13822 nelt = sel.length ();
13823 for (i = which = 0; i < nelt; ++i)
13825 unsigned int e = d.perm[i];
13826 gcc_assert (e < 2 * nelt);
13827 which |= (e < nelt ? 1 : 2);
13830 /* If all elements are from the second vector, reindex as if from the
13831 first vector. */
13832 if (which == 2)
13833 for (i = 0; i < nelt; ++i)
13834 d.perm[i] -= nelt;
13836 /* Check whether the mask can be applied to a single vector. */
13837 d.one_vector_p = (which != 3);
13839 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13840 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13841 if (!d.one_vector_p)
13842 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13844 start_sequence ();
13845 ret = aarch64_expand_vec_perm_const_1 (&d);
13846 end_sequence ();
13848 return ret;
13852 aarch64_reverse_mask (machine_mode mode)
13854 /* We have to reverse each vector because we don't have

13855 a permuted load that can reverse-load according to ABI rules. */
13856 rtx mask;
13857 rtvec v = rtvec_alloc (16);
13858 int i, j;
13859 int nunits = GET_MODE_NUNITS (mode);
13860 int usize = GET_MODE_UNIT_SIZE (mode);
13862 gcc_assert (BYTES_BIG_ENDIAN);
13863 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13865 for (i = 0; i < nunits; i++)
13866 for (j = 0; j < usize; j++)
13867 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13868 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13869 return force_reg (V16QImode, mask);
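/* For instance, for V8HImode (nunits == 8, usize == 2) the mask built
   above is {1, 0, 3, 2, ..., 15, 14}, i.e. the bytes are swapped within
   each 16-bit element while the elements themselves stay in place.  */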
13872 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13873 true. However due to issues with register allocation it is preferable
13874 to avoid tying integer scalar and FP scalar modes. Executing integer
13875 operations in general registers is better than treating them as scalar
13876 vector operations. This reduces latency and avoids redundant int<->FP
13877 moves. So tie modes if they are either the same class, or vector modes
13878 with other vector modes, vector structs or any scalar mode. */
13880 static bool
13881 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13883 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13884 return true;
13886 /* We specifically want to allow elements of "structure" modes to
13887 be tieable to the structure. This more general condition allows
13888 other rarer situations too. */
13889 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13890 return true;
13892 /* Also allow any scalar modes with vectors. */
13893 if (aarch64_vector_mode_supported_p (mode1)
13894 || aarch64_vector_mode_supported_p (mode2))
13895 return true;
13897 return false;
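/* For example, DImode and DFmode are not tied (different classes and
   neither is a vector mode), which discourages the register allocator
   from bouncing values between X and V registers, whereas V4SImode is
   tied both to other vector modes such as V2DFmode and to scalar modes
   such as DImode.  */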
13900 /* Return a new RTX holding the result of moving POINTER forward by
13901 AMOUNT bytes. */
13903 static rtx
13904 aarch64_move_pointer (rtx pointer, int amount)
13906 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13908 return adjust_automodify_address (pointer, GET_MODE (pointer),
13909 next, amount);
13912 /* Return a new RTX holding the result of moving POINTER forward by the
13913 size of the mode it points to. */
13915 static rtx
13916 aarch64_progress_pointer (rtx pointer)
13918 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13920 return aarch64_move_pointer (pointer, amount);
13923 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13924 MODE bytes. */
13926 static void
13927 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13928 machine_mode mode)
13930 rtx reg = gen_reg_rtx (mode);
13932 /* "Cast" the pointers to the correct mode. */
13933 *src = adjust_address (*src, mode, 0);
13934 *dst = adjust_address (*dst, mode, 0);
13935 /* Emit the memcpy. */
13936 emit_move_insn (reg, *src);
13937 emit_move_insn (*dst, reg);
13938 /* Move the pointers forward. */
13939 *src = aarch64_progress_pointer (*src);
13940 *dst = aarch64_progress_pointer (*dst);
13943 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13944 we succeed, otherwise return false. */
13946 bool
13947 aarch64_expand_movmem (rtx *operands)
13949 unsigned int n;
13950 rtx dst = operands[0];
13951 rtx src = operands[1];
13952 rtx base;
13953 bool speed_p = !optimize_function_for_size_p (cfun);
13955 /* When optimizing for size, give a better estimate of the length of a
13956 memcpy call, but use the default otherwise. */
13957 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13959 /* We can't do anything smart if the amount to copy is not constant. */
13960 if (!CONST_INT_P (operands[2]))
13961 return false;
13963 n = UINTVAL (operands[2]);
13965 /* Try to keep the number of instructions low. For cases below 16 bytes we
13966 need to make at most two moves. For cases above 16 bytes it will be one
13967 move for each 16-byte chunk, then at most two additional moves. */
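/* For example, n == 35 is estimated as 35/16 + 2 == 4 moves, which is
   below the speed limit of 15/2 == 7, so the copy is expanded inline.  */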
13968 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13969 return false;
13971 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13972 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13974 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13975 src = adjust_automodify_address (src, VOIDmode, base, 0);
13977 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13978 1-byte chunk. */
13979 if (n < 4)
13981 if (n >= 2)
13983 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13984 n -= 2;
13987 if (n == 1)
13988 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13990 return true;
13993 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13994 4-byte chunk, partially overlapping with the previously copied chunk. */
13995 if (n < 8)
13997 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13998 n -= 4;
13999 if (n > 0)
14001 int move = n - 4;
14003 src = aarch64_move_pointer (src, move);
14004 dst = aarch64_move_pointer (dst, move);
14005 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14007 return true;
14010 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14011 them, then (if applicable) an 8-byte chunk. */
14012 while (n >= 8)
14014 if (n / 16)
14016 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14017 n -= 16;
14019 else
14021 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14022 n -= 8;
14026 /* Finish the final bytes of the copy. We can always do this in one
14027 instruction. We either copy the exact amount we need, or partially
14028 overlap with the previous chunk we copied and copy 8-bytes. */
14029 if (n == 0)
14030 return true;
14031 else if (n == 1)
14032 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14033 else if (n == 2)
14034 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14035 else if (n == 4)
14036 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14037 else
14039 if (n == 3)
14041 src = aarch64_move_pointer (src, -1);
14042 dst = aarch64_move_pointer (dst, -1);
14043 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14045 else
14047 int move = n - 8;
14049 src = aarch64_move_pointer (src, move);
14050 dst = aarch64_move_pointer (dst, move);
14051 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14055 return true;
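/* As a worked example, n == 6 takes the "n < 8" path above: one SImode
   copy covers bytes 0-3, then the pointers are moved back by 2 and a
   second SImode copy covers bytes 2-5, overlapping bytes 2-3.  */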
14058 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14059 SImode stores. Handle the case when the constant has identical
14060 bottom and top halves. This is beneficial when the two stores can be
14061 merged into an STP and we avoid synthesising potentially expensive
14062 immediates twice. Return true if such a split is possible. */
14064 bool
14065 aarch64_split_dimode_const_store (rtx dst, rtx src)
14067 rtx lo = gen_lowpart (SImode, src);
14068 rtx hi = gen_highpart_mode (SImode, DImode, src);
14070 bool size_p = optimize_function_for_size_p (cfun);
14072 if (!rtx_equal_p (lo, hi))
14073 return false;
14075 unsigned int orig_cost
14076 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14077 unsigned int lo_cost
14078 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14080 /* We want to transform:
14081 MOV x1, 49370
14082 MOVK x1, 0x140, lsl 16
14083 MOVK x1, 0xc0da, lsl 32
14084 MOVK x1, 0x140, lsl 48
14085 STR x1, [x0]
14086 into:
14087 MOV w1, 49370
14088 MOVK w1, 0x140, lsl 16
14089 STP w1, w1, [x0]
14090 So we want to perform this only when we save two instructions
14091 or more. When optimizing for size, however, accept any code size
14092 savings we can. */
14093 if (size_p && orig_cost <= lo_cost)
14094 return false;
14096 if (!size_p
14097 && (orig_cost <= lo_cost + 1))
14098 return false;
14100 rtx mem_lo = adjust_address (dst, SImode, 0);
14101 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14102 return false;
14104 rtx tmp_reg = gen_reg_rtx (SImode);
14105 aarch64_expand_mov_immediate (tmp_reg, lo);
14106 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14107 /* Don't emit an explicit store pair as this may not always be profitable.
14108 Let the sched-fusion logic decide whether to merge them. */
14109 emit_move_insn (mem_lo, tmp_reg);
14110 emit_move_insn (mem_hi, tmp_reg);
14112 return true;
14115 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14117 static unsigned HOST_WIDE_INT
14118 aarch64_asan_shadow_offset (void)
14120 return (HOST_WIDE_INT_1 << 36);
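/* With this offset the ASan instrumentation should compute the shadow
   address of ADDR as roughly (ADDR >> 3) + (1 << 36); the shift of 3 is
   the generic ASan shadow scale.  */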
14123 static bool
14124 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14125 unsigned int align,
14126 enum by_pieces_operation op,
14127 bool speed_p)
14129 /* STORE_BY_PIECES can be used when copying a constant string, but
14130 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14131 For now we always fail this and let the move_by_pieces code copy
14132 the string from read-only memory. */
14133 if (op == STORE_BY_PIECES)
14134 return false;
14136 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14139 static rtx
14140 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14141 int code, tree treeop0, tree treeop1)
14143 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14144 rtx op0, op1;
14145 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14146 insn_code icode;
14147 struct expand_operand ops[4];
14149 start_sequence ();
14150 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14152 op_mode = GET_MODE (op0);
14153 if (op_mode == VOIDmode)
14154 op_mode = GET_MODE (op1);
14156 switch (op_mode)
14158 case E_QImode:
14159 case E_HImode:
14160 case E_SImode:
14161 cmp_mode = SImode;
14162 icode = CODE_FOR_cmpsi;
14163 break;
14165 case E_DImode:
14166 cmp_mode = DImode;
14167 icode = CODE_FOR_cmpdi;
14168 break;
14170 case E_SFmode:
14171 cmp_mode = SFmode;
14172 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14173 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14174 break;
14176 case E_DFmode:
14177 cmp_mode = DFmode;
14178 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14179 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14180 break;
14182 default:
14183 end_sequence ();
14184 return NULL_RTX;
14187 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14188 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14189 if (!op0 || !op1)
14191 end_sequence ();
14192 return NULL_RTX;
14194 *prep_seq = get_insns ();
14195 end_sequence ();
14197 create_fixed_operand (&ops[0], op0);
14198 create_fixed_operand (&ops[1], op1);
14200 start_sequence ();
14201 if (!maybe_expand_insn (icode, 2, ops))
14203 end_sequence ();
14204 return NULL_RTX;
14206 *gen_seq = get_insns ();
14207 end_sequence ();
14209 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14210 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14213 static rtx
14214 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14215 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14217 rtx op0, op1, target;
14218 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14219 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14220 insn_code icode;
14221 struct expand_operand ops[6];
14222 int aarch64_cond;
14224 push_to_sequence (*prep_seq);
14225 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14227 op_mode = GET_MODE (op0);
14228 if (op_mode == VOIDmode)
14229 op_mode = GET_MODE (op1);
14231 switch (op_mode)
14233 case E_QImode:
14234 case E_HImode:
14235 case E_SImode:
14236 cmp_mode = SImode;
14237 icode = CODE_FOR_ccmpsi;
14238 break;
14240 case E_DImode:
14241 cmp_mode = DImode;
14242 icode = CODE_FOR_ccmpdi;
14243 break;
14245 case E_SFmode:
14246 cmp_mode = SFmode;
14247 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14248 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14249 break;
14251 case E_DFmode:
14252 cmp_mode = DFmode;
14253 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14254 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14255 break;
14257 default:
14258 end_sequence ();
14259 return NULL_RTX;
14262 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14263 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14264 if (!op0 || !op1)
14266 end_sequence ();
14267 return NULL_RTX;
14269 *prep_seq = get_insns ();
14270 end_sequence ();
14272 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14273 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14275 if (bit_code != AND)
14277 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14278 GET_MODE (XEXP (prev, 0))),
14279 VOIDmode, XEXP (prev, 0), const0_rtx);
14280 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14283 create_fixed_operand (&ops[0], XEXP (prev, 0));
14284 create_fixed_operand (&ops[1], target);
14285 create_fixed_operand (&ops[2], op0);
14286 create_fixed_operand (&ops[3], op1);
14287 create_fixed_operand (&ops[4], prev);
14288 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14290 push_to_sequence (*gen_seq);
14291 if (!maybe_expand_insn (icode, 6, ops))
14293 end_sequence ();
14294 return NULL_RTX;
14297 *gen_seq = get_insns ();
14298 end_sequence ();
14300 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
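/* As a rough illustration of how the two ccmp hooks combine (the exact
   condition codes depend on aarch64_select_cc_mode), a condition such
   as "a == 0 && b == 3" can end up as something like:
       cmp   w0, #0
       ccmp  w1, #3, #0, eq
   so the flags of the second comparison are only meaningful when the
   first one held, and a single conditional branch tests the result.  */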
14303 #undef TARGET_GEN_CCMP_FIRST
14304 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14306 #undef TARGET_GEN_CCMP_NEXT
14307 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14309 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14310 instruction fusion of some sort. */
14312 static bool
14313 aarch64_macro_fusion_p (void)
14315 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14319 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14320 should be kept together during scheduling. */
14322 static bool
14323 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14325 rtx set_dest;
14326 rtx prev_set = single_set (prev);
14327 rtx curr_set = single_set (curr);
14328 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14329 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14331 if (!aarch64_macro_fusion_p ())
14332 return false;
14334 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14336 /* We are trying to match:
14337 prev (mov) == (set (reg r0) (const_int imm16))
14338 curr (movk) == (set (zero_extract (reg r0)
14339 (const_int 16)
14340 (const_int 16))
14341 (const_int imm16_1)) */
14343 set_dest = SET_DEST (curr_set);
14345 if (GET_CODE (set_dest) == ZERO_EXTRACT
14346 && CONST_INT_P (SET_SRC (curr_set))
14347 && CONST_INT_P (SET_SRC (prev_set))
14348 && CONST_INT_P (XEXP (set_dest, 2))
14349 && INTVAL (XEXP (set_dest, 2)) == 16
14350 && REG_P (XEXP (set_dest, 0))
14351 && REG_P (SET_DEST (prev_set))
14352 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14354 return true;
14358 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14361 /* We're trying to match:
14362 prev (adrp) == (set (reg r1)
14363 (high (symbol_ref ("SYM"))))
14364 curr (add) == (set (reg r0)
14365 (lo_sum (reg r1)
14366 (symbol_ref ("SYM"))))
14367 Note that r0 need not necessarily be the same as r1, especially
14368 during pre-regalloc scheduling. */
14370 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14371 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14373 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14374 && REG_P (XEXP (SET_SRC (curr_set), 0))
14375 && REGNO (XEXP (SET_SRC (curr_set), 0))
14376 == REGNO (SET_DEST (prev_set))
14377 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14378 XEXP (SET_SRC (curr_set), 1)))
14379 return true;
14383 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14386 /* We're trying to match:
14387 prev (movk) == (set (zero_extract (reg r0)
14388 (const_int 16)
14389 (const_int 32))
14390 (const_int imm16_1))
14391 curr (movk) == (set (zero_extract (reg r0)
14392 (const_int 16)
14393 (const_int 48))
14394 (const_int imm16_2)) */
14396 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14397 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14398 && REG_P (XEXP (SET_DEST (prev_set), 0))
14399 && REG_P (XEXP (SET_DEST (curr_set), 0))
14400 && REGNO (XEXP (SET_DEST (prev_set), 0))
14401 == REGNO (XEXP (SET_DEST (curr_set), 0))
14402 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14403 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14404 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14405 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14406 && CONST_INT_P (SET_SRC (prev_set))
14407 && CONST_INT_P (SET_SRC (curr_set)))
14408 return true;
14411 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14413 /* We're trying to match:
14414 prev (adrp) == (set (reg r0)
14415 (high (symbol_ref ("SYM"))))
14416 curr (ldr) == (set (reg r1)
14417 (mem (lo_sum (reg r0)
14418 (symbol_ref ("SYM")))))
14420 curr (ldr) == (set (reg r1)
14421 (zero_extend (mem
14422 (lo_sum (reg r0)
14423 (symbol_ref ("SYM")))))) */
14424 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14425 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14427 rtx curr_src = SET_SRC (curr_set);
14429 if (GET_CODE (curr_src) == ZERO_EXTEND)
14430 curr_src = XEXP (curr_src, 0);
14432 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14433 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14434 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14435 == REGNO (SET_DEST (prev_set))
14436 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14437 XEXP (SET_SRC (prev_set), 0)))
14438 return true;
14442 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14443 && aarch_crypto_can_dual_issue (prev, curr))
14444 return true;
14446 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14447 && any_condjump_p (curr))
14449 enum attr_type prev_type = get_attr_type (prev);
14451 unsigned int condreg1, condreg2;
14452 rtx cc_reg_1;
14453 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14454 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14456 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14457 && prev
14458 && modified_in_p (cc_reg_1, prev))
14460 /* FIXME: this misses some instructions that ThunderX considers simple
14461 arithmetic.  Simple shifts are missed here. */
14462 if (prev_type == TYPE_ALUS_SREG
14463 || prev_type == TYPE_ALUS_IMM
14464 || prev_type == TYPE_LOGICS_REG
14465 || prev_type == TYPE_LOGICS_IMM)
14466 return true;
14470 if (prev_set
14471 && curr_set
14472 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14473 && any_condjump_p (curr))
14475 /* We're trying to match:
14476 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14477 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14478 (const_int 0))
14479 (label_ref ("SYM"))
14480 (pc)) */
14481 if (SET_DEST (curr_set) == (pc_rtx)
14482 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14483 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14484 && REG_P (SET_DEST (prev_set))
14485 && REGNO (SET_DEST (prev_set))
14486 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14488 /* Fuse ALU operations followed by a conditional branch instruction. */
14489 switch (get_attr_type (prev))
14491 case TYPE_ALU_IMM:
14492 case TYPE_ALU_SREG:
14493 case TYPE_ADC_REG:
14494 case TYPE_ADC_IMM:
14495 case TYPE_ADCS_REG:
14496 case TYPE_ADCS_IMM:
14497 case TYPE_LOGIC_REG:
14498 case TYPE_LOGIC_IMM:
14499 case TYPE_CSEL:
14500 case TYPE_ADR:
14501 case TYPE_MOV_IMM:
14502 case TYPE_SHIFT_REG:
14503 case TYPE_SHIFT_IMM:
14504 case TYPE_BFM:
14505 case TYPE_RBIT:
14506 case TYPE_REV:
14507 case TYPE_EXTEND:
14508 return true;
14510 default:;
14515 return false;
14518 /* Return true iff the instruction fusion described by OP is enabled. */
14520 bool
14521 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14523 return (aarch64_tune_params.fusible_ops & op) != 0;
14526 /* If MEM is in the form of [base+offset], extract the two parts
14527 of the address and store them in BASE and OFFSET; otherwise return false
14528 after clearing BASE and OFFSET. */
14530 bool
14531 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14533 rtx addr;
14535 gcc_assert (MEM_P (mem));
14537 addr = XEXP (mem, 0);
14539 if (REG_P (addr))
14541 *base = addr;
14542 *offset = const0_rtx;
14543 return true;
14546 if (GET_CODE (addr) == PLUS
14547 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14549 *base = XEXP (addr, 0);
14550 *offset = XEXP (addr, 1);
14551 return true;
14554 *base = NULL_RTX;
14555 *offset = NULL_RTX;
14557 return false;
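/* For example:
     (reg x1)                        -> base x1, offset 0
     (plus (reg x1) (const_int 16))  -> base x1, offset 16
     (plus (reg x1) (reg x2))        -> false, BASE and OFFSET cleared.  */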
14560 /* Types for scheduling fusion. */
14561 enum sched_fusion_type
14563 SCHED_FUSION_NONE = 0,
14564 SCHED_FUSION_LD_SIGN_EXTEND,
14565 SCHED_FUSION_LD_ZERO_EXTEND,
14566 SCHED_FUSION_LD,
14567 SCHED_FUSION_ST,
14568 SCHED_FUSION_NUM
14571 /* If INSN is a load or store with an address in the form of [base+offset],
14572 extract the two parts into BASE and OFFSET. Return the scheduling
14573 fusion type of this INSN. */
14575 static enum sched_fusion_type
14576 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14578 rtx x, dest, src;
14579 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14581 gcc_assert (INSN_P (insn));
14582 x = PATTERN (insn);
14583 if (GET_CODE (x) != SET)
14584 return SCHED_FUSION_NONE;
14586 src = SET_SRC (x);
14587 dest = SET_DEST (x);
14589 machine_mode dest_mode = GET_MODE (dest);
14591 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14592 return SCHED_FUSION_NONE;
14594 if (GET_CODE (src) == SIGN_EXTEND)
14596 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14597 src = XEXP (src, 0);
14598 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14599 return SCHED_FUSION_NONE;
14601 else if (GET_CODE (src) == ZERO_EXTEND)
14603 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14604 src = XEXP (src, 0);
14605 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14606 return SCHED_FUSION_NONE;
14609 if (GET_CODE (src) == MEM && REG_P (dest))
14610 extract_base_offset_in_addr (src, base, offset);
14611 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14613 fusion = SCHED_FUSION_ST;
14614 extract_base_offset_in_addr (dest, base, offset);
14616 else
14617 return SCHED_FUSION_NONE;
14619 if (*base == NULL_RTX || *offset == NULL_RTX)
14620 fusion = SCHED_FUSION_NONE;
14622 return fusion;
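/* For instance, a plain load such as "ldr w1, [x2, 8]" is classified as
   SCHED_FUSION_LD with base x2 and offset 8, an SImode sign-extending
   load (ldrsw) as SCHED_FUSION_LD_SIGN_EXTEND, and "str w1, [x2, 16]"
   (or a store of wzr) as SCHED_FUSION_ST, assuming the destination mode
   is valid for sched fusion.  */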
14625 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14627 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14628 and PRI are only calculated for these instructions. For other instructions,
14629 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14630 types of instruction fusion can be added by returning different priorities.
14632 It's important that irrelevant instructions get the largest FUSION_PRI. */
14634 static void
14635 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14636 int *fusion_pri, int *pri)
14638 int tmp, off_val;
14639 rtx base, offset;
14640 enum sched_fusion_type fusion;
14642 gcc_assert (INSN_P (insn));
14644 tmp = max_pri - 1;
14645 fusion = fusion_load_store (insn, &base, &offset);
14646 if (fusion == SCHED_FUSION_NONE)
14648 *pri = tmp;
14649 *fusion_pri = tmp;
14650 return;
14653 /* Set FUSION_PRI according to fusion type and base register. */
14654 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14656 /* Calculate PRI. */
14657 tmp /= 2;
14659 /* INSN with smaller offset goes first. */
14660 off_val = (int)(INTVAL (offset));
14661 if (off_val >= 0)
14662 tmp -= (off_val & 0xfffff);
14663 else
14664 tmp += ((- off_val) & 0xfffff);
14666 *pri = tmp;
14667 return;
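/* For example, two loads "ldr w1, [x2, 4]" and "ldr w3, [x2, 8]" get
   the same FUSION_PRI (same fusion type and base register), while the
   first gets a slightly higher PRI because of its smaller offset, so
   the scheduler tends to place the pair adjacently and in order.  */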
14670 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14671 Adjust priority of sha1h instructions so they are scheduled before
14672 other SHA1 instructions. */
14674 static int
14675 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14677 rtx x = PATTERN (insn);
14679 if (GET_CODE (x) == SET)
14681 x = SET_SRC (x);
14683 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14684 return priority + 10;
14687 return priority;
14690 /* Given OPERANDS of consecutive load/store, check if we can merge
14691 them into ldp/stp. LOAD is true if they are load instructions.
14692 MODE is the mode of memory operands. */
14694 bool
14695 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14696 machine_mode mode)
14698 HOST_WIDE_INT offval_1, offval_2, msize;
14699 enum reg_class rclass_1, rclass_2;
14700 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14702 if (load)
14704 mem_1 = operands[1];
14705 mem_2 = operands[3];
14706 reg_1 = operands[0];
14707 reg_2 = operands[2];
14708 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14709 if (REGNO (reg_1) == REGNO (reg_2))
14710 return false;
14712 else
14714 mem_1 = operands[0];
14715 mem_2 = operands[2];
14716 reg_1 = operands[1];
14717 reg_2 = operands[3];
14720 /* The mems cannot be volatile. */
14721 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14722 return false;
14724 /* If we have SImode and slow unaligned ldp,
14725 check that the alignment is at least 8 bytes. */
14726 if (mode == SImode
14727 && (aarch64_tune_params.extra_tuning_flags
14728 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14729 && !optimize_size
14730 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14731 return false;
14733 /* Check if the addresses are in the form of [base+offset]. */
14734 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14735 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14736 return false;
14737 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14738 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14739 return false;
14741 /* Check if the bases are the same. */
14742 if (!rtx_equal_p (base_1, base_2))
14743 return false;
14745 offval_1 = INTVAL (offset_1);
14746 offval_2 = INTVAL (offset_2);
14747 msize = GET_MODE_SIZE (mode);
14748 /* Check if the offsets are consecutive. */
14749 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14750 return false;
14752 /* Check if the addresses are clobbered by load. */
14753 if (load)
14755 if (reg_mentioned_p (reg_1, mem_1))
14756 return false;
14758 /* In increasing order, the last load can clobber the address. */
14759 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14760 return false;
14763 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14764 rclass_1 = FP_REGS;
14765 else
14766 rclass_1 = GENERAL_REGS;
14768 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14769 rclass_2 = FP_REGS;
14770 else
14771 rclass_2 = GENERAL_REGS;
14773 /* Check if the registers are of the same class. */
14774 if (rclass_1 != rclass_2)
14775 return false;
14777 return true;
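/* When this check succeeds, the ldp/stp peephole patterns can merge e.g.
     ldr w0, [x2]
     ldr w1, [x2, 4]
   into a single "ldp w0, w1, [x2]".  */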
14780 /* Given OPERANDS of consecutive load/store, check if we can merge
14781 them into ldp/stp by adjusting the offset. LOAD is true if they
14782 are load instructions. MODE is the mode of memory operands.
14784 Given below consecutive stores:
14786 str w1, [xb, 0x100]
14787 str w1, [xb, 0x104]
14788 str w1, [xb, 0x108]
14789 str w1, [xb, 0x10c]
14791 Though the offsets are out of the range supported by stp, we can
14792 still pair them after adjusting the offset, like:
14794 add scratch, xb, 0x100
14795 stp w1, w1, [scratch]
14796 stp w1, w1, [scratch, 0x8]
14798 The peephole patterns detecting this opportunity should guarantee
14799 the scratch register is available. */
14801 bool
14802 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14803 scalar_mode mode)
14805 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14806 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14807 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14808 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14810 if (load)
14812 reg_1 = operands[0];
14813 mem_1 = operands[1];
14814 reg_2 = operands[2];
14815 mem_2 = operands[3];
14816 reg_3 = operands[4];
14817 mem_3 = operands[5];
14818 reg_4 = operands[6];
14819 mem_4 = operands[7];
14820 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14821 && REG_P (reg_3) && REG_P (reg_4));
14822 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14823 return false;
14825 else
14827 mem_1 = operands[0];
14828 reg_1 = operands[1];
14829 mem_2 = operands[2];
14830 reg_2 = operands[3];
14831 mem_3 = operands[4];
14832 reg_3 = operands[5];
14833 mem_4 = operands[6];
14834 reg_4 = operands[7];
14836 /* Skip if the memory operand is by itself valid for ldp/stp. */
14837 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14838 return false;
14840 /* The mems cannot be volatile. */
14841 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14842 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14843 return false;
14845 /* Check if the addresses are in the form of [base+offset]. */
14846 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14847 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14848 return false;
14849 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14850 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14851 return false;
14852 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14853 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14854 return false;
14855 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14856 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14857 return false;
14859 /* Check if the bases are the same. */
14860 if (!rtx_equal_p (base_1, base_2)
14861 || !rtx_equal_p (base_2, base_3)
14862 || !rtx_equal_p (base_3, base_4))
14863 return false;
14865 offval_1 = INTVAL (offset_1);
14866 offval_2 = INTVAL (offset_2);
14867 offval_3 = INTVAL (offset_3);
14868 offval_4 = INTVAL (offset_4);
14869 msize = GET_MODE_SIZE (mode);
14870 /* Check if the offsets are consecutive. */
14871 if ((offval_1 != (offval_2 + msize)
14872 || offval_1 != (offval_3 + msize * 2)
14873 || offval_1 != (offval_4 + msize * 3))
14874 && (offval_4 != (offval_3 + msize)
14875 || offval_4 != (offval_2 + msize * 2)
14876 || offval_4 != (offval_1 + msize * 3)))
14877 return false;
14879 /* Check if the addresses are clobbered by load. */
14880 if (load)
14882 if (reg_mentioned_p (reg_1, mem_1)
14883 || reg_mentioned_p (reg_2, mem_2)
14884 || reg_mentioned_p (reg_3, mem_3))
14885 return false;
14887 /* In increasing order, the last load can clobber the address. */
14888 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14889 return false;
14892 /* If we have SImode and slow unaligned ldp,
14893 check that the alignment is at least 8 bytes. */
14894 if (mode == SImode
14895 && (aarch64_tune_params.extra_tuning_flags
14896 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14897 && !optimize_size
14898 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14899 return false;
14901 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14902 rclass_1 = FP_REGS;
14903 else
14904 rclass_1 = GENERAL_REGS;
14906 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14907 rclass_2 = FP_REGS;
14908 else
14909 rclass_2 = GENERAL_REGS;
14911 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14912 rclass_3 = FP_REGS;
14913 else
14914 rclass_3 = GENERAL_REGS;
14916 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14917 rclass_4 = FP_REGS;
14918 else
14919 rclass_4 = GENERAL_REGS;
14921 /* Check if the registers are of the same class. */
14922 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14923 return false;
14925 return true;
14928 /* Given OPERANDS of consecutive load/store, this function pairs them
14929 into ldp/stp after adjusting the offset. It depends on the fact
14930 that addresses of load/store instructions are in increasing order.
14931 MODE is the mode of memory operands. CODE is the rtl operator
14932 which should be applied to all memory operands, it's SIGN_EXTEND,
14933 ZERO_EXTEND or UNKNOWN. */
14935 bool
14936 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14937 scalar_mode mode, RTX_CODE code)
14939 rtx base, offset, t1, t2;
14940 rtx mem_1, mem_2, mem_3, mem_4;
14941 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14943 if (load)
14945 mem_1 = operands[1];
14946 mem_2 = operands[3];
14947 mem_3 = operands[5];
14948 mem_4 = operands[7];
14950 else
14952 mem_1 = operands[0];
14953 mem_2 = operands[2];
14954 mem_3 = operands[4];
14955 mem_4 = operands[6];
14956 gcc_assert (code == UNKNOWN);
14959 extract_base_offset_in_addr (mem_1, &base, &offset);
14960 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14962 /* Adjust the offset so that it fits in an ldp/stp instruction. */
14963 msize = GET_MODE_SIZE (mode);
14964 stp_off_limit = msize * 0x40;
14965 off_val = INTVAL (offset);
14966 abs_off = (off_val < 0) ? -off_val : off_val;
14967 new_off = abs_off % stp_off_limit;
14968 adj_off = abs_off - new_off;
14970 /* Further adjust to make sure all offsets are OK. */
14971 if ((new_off + msize * 2) >= stp_off_limit)
14973 adj_off += stp_off_limit;
14974 new_off -= stp_off_limit;
14977 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14978 if (adj_off >= 0x1000)
14979 return false;
14981 if (off_val < 0)
14983 adj_off = -adj_off;
14984 new_off = -new_off;
14987 /* Create new memory references. */
14988 mem_1 = change_address (mem_1, VOIDmode,
14989 plus_constant (DImode, operands[8], new_off));
14991 /* Check if the adjusted address is OK for ldp/stp. */
14992 if (!aarch64_mem_pair_operand (mem_1, mode))
14993 return false;
14995 msize = GET_MODE_SIZE (mode);
14996 mem_2 = change_address (mem_2, VOIDmode,
14997 plus_constant (DImode,
14998 operands[8],
14999 new_off + msize));
15000 mem_3 = change_address (mem_3, VOIDmode,
15001 plus_constant (DImode,
15002 operands[8],
15003 new_off + msize * 2));
15004 mem_4 = change_address (mem_4, VOIDmode,
15005 plus_constant (DImode,
15006 operands[8],
15007 new_off + msize * 3));
15009 if (code == ZERO_EXTEND)
15011 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15012 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15013 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15014 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15016 else if (code == SIGN_EXTEND)
15018 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15019 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15020 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15021 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15024 if (load)
15026 operands[1] = mem_1;
15027 operands[3] = mem_2;
15028 operands[5] = mem_3;
15029 operands[7] = mem_4;
15031 else
15033 operands[0] = mem_1;
15034 operands[2] = mem_2;
15035 operands[4] = mem_3;
15036 operands[6] = mem_4;
15039 /* Emit adjusting instruction. */
15040 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15041 /* Emit ldp/stp instructions. */
15042 t1 = gen_rtx_SET (operands[0], operands[1]);
15043 t2 = gen_rtx_SET (operands[2], operands[3]);
15044 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15045 t1 = gen_rtx_SET (operands[4], operands[5]);
15046 t2 = gen_rtx_SET (operands[6], operands[7]);
15047 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15048 return true;
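/* Working through the SImode example quoted before
   aarch64_operands_adjust_ok_for_ldpstp: msize == 4, so stp_off_limit
   is 0x100; off_val == 0x100 gives new_off == 0 and adj_off == 0x100,
   which produces the "add scratch, xb, 0x100" followed by the two stp
   instructions shown there.  */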
15051 /* Return true if a pseudo register should be created and used to hold
15052 the GOT address for PIC code. */
15054 bool
15055 aarch64_use_pseudo_pic_reg (void)
15057 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15060 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15062 static int
15063 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15065 switch (XINT (x, 1))
15067 case UNSPEC_GOTSMALLPIC:
15068 case UNSPEC_GOTSMALLPIC28K:
15069 case UNSPEC_GOTTINYPIC:
15070 return 0;
15071 default:
15072 break;
15075 return default_unspec_may_trap_p (x, flags);
15079 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15080 return the log2 of that value. Otherwise return -1. */
15083 aarch64_fpconst_pow_of_2 (rtx x)
15085 const REAL_VALUE_TYPE *r;
15087 if (!CONST_DOUBLE_P (x))
15088 return -1;
15090 r = CONST_DOUBLE_REAL_VALUE (x);
15092 if (REAL_VALUE_NEGATIVE (*r)
15093 || REAL_VALUE_ISNAN (*r)
15094 || REAL_VALUE_ISINF (*r)
15095 || !real_isinteger (r, DFmode))
15096 return -1;
15098 return exact_log2 (real_to_integer (r));
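/* For example, 8.0 returns 3 and 1.0 returns 0, while 0.75, -4.0, NaN
   and infinities all return -1.  */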
15101 /* If X is a vector of equal CONST_DOUBLE values and that value is
15102 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15105 aarch64_vec_fpconst_pow_of_2 (rtx x)
15107 if (GET_CODE (x) != CONST_VECTOR)
15108 return -1;
15110 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15111 return -1;
15113 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15114 if (firstval <= 0)
15115 return -1;
15117 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15118 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15119 return -1;
15121 return firstval;
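/* For example, {4.0, 4.0, 4.0, 4.0} returns 2, while a vector of mixed
   values returns -1.  Note that a vector of 1.0 also returns -1, because
   the scalar helper returns 0 and the "firstval <= 0" check rejects it.  */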
15124 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15125 to float.
15127 __fp16 always promotes through this hook.
15128 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15129 through the generic excess precision logic rather than here. */
15131 static tree
15132 aarch64_promoted_type (const_tree t)
15134 if (SCALAR_FLOAT_TYPE_P (t)
15135 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15136 return float_type_node;
15138 return NULL_TREE;
15141 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15143 static bool
15144 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15145 optimization_type opt_type)
15147 switch (op)
15149 case rsqrt_optab:
15150 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15152 default:
15153 return true;
15157 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15158 if MODE is HFmode, and punt to the generic implementation otherwise. */
15160 static bool
15161 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15163 return (mode == HFmode
15164 ? true
15165 : default_libgcc_floating_mode_supported_p (mode));
15168 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15169 if MODE is HFmode, and punt to the generic implementation otherwise. */
15171 static bool
15172 aarch64_scalar_mode_supported_p (scalar_mode mode)
15174 return (mode == HFmode
15175 ? true
15176 : default_scalar_mode_supported_p (mode));
15179 /* Set the value of FLT_EVAL_METHOD.
15180 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15182 0: evaluate all operations and constants, whose semantic type has at
15183 most the range and precision of type float, to the range and
15184 precision of float; evaluate all other operations and constants to
15185 the range and precision of the semantic type;
15187 N, where _FloatN is a supported interchange floating type
15188 evaluate all operations and constants, whose semantic type has at
15189 most the range and precision of _FloatN type, to the range and
15190 precision of the _FloatN type; evaluate all other operations and
15191 constants to the range and precision of the semantic type;
15193 If we have the ARMv8.2-A extensions then we support _Float16 in native
15194 precision, so we should set this to 16. Otherwise, we support the type,
15195 but want to evaluate expressions in float precision, so set this to
15196 0. */
15198 static enum flt_eval_method
15199 aarch64_excess_precision (enum excess_precision_type type)
15201 switch (type)
15203 case EXCESS_PRECISION_TYPE_FAST:
15204 case EXCESS_PRECISION_TYPE_STANDARD:
15205 /* We can calculate either in 16-bit range and precision or
15206 32-bit range and precision. Make that decision based on whether
15207 we have native support for the ARMv8.2-A 16-bit floating-point
15208 instructions or not. */
15209 return (TARGET_FP_F16INST
15210 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15211 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15212 case EXCESS_PRECISION_TYPE_IMPLICIT:
15213 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15214 default:
15215 gcc_unreachable ();
15217 return FLT_EVAL_METHOD_UNPREDICTABLE;
15220 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15221 scheduled for speculative execution. Reject the long-running division
15222 and square-root instructions. */
15224 static bool
15225 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15227 switch (get_attr_type (insn))
15229 case TYPE_SDIV:
15230 case TYPE_UDIV:
15231 case TYPE_FDIVS:
15232 case TYPE_FDIVD:
15233 case TYPE_FSQRTS:
15234 case TYPE_FSQRTD:
15235 case TYPE_NEON_FP_SQRT_S:
15236 case TYPE_NEON_FP_SQRT_D:
15237 case TYPE_NEON_FP_SQRT_S_Q:
15238 case TYPE_NEON_FP_SQRT_D_Q:
15239 case TYPE_NEON_FP_DIV_S:
15240 case TYPE_NEON_FP_DIV_D:
15241 case TYPE_NEON_FP_DIV_S_Q:
15242 case TYPE_NEON_FP_DIV_D_Q:
15243 return false;
15244 default:
15245 return true;
15249 /* Target-specific selftests. */
15251 #if CHECKING_P
15253 namespace selftest {
15255 /* Selftest for the RTL loader.
15256 Verify that the RTL loader copes with a dump from
15257 print_rtx_function. This is essentially just a test that class
15258 function_reader can handle a real dump, but it also verifies
15259 that lookup_reg_by_dump_name correctly handles hard regs.
15260 The presence of hard reg names in the dump means that the test is
15261 target-specific, hence it is in this file. */
15263 static void
15264 aarch64_test_loading_full_dump ()
15266 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15268 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15270 rtx_insn *insn_1 = get_insn_by_uid (1);
15271 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15273 rtx_insn *insn_15 = get_insn_by_uid (15);
15274 ASSERT_EQ (INSN, GET_CODE (insn_15));
15275 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15277 /* Verify crtl->return_rtx. */
15278 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15279 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15280 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15283 /* Run all target-specific selftests. */
15285 static void
15286 aarch64_run_selftests (void)
15288 aarch64_test_loading_full_dump ();
15291 } // namespace selftest
15293 #endif /* #if CHECKING_P */
15295 #undef TARGET_ADDRESS_COST
15296 #define TARGET_ADDRESS_COST aarch64_address_cost
15298 /* This hook determines whether unnamed bitfields affect the alignment
15299 of the containing structure. The hook returns true if the structure
15300 should inherit the alignment requirements of an unnamed bitfield's
15301 type. */
15302 #undef TARGET_ALIGN_ANON_BITFIELD
15303 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15305 #undef TARGET_ASM_ALIGNED_DI_OP
15306 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15308 #undef TARGET_ASM_ALIGNED_HI_OP
15309 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15311 #undef TARGET_ASM_ALIGNED_SI_OP
15312 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15314 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15315 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15316 hook_bool_const_tree_hwi_hwi_const_tree_true
15318 #undef TARGET_ASM_FILE_START
15319 #define TARGET_ASM_FILE_START aarch64_start_file
15321 #undef TARGET_ASM_OUTPUT_MI_THUNK
15322 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15324 #undef TARGET_ASM_SELECT_RTX_SECTION
15325 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15327 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15328 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15330 #undef TARGET_BUILD_BUILTIN_VA_LIST
15331 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15333 #undef TARGET_CALLEE_COPIES
15334 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15336 #undef TARGET_CAN_ELIMINATE
15337 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15339 #undef TARGET_CAN_INLINE_P
15340 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15342 #undef TARGET_CANNOT_FORCE_CONST_MEM
15343 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15345 #undef TARGET_CASE_VALUES_THRESHOLD
15346 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15348 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15349 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15351 /* Only the least significant bit is used for initialization guard
15352 variables. */
15353 #undef TARGET_CXX_GUARD_MASK_BIT
15354 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15356 #undef TARGET_C_MODE_FOR_SUFFIX
15357 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15359 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15360 #undef TARGET_DEFAULT_TARGET_FLAGS
15361 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15362 #endif
15364 #undef TARGET_CLASS_MAX_NREGS
15365 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15367 #undef TARGET_BUILTIN_DECL
15368 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15370 #undef TARGET_BUILTIN_RECIPROCAL
15371 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15373 #undef TARGET_C_EXCESS_PRECISION
15374 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15376 #undef TARGET_EXPAND_BUILTIN
15377 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15379 #undef TARGET_EXPAND_BUILTIN_VA_START
15380 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15382 #undef TARGET_FOLD_BUILTIN
15383 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15385 #undef TARGET_FUNCTION_ARG
15386 #define TARGET_FUNCTION_ARG aarch64_function_arg
15388 #undef TARGET_FUNCTION_ARG_ADVANCE
15389 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15391 #undef TARGET_FUNCTION_ARG_BOUNDARY
15392 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15394 #undef TARGET_FUNCTION_ARG_PADDING
15395 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15397 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15398 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15400 #undef TARGET_FUNCTION_VALUE
15401 #define TARGET_FUNCTION_VALUE aarch64_function_value
15403 #undef TARGET_FUNCTION_VALUE_REGNO_P
15404 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15406 #undef TARGET_FRAME_POINTER_REQUIRED
15407 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15409 #undef TARGET_GIMPLE_FOLD_BUILTIN
15410 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15412 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15413 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15415 #undef TARGET_INIT_BUILTINS
15416 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15418 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15419 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15420 aarch64_ira_change_pseudo_allocno_class
15422 #undef TARGET_LEGITIMATE_ADDRESS_P
15423 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15425 #undef TARGET_LEGITIMATE_CONSTANT_P
15426 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15428 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15429 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15430 aarch64_legitimize_address_displacement
15432 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15433 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15435 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15436 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15437 aarch64_libgcc_floating_mode_supported_p
15439 #undef TARGET_MANGLE_TYPE
15440 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15442 #undef TARGET_MEMORY_MOVE_COST
15443 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15445 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15446 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15448 #undef TARGET_MUST_PASS_IN_STACK
15449 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15451 /* This target hook should return true if accesses to volatile bitfields
15452 should use the narrowest mode possible. It should return false if these
15453 accesses should use the bitfield container type. */
15454 #undef TARGET_NARROW_VOLATILE_BITFIELD
15455 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15457 #undef TARGET_OPTION_OVERRIDE
15458 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15460 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15461 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15462 aarch64_override_options_after_change
15464 #undef TARGET_OPTION_SAVE
15465 #define TARGET_OPTION_SAVE aarch64_option_save
15467 #undef TARGET_OPTION_RESTORE
15468 #define TARGET_OPTION_RESTORE aarch64_option_restore
15470 #undef TARGET_OPTION_PRINT
15471 #define TARGET_OPTION_PRINT aarch64_option_print
15473 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15474 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15476 #undef TARGET_SET_CURRENT_FUNCTION
15477 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15479 #undef TARGET_PASS_BY_REFERENCE
15480 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15482 #undef TARGET_PREFERRED_RELOAD_CLASS
15483 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15485 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15486 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15488 #undef TARGET_PROMOTED_TYPE
15489 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15491 #undef TARGET_SECONDARY_RELOAD
15492 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15494 #undef TARGET_SHIFT_TRUNCATION_MASK
15495 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15497 #undef TARGET_SETUP_INCOMING_VARARGS
15498 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15500 #undef TARGET_STRUCT_VALUE_RTX
15501 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15503 #undef TARGET_REGISTER_MOVE_COST
15504 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15506 #undef TARGET_RETURN_IN_MEMORY
15507 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15509 #undef TARGET_RETURN_IN_MSB
15510 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15512 #undef TARGET_RTX_COSTS
15513 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15515 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15516 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15518 #undef TARGET_SCHED_ISSUE_RATE
15519 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15521 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15522 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15523 aarch64_sched_first_cycle_multipass_dfa_lookahead
15525 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15526 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15527 aarch64_first_cycle_multipass_dfa_lookahead_guard
15529 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15530 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15531 aarch64_get_separate_components
15533 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15534 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15535 aarch64_components_for_bb
15537 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15538 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15539 aarch64_disqualify_components
15541 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15542 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15543 aarch64_emit_prologue_components
15545 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15546 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15547 aarch64_emit_epilogue_components
15549 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15550 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15551 aarch64_set_handled_components
15553 #undef TARGET_TRAMPOLINE_INIT
15554 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15556 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15557 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15559 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15560 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15562 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15563 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15564 aarch64_builtin_support_vector_misalignment
15566 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15567 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15569 #undef TARGET_VECTORIZE_ADD_STMT_COST
15570 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15572 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15573 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15574 aarch64_builtin_vectorization_cost
15576 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15577 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15579 #undef TARGET_VECTORIZE_BUILTINS
15580 #define TARGET_VECTORIZE_BUILTINS
15582 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15583 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15584 aarch64_builtin_vectorized_function
15586 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15587 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15588 aarch64_autovectorize_vector_sizes
15590 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15591 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15592 aarch64_atomic_assign_expand_fenv
15594 /* Section anchor support. */
15596 #undef TARGET_MIN_ANCHOR_OFFSET
15597 #define TARGET_MIN_ANCHOR_OFFSET -256
15599 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15600 byte offset; we can do much more for larger data types, but have no way
15601 to determine the size of the access. We assume accesses are aligned. */
15602 #undef TARGET_MAX_ANCHOR_OFFSET
15603 #define TARGET_MAX_ANCHOR_OFFSET 4095
15605 #undef TARGET_VECTOR_ALIGNMENT
15606 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15608 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15609 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15610 aarch64_simd_vector_alignment_reachable
15612 /* vec_perm support. */
15614 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15615 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15616 aarch64_vectorize_vec_perm_const_ok
15618 #undef TARGET_INIT_LIBFUNCS
15619 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15621 #undef TARGET_FIXED_CONDITION_CODE_REGS
15622 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15624 #undef TARGET_FLAGS_REGNUM
15625 #define TARGET_FLAGS_REGNUM CC_REGNUM
15627 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15628 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15630 #undef TARGET_ASAN_SHADOW_OFFSET
15631 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15633 #undef TARGET_LEGITIMIZE_ADDRESS
15634 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15636 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15637 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15638 aarch64_use_by_pieces_infrastructure_p
15640 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15641 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15643 #undef TARGET_CAN_USE_DOLOOP_P
15644 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15646 #undef TARGET_SCHED_ADJUST_PRIORITY
15647 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15649 #undef TARGET_SCHED_MACRO_FUSION_P
15650 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15652 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15653 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15655 #undef TARGET_SCHED_FUSION_PRIORITY
15656 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15658 #undef TARGET_UNSPEC_MAY_TRAP_P
15659 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15661 #undef TARGET_USE_PSEUDO_PIC_REG
15662 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15664 #undef TARGET_PRINT_OPERAND
15665 #define TARGET_PRINT_OPERAND aarch64_print_operand
15667 #undef TARGET_PRINT_OPERAND_ADDRESS
15668 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15670 #undef TARGET_OPTAB_SUPPORTED_P
15671 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15673 #undef TARGET_OMIT_STRUCT_RETURN_REG
15674 #define TARGET_OMIT_STRUCT_RETURN_REG true
15676 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15677 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15678 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15680 #undef TARGET_HARD_REGNO_NREGS
15681 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15682 #undef TARGET_HARD_REGNO_MODE_OK
15683 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15685 #undef TARGET_MODES_TIEABLE_P
15686 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15688 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15689 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15690 aarch64_hard_regno_call_part_clobbered
15692 #undef TARGET_CONSTANT_ALIGNMENT
15693 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
15695 #if CHECKING_P
15696 #undef TARGET_RUN_TARGET_SELFTESTS
15697 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15698 #endif /* #if CHECKING_P */
15700 struct gcc_target targetm = TARGET_INITIALIZER;
15702 #include "gt-aarch64.h"