/* Imported from gcc/config/aarch64/aarch64.c
   (blob 1c1400871d4e08ae97d0c8c320f1b183e5a8efb1).  */
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
enum aarch64_address_type {
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
/* Support for command line parsing of boolean flags in the tuning
   structures.  Maps a user-visible NAME to its FLAG bitmask.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.
   FLAGS_ON are set when the extension is enabled; FLAGS_OFF are cleared
   when it is disabled.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
/* Condition codes, in encoding order.  */
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
/* The condition codes of the processor, and the inverse function.
   Indexed by aarch64_cond_code.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespectively of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
1070 /* Implement TARGET_HARD_REGNO_NREGS. */
1072 static unsigned int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
1086 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
/* Condition-code modes live only in the CC register.  */
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
/* General registers can hold anything except the multi-vector
   structure modes.  */
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return true;
1106 if (FP_REGNUM_P (regno))
/* A vector-structure mode occupies several consecutive FP registers;
   the last one used must not run past V31.  */
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1110 else
1111 return true;
1114 return false;
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1118 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1119 clobbers the top 64 bits when restoring the bottom 64 bits. */
1121 static bool
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1124 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1128 machine_mode
1129 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1130 machine_mode mode)
1132 /* Handle modes that fit within single registers. */
1133 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1135 if (GET_MODE_SIZE (mode) >= 4)
1136 return mode;
1137 else
1138 return SImode;
1140 /* Fall back to generic for multi-reg and very large modes. */
1141 else
1142 return choose_hard_reg_mode (regno, nregs, false);
1145 /* Return true if calls to DECL should be treated as
1146 long-calls (ie called via a register). */
1147 static bool
1148 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1150 return false;
1153 /* Return true if calls to symbol-ref SYM should be treated as
1154 long-calls (ie called via a register). */
1155 bool
1156 aarch64_is_long_call_p (rtx sym)
1158 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1161 /* Return true if calls to symbol-ref SYM should not go through
1162 plt stubs. */
1164 bool
1165 aarch64_is_noplt_call_p (rtx sym)
1167 const_tree decl = SYMBOL_REF_DECL (sym);
/* Bypass the PLT only for PIC code calling a symbol that does not bind
   locally, when either -fno-plt is in effect or the decl carries the
   "noplt" attribute.  */
1169 if (flag_pic
1170 && decl
1171 && (!flag_plt
1172 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1173 && !targetm.binds_local_p (decl))
1174 return true;
1176 return false;
1179 /* Return true if the offsets to a zero/sign-extract operation
1180 represent an expression that matches an extend operation. The
1181 operands represent the parameters from
1183 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1184 bool
1185 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1186 rtx extract_imm)
1188 HOST_WIDE_INT mult_val, extract_val;
/* Both immediates must be compile-time integer constants.  */
1190 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1191 return false;
1193 mult_val = INTVAL (mult_imm);
1194 extract_val = INTVAL (extract_imm);
/* The extract width must decompose as a power-of-two extend width
   (bits 3 and up) plus a shift amount of at most 4 (low 3 bits),
   with the multiplier equal to 1 << shift.  */
1196 if (extract_val > 8
1197 && extract_val < GET_MODE_BITSIZE (mode)
1198 && exact_log2 (extract_val & ~7) > 0
1199 && (extract_val & 7) <= 4
1200 && mult_val == (1 << (extract_val & 7)))
1201 return true;
1203 return false;
1206 /* Emit an insn that's a simple single-set. Both the operands must be
1207 known to be valid. */
1208 inline static rtx_insn *
1209 emit_set_insn (rtx x, rtx y)
1211 return emit_insn (gen_rtx_SET (x, y));
1214 /* X and Y are two things to compare using CODE. Emit the compare insn and
1215 return the rtx for register 0 in the proper mode. */
1217 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
/* Pick the CC mode appropriate for this comparison.  */
1219 machine_mode mode = SELECT_CC_MODE (code, x, y);
1220 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1222 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1223 return cc_reg;
1226 /* Build the SYMBOL_REF for __tls_get_addr. */
1228 static GTY(()) rtx tls_get_addr_libfunc;
1231 aarch64_tls_get_addr (void)
/* Create the libfunc symbol lazily and cache it; the GTY marker keeps
   the cached rtx alive across garbage collection.  */
1233 if (!tls_get_addr_libfunc)
1234 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1235 return tls_get_addr_libfunc;
1238 /* Return the TLS model to use for ADDR. */
1240 static enum tls_model
1241 tls_symbolic_operand_type (rtx addr)
1243 enum tls_model tls_kind = TLS_MODEL_NONE;
1244 rtx sym, addend;
1246 if (GET_CODE (addr) == CONST)
1248 split_const (addr, &sym, &addend);
1249 if (GET_CODE (sym) == SYMBOL_REF)
1250 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1252 else if (GET_CODE (addr) == SYMBOL_REF)
1253 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1255 return tls_kind;
1258 /* We'll allow lo_sum's in addresses in our legitimate addresses
1259 so that combine would take care of combining addresses where
1260 necessary, but for generation purposes, we'll generate the address
1261 as :
1262 RTL Absolute
1263 tmp = hi (symbol_ref); adrp x1, foo
1264 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1267 PIC TLS
1268 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1269 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1270 bl __tls_get_addr
1273 Load TLS symbol, depending on TLS mechanism and TLS access model.
1275 Global Dynamic - Traditional TLS:
1276 adrp tmp, :tlsgd:imm
1277 add dest, tmp, #:tlsgd_lo12:imm
1278 bl __tls_get_addr
1280 Global Dynamic - TLS Descriptors:
1281 adrp dest, :tlsdesc:imm
1282 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1283 add dest, dest, #:tlsdesc_lo12:imm
1284 blr tmp
1285 mrs tp, tpidr_el0
1286 add dest, dest, tp
1288 Initial Exec:
1289 mrs tp, tpidr_el0
1290 adrp tmp, :gottprel:imm
1291 ldr dest, [tmp, #:gottprel_lo12:imm]
1292 add dest, dest, tp
1294 Local Exec:
1295 mrs tp, tpidr_el0
1296 add t0, tp, #:tprel_hi12:imm, lsl #12
1297 add t0, t0, #:tprel_lo12_nc:imm
/* Load symbolic constant IMM into DEST, emitting the access sequence
   selected by symbol classification TYPE (see the sequence catalogue in
   the comment above).  */
1300 static void
1301 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1302 enum aarch64_symbol_type type)
1304 switch (type)
1306 case SYMBOL_SMALL_ABSOLUTE:
1308 /* In ILP32, the mode of dest can be either SImode or DImode. */
1309 rtx tmp_reg = dest;
1310 machine_mode mode = GET_MODE (dest);
1312 gcc_assert (mode == Pmode || mode == ptr_mode);
1314 if (can_create_pseudo_p ())
1315 tmp_reg = gen_reg_rtx (mode);
1317 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1318 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1319 return;
1322 case SYMBOL_TINY_ABSOLUTE:
1323 emit_insn (gen_rtx_SET (dest, imm));
1324 return;
1326 case SYMBOL_SMALL_GOT_28K:
1328 machine_mode mode = GET_MODE (dest);
1329 rtx gp_rtx = pic_offset_table_rtx;
1330 rtx insn;
1331 rtx mem;
1333 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1334 here before rtl expand. Tree IVOPT will generate rtl pattern to
1335 decide rtx costs, in which case pic_offset_table_rtx is not
1336 initialized. For that case no need to generate the first adrp
1337 instruction as the final cost for global variable access is
1338 one instruction. */
1339 if (gp_rtx != NULL)
1341 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1342 using the page base as GOT base, the first page may be wasted,
1343 in the worst scenario, there is only 28K space for GOT).
1345 The generated instruction sequence for accessing a global variable is
1348 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1350 Only one instruction needed. But we must initialize
1351 pic_offset_table_rtx properly. We generate initialize insn for
1352 every global access, and allow CSE to remove all redundant.
1354 The final instruction sequences will look like the following
1355 for multiple global variable accesses.
1357 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1359 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1360 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1361 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1362 ... */
1364 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1365 crtl->uses_pic_offset_table = 1;
1366 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1368 if (mode != GET_MODE (gp_rtx))
1369 gp_rtx = gen_lowpart (mode, gp_rtx);
1373 if (mode == ptr_mode)
1375 if (mode == DImode)
1376 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1377 else
1378 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1380 mem = XVECEXP (SET_SRC (insn), 0, 0);
1382 else
1384 gcc_assert (mode == Pmode);
1386 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1387 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1390 /* The operand is expected to be MEM. Whenever the related insn
1391 pattern changed, above code which calculate mem should be
1392 updated. */
1393 gcc_assert (GET_CODE (mem) == MEM);
1394 MEM_READONLY_P (mem) = 1;
1395 MEM_NOTRAP_P (mem) = 1;
1396 emit_insn (insn);
1397 return;
1400 case SYMBOL_SMALL_GOT_4G:
1402 /* In ILP32, the mode of dest can be either SImode or DImode,
1403 while the got entry is always of SImode size. The mode of
1404 dest depends on how dest is used: if dest is assigned to a
1405 pointer (e.g. in the memory), it has SImode; it may have
1406 DImode if dest is dereferenced to access the memory.
1407 This is why we have to handle three different ldr_got_small
1408 patterns here (two patterns for ILP32). */
1410 rtx insn;
1411 rtx mem;
1412 rtx tmp_reg = dest;
1413 machine_mode mode = GET_MODE (dest);
1415 if (can_create_pseudo_p ())
1416 tmp_reg = gen_reg_rtx (mode);
1418 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1419 if (mode == ptr_mode)
1421 if (mode == DImode)
1422 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1423 else
1424 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1426 mem = XVECEXP (SET_SRC (insn), 0, 0);
1428 else
1430 gcc_assert (mode == Pmode);
1432 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1433 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
/* As for the 28K case: the extracted operand must be the GOT MEM.  */
1436 gcc_assert (GET_CODE (mem) == MEM);
1437 MEM_READONLY_P (mem) = 1;
1438 MEM_NOTRAP_P (mem) = 1;
1439 emit_insn (insn);
1440 return;
1443 case SYMBOL_SMALL_TLSGD:
1445 rtx_insn *insns;
1446 machine_mode mode = GET_MODE (dest);
1447 rtx result = gen_rtx_REG (mode, R0_REGNUM);
/* Wrap the __tls_get_addr call in a libcall block so the sequence
   can be CSEd against IMM.  */
1449 start_sequence ();
1450 if (TARGET_ILP32)
1451 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1452 else
1453 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1454 insns = get_insns ();
1455 end_sequence ();
1457 RTL_CONST_CALL_P (insns) = 1;
1458 emit_libcall_block (insns, dest, result, imm);
1459 return;
1462 case SYMBOL_SMALL_TLSDESC:
1464 machine_mode mode = GET_MODE (dest);
1465 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1466 rtx tp;
1468 gcc_assert (mode == Pmode || mode == ptr_mode);
1470 /* In ILP32, the got entry is always of SImode size. Unlike
1471 small GOT, the dest is fixed at reg 0. */
1472 if (TARGET_ILP32)
1473 emit_insn (gen_tlsdesc_small_si (imm));
1474 else
1475 emit_insn (gen_tlsdesc_small_di (imm));
1476 tp = aarch64_load_tp (NULL);
1478 if (mode != Pmode)
1479 tp = gen_lowpart (mode, tp);
1481 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1482 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1483 return;
1486 case SYMBOL_SMALL_TLSIE:
1488 /* In ILP32, the mode of dest can be either SImode or DImode,
1489 while the got entry is always of SImode size. The mode of
1490 dest depends on how dest is used: if dest is assigned to a
1491 pointer (e.g. in the memory), it has SImode; it may have
1492 DImode if dest is dereferenced to access the memory.
1493 This is why we have to handle three different tlsie_small
1494 patterns here (two patterns for ILP32). */
1495 machine_mode mode = GET_MODE (dest);
1496 rtx tmp_reg = gen_reg_rtx (mode);
1497 rtx tp = aarch64_load_tp (NULL);
1499 if (mode == ptr_mode)
1501 if (mode == DImode)
1502 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1503 else
1505 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1506 tp = gen_lowpart (mode, tp);
1509 else
1511 gcc_assert (mode == Pmode);
1512 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1515 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1516 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1517 return;
1520 case SYMBOL_TLSLE12:
1521 case SYMBOL_TLSLE24:
1522 case SYMBOL_TLSLE32:
1523 case SYMBOL_TLSLE48:
1525 machine_mode mode = GET_MODE (dest);
1526 rtx tp = aarch64_load_tp (NULL);
1528 if (mode != Pmode)
1529 tp = gen_lowpart (mode, tp);
/* Local-exec: each width variant adds a different tprel offset form
   to the thread pointer.  */
1531 switch (type)
1533 case SYMBOL_TLSLE12:
1534 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1535 (dest, tp, imm));
1536 break;
1537 case SYMBOL_TLSLE24:
1538 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1539 (dest, tp, imm));
1540 break;
1541 case SYMBOL_TLSLE32:
1542 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1543 (dest, imm));
1544 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1545 (dest, dest, tp));
1546 break;
1547 case SYMBOL_TLSLE48:
1548 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1549 (dest, imm));
1550 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1551 (dest, dest, tp));
1552 break;
1553 default:
1554 gcc_unreachable ();
1557 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1558 return;
1561 case SYMBOL_TINY_GOT:
1562 emit_insn (gen_ldr_got_tiny (dest, imm));
1563 return;
1565 case SYMBOL_TINY_TLSIE:
1567 machine_mode mode = GET_MODE (dest);
1568 rtx tp = aarch64_load_tp (NULL);
1570 if (mode == ptr_mode)
1572 if (mode == DImode)
1573 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1574 else
1576 tp = gen_lowpart (mode, tp);
1577 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1580 else
1582 gcc_assert (mode == Pmode);
1583 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1586 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1587 return;
1590 default:
1591 gcc_unreachable ();
1595 /* Emit a move from SRC to DEST. Assume that the move expanders can
1596 handle all moves if !can_create_pseudo_p (). The distinction is
1597 important because, unlike emit_move_insn, the move expanders know
1598 how to force Pmode objects into the constant pool even when the
1599 constant pool address is not itself legitimate. */
1600 static rtx
1601 aarch64_emit_move (rtx dest, rtx src)
/* Returns the emitted insn (the value of the chosen emit routine).  */
1603 return (can_create_pseudo_p ()
1604 ? emit_move_insn (dest, src)
1605 : emit_move_insn_1 (dest, src));
1608 /* Split a 128-bit move operation into two 64-bit move operations,
1609 taking care to handle partial overlap of register to register
1610 copies. Special cases are needed when moving between GP regs and
1611 FP regs. SRC can be a register, constant or memory; DST a register
1612 or memory. If either operand is memory it must not have any side
1613 effects. */
1614 void
1615 aarch64_split_128bit_move (rtx dst, rtx src)
1617 rtx dst_lo, dst_hi;
1618 rtx src_lo, src_hi;
1620 machine_mode mode = GET_MODE (dst);
1622 gcc_assert (mode == TImode || mode == TFmode)
1623 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1624 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1626 if (REG_P (dst) && REG_P (src))
1628 int src_regno = REGNO (src);
1629 int dst_regno = REGNO (dst);
1631 /* Handle FP <-> GP regs. */
1632 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1634 src_lo = gen_lowpart (word_mode, src);
1635 src_hi = gen_highpart (word_mode, src);
1637 if (mode == TImode)
1639 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1640 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1642 else
1644 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1645 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1647 return;
1649 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1651 dst_lo = gen_lowpart (word_mode, dst);
1652 dst_hi = gen_highpart (word_mode, dst);
1654 if (mode == TImode)
1656 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1657 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1659 else
1661 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1662 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1664 return;
/* Generic case: split into two word_mode moves, ordering them so that
   the half written first is not a source of the half written second.  */
1668 dst_lo = gen_lowpart (word_mode, dst);
1669 dst_hi = gen_highpart (word_mode, dst);
1670 src_lo = gen_lowpart (word_mode, src);
1671 src_hi = gen_highpart_mode (word_mode, mode, src);
1673 /* At most one pairing may overlap. */
1674 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1676 aarch64_emit_move (dst_hi, src_hi);
1677 aarch64_emit_move (dst_lo, src_lo);
1679 else
1681 aarch64_emit_move (dst_lo, src_lo);
1682 aarch64_emit_move (dst_hi, src_hi);
1686 bool
1687 aarch64_split_128bit_move_p (rtx dst, rtx src)
1689 return (! REG_P (src)
1690 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1693 /* Split a complex SIMD combine. */
1695 void
1696 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
/* Combine two 64-bit vector (or scalar DI/DF) halves SRC1/SRC2 into the
   128-bit vector DST using the mode-specific combine pattern.  */
1698 machine_mode src_mode = GET_MODE (src1);
1699 machine_mode dst_mode = GET_MODE (dst);
1701 gcc_assert (VECTOR_MODE_P (dst_mode));
1702 gcc_assert (register_operand (dst, dst_mode)
1703 && register_operand (src1, src_mode)
1704 && register_operand (src2, src_mode));
1706 rtx (*gen) (rtx, rtx, rtx);
1708 switch (src_mode)
1710 case E_V8QImode:
1711 gen = gen_aarch64_simd_combinev8qi;
1712 break;
1713 case E_V4HImode:
1714 gen = gen_aarch64_simd_combinev4hi;
1715 break;
1716 case E_V2SImode:
1717 gen = gen_aarch64_simd_combinev2si;
1718 break;
1719 case E_V4HFmode:
1720 gen = gen_aarch64_simd_combinev4hf;
1721 break;
1722 case E_V2SFmode:
1723 gen = gen_aarch64_simd_combinev2sf;
1724 break;
1725 case E_DImode:
1726 gen = gen_aarch64_simd_combinedi;
1727 break;
1728 case E_DFmode:
1729 gen = gen_aarch64_simd_combinedf;
1730 break;
1731 default:
1732 gcc_unreachable ();
1735 emit_insn (gen (dst, src1, src2));
1736 return;
1739 /* Split a complex SIMD move. */
1741 void
1742 aarch64_split_simd_move (rtx dst, rtx src)
/* Split a 128-bit vector register-to-register move using the
   mode-specific split pattern.  Only the REG/REG case is handled here.  */
1744 machine_mode src_mode = GET_MODE (src);
1745 machine_mode dst_mode = GET_MODE (dst);
1747 gcc_assert (VECTOR_MODE_P (dst_mode));
1749 if (REG_P (dst) && REG_P (src))
1751 rtx (*gen) (rtx, rtx);
1753 gcc_assert (VECTOR_MODE_P (src_mode));
1755 switch (src_mode)
1757 case E_V16QImode:
1758 gen = gen_aarch64_split_simd_movv16qi;
1759 break;
1760 case E_V8HImode:
1761 gen = gen_aarch64_split_simd_movv8hi;
1762 break;
1763 case E_V4SImode:
1764 gen = gen_aarch64_split_simd_movv4si;
1765 break;
1766 case E_V2DImode:
1767 gen = gen_aarch64_split_simd_movv2di;
1768 break;
1769 case E_V8HFmode:
1770 gen = gen_aarch64_split_simd_movv8hf;
1771 break;
1772 case E_V4SFmode:
1773 gen = gen_aarch64_split_simd_movv4sf;
1774 break;
1775 case E_V2DFmode:
1776 gen = gen_aarch64_split_simd_movv2df;
1777 break;
1778 default:
1779 gcc_unreachable ();
1782 emit_insn (gen (dst, src));
1783 return;
/* Return true if constant X equals constant Y zero-extended from YMODE
   to XMODE.  The zero-extension of Y must be constant-foldable.  */
1787 bool
1788 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1789 machine_mode ymode, rtx y)
1791 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1792 gcc_assert (r != NULL);
1793 return rtx_equal_p (x, r);
/* Return a register holding VALUE in MODE: a fresh pseudo when pseudos
   are available, otherwise move VALUE into the supplied register X.  */
1797 static rtx
1798 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1800 if (can_create_pseudo_p ())
1801 return force_reg (mode, value);
1802 else
1804 x = aarch64_emit_move (x, value);
1805 return x;
/* Return (REG + OFFSET) as a legitimate plus expression in MODE.  When
   OFFSET is not a valid add immediate, it is first materialized into a
   register via TEMP and folded into REG.  */
1810 static rtx
1811 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1812 HOST_WIDE_INT offset)
1814 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1816 rtx high;
1817 /* Load the full offset into a register. This
1818 might be improvable in the future. */
1819 high = GEN_INT (offset);
1820 offset = 0;
1821 high = aarch64_force_temporary (mode, temp, high);
1822 reg = aarch64_force_temporary (mode, temp,
1823 gen_rtx_PLUS (mode, high, reg));
1825 return plus_constant (mode, reg, offset);
/* Synthesize immediate IMM of MODE into DEST using the fewest
   instructions (mov/movn/movz/movk and bitmask immediates).  Emit the
   sequence only when GENERATE is true; always return the number of
   instructions the sequence needs.  */
1828 static int
1829 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1830 scalar_int_mode mode)
1832 int i;
1833 unsigned HOST_WIDE_INT val, val2, mask;
1834 int one_match, zero_match;
1835 int num_insns;
1837 val = INTVAL (imm);
/* Single-instruction immediates (mov/movz/movn/bitmask).  */
1839 if (aarch64_move_imm (val, mode))
1841 if (generate)
1842 emit_insn (gen_rtx_SET (dest, imm));
1843 return 1;
1846 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1847 (with XXXX non-zero). In that case check to see if the move can be done in
1848 a smaller mode. */
1849 val2 = val & 0xffffffff;
1850 if (mode == DImode
1851 && aarch64_move_imm (val2, SImode)
1852 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1854 if (generate)
1855 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1857 /* Check if we have to emit a second instruction by checking to see
1858 if any of the upper 32 bits of the original DI mode value is set. */
1859 if (val == val2)
1860 return 1;
1862 i = (val >> 48) ? 48 : 32;
1864 if (generate)
1865 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1866 GEN_INT ((val >> i) & 0xffff)))
1868 return 2;
/* Values fitting in 32 bits: movz of the low 16 bits plus one movk.  */
1871 if ((val >> 32) == 0 || mode == SImode)
1873 if (generate)
1875 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1876 if (mode == SImode)
1877 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1878 GEN_INT ((val >> 16) & 0xffff)));
1879 else
1880 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1881 GEN_INT ((val >> 16) & 0xffff)));
1883 return 2;
1886 /* Remaining cases are all for DImode. */
/* Count how many 16-bit chunks are all-zero and all-one to choose the
   cheapest starting point.  */
1888 mask = 0xffff;
1889 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1890 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1891 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1892 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1894 if (zero_match != 2 && one_match != 2)
1896 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1897 For a 64-bit bitmask try whether changing 16 bits to all ones or
1898 zeroes creates a valid bitmask. To check any repeated bitmask,
1899 try using 16 bits from the other 32-bit half of val. */
1901 for (i = 0; i < 64; i += 16, mask <<= 16)
1903 val2 = val & ~mask;
1904 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1905 break;
1906 val2 = val | mask;
1907 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1908 break;
1909 val2 = val2 & ~mask;
1910 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1911 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1912 break;
1914 if (i != 64)
1916 if (generate)
1918 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1919 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1920 GEN_INT ((val >> i) & 0xffff)));
1922 return 2;
1926 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1927 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1928 otherwise skip zero bits. */
1930 num_insns = 1;
1931 mask = 0xffff;
1932 val2 = one_match > zero_match ? ~val : val;
1933 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1935 if (generate)
1936 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1937 ? (val | ~(mask << i))
1938 : (val & (mask << i)))));
1939 for (i += 16; i < 64; i += 16)
1941 if ((val2 & (mask << i)) == 0)
1942 continue;
1943 if (generate)
1944 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1945 GEN_INT ((val >> i) & 0xffff)));
1946 num_insns ++;
1949 return num_insns;
/* Expand a move of an arbitrary immediate or symbolic constant IMM into
   register DEST (SImode or DImode): symbols are classified and routed to
   the appropriate access sequence, other non-CONST_INT constants go via
   the constant pool, and plain integers use the mov-immediate
   synthesizer.  */
1953 void
1954 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1956 machine_mode mode = GET_MODE (dest);
1958 gcc_assert (mode == SImode || mode == DImode);
1960 /* Check on what type of symbol it is. */
1961 scalar_int_mode int_mode;
1962 if ((GET_CODE (imm) == SYMBOL_REF
1963 || GET_CODE (imm) == LABEL_REF
1964 || GET_CODE (imm) == CONST)
1965 && is_a <scalar_int_mode> (mode, &int_mode))
1967 rtx mem, base, offset;
1968 enum aarch64_symbol_type sty;
1970 /* If we have (const (plus symbol offset)), separate out the offset
1971 before we start classifying the symbol. */
1972 split_const (imm, &base, &offset);
1974 sty = aarch64_classify_symbol (base, offset);
1975 switch (sty)
1977 case SYMBOL_FORCE_TO_MEM:
1978 if (offset != const0_rtx
1979 && targetm.cannot_force_const_mem (int_mode, imm))
1981 gcc_assert (can_create_pseudo_p ());
1982 base = aarch64_force_temporary (int_mode, dest, base);
1983 base = aarch64_add_offset (int_mode, NULL, base,
1984 INTVAL (offset));
1985 aarch64_emit_move (dest, base);
1986 return;
1989 mem = force_const_mem (ptr_mode, imm);
1990 gcc_assert (mem);
1992 /* If we aren't generating PC relative literals, then
1993 we need to expand the literal pool access carefully.
1994 This is something that needs to be done in a number
1995 of places, so could well live as a separate function. */
1996 if (!aarch64_pcrelative_literal_loads)
1998 gcc_assert (can_create_pseudo_p ());
1999 base = gen_reg_rtx (ptr_mode);
2000 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2001 if (ptr_mode != Pmode)
2002 base = convert_memory_address (Pmode, base);
2003 mem = gen_rtx_MEM (ptr_mode, base);
2006 if (int_mode != ptr_mode)
2007 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2009 emit_insn (gen_rtx_SET (dest, mem));
2011 return;
2013 case SYMBOL_SMALL_TLSGD:
2014 case SYMBOL_SMALL_TLSDESC:
2015 case SYMBOL_SMALL_TLSIE:
2016 case SYMBOL_SMALL_GOT_28K:
2017 case SYMBOL_SMALL_GOT_4G:
2018 case SYMBOL_TINY_GOT:
2019 case SYMBOL_TINY_TLSIE:
/* GOT/TLS sequences cannot fold an addend; add it separately.  */
2020 if (offset != const0_rtx)
2022 gcc_assert(can_create_pseudo_p ());
2023 base = aarch64_force_temporary (int_mode, dest, base);
2024 base = aarch64_add_offset (int_mode, NULL, base,
2025 INTVAL (offset));
2026 aarch64_emit_move (dest, base);
2027 return;
2029 /* FALLTHRU */
2031 case SYMBOL_SMALL_ABSOLUTE:
2032 case SYMBOL_TINY_ABSOLUTE:
2033 case SYMBOL_TLSLE12:
2034 case SYMBOL_TLSLE24:
2035 case SYMBOL_TLSLE32:
2036 case SYMBOL_TLSLE48:
2037 aarch64_load_symref_appropriately (dest, imm, sty);
2038 return;
2040 default:
2041 gcc_unreachable ();
/* Non-integer constants (e.g. HIGH expressions or pool candidates).  */
2045 if (!CONST_INT_P (imm))
2047 if (GET_CODE (imm) == HIGH)
2048 emit_insn (gen_rtx_SET (dest, imm));
2049 else
2051 rtx mem = force_const_mem (mode, imm);
2052 gcc_assert (mem);
2053 emit_insn (gen_rtx_SET (dest, mem));
2056 return;
2059 aarch64_internal_mov_immediate (dest, imm, true,
2060 as_a <scalar_int_mode> (mode));
2063 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2064 temporary value if necessary. FRAME_RELATED_P should be true if
2065 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2066 to the generated instructions. If SCRATCHREG is known to hold
2067 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2068 immediate again.
2070 Since this function may be used to adjust the stack pointer, we must
2071 ensure that it cannot cause transient stack deallocation (for example
2072 by first incrementing SP and then decrementing when adjusting by a
2073 large immediate). */
2075 static void
2076 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2077 int scratchreg, HOST_WIDE_INT delta,
2078 bool frame_related_p, bool emit_move_imm)
2080 HOST_WIDE_INT mdelta = abs_hwi (delta);
2081 rtx this_rtx = gen_rtx_REG (mode, regnum);
2082 rtx_insn *insn;
/* Nothing to do for a zero adjustment.  */
2084 if (!mdelta)
2085 return;
2087 /* Single instruction adjustment. */
2088 if (aarch64_uimm12_shift (mdelta))
2090 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2091 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2092 return;
2095 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2096 Only do this if mdelta is not a 16-bit move as adjusting using a move
2097 is better. */
2098 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2100 HOST_WIDE_INT low_off = mdelta & 0xfff;
/* Both partial adjustments carry the sign of DELTA, so the register
   never transiently moves past its final value.  */
2102 low_off = delta < 0 ? -low_off : low_off;
2103 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2104 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2105 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2106 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2107 return;
2110 /* Emit a move immediate if required and an addition/subtraction. */
2111 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2112 if (emit_move_imm)
2113 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2114 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2115 : gen_add2_insn (this_rtx, scratch_rtx));
2116 if (frame_related_p)
/* Record the net CFA adjustment explicitly since the scratch-register
   form is not directly interpretable by the CFI machinery.  */
2118 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2119 rtx adj = plus_constant (mode, this_rtx, delta);
2120 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
/* Add DELTA to REGNUM (non-frame-related, always emitting the move
   immediate when one is needed).  */
2124 static inline void
2125 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2126 HOST_WIDE_INT delta)
2128 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
/* Add DELTA to the stack pointer, marking the insns frame-related.
   EMIT_MOVE_IMM may be false when SCRATCHREG already holds abs(delta).  */
2131 static inline void
2132 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2134 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2135 true, emit_move_imm);
/* Subtract DELTA from the stack pointer; FRAME_RELATED_P controls
   whether the insns are marked frame-related.  */
2138 static inline void
2139 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2141 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2142 frame_related_p, true);
2145 static bool
2146 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2147 tree exp ATTRIBUTE_UNUSED)
2149 /* Currently, always true. */
2150 return true;
2153 /* Implement TARGET_PASS_BY_REFERENCE. */
2155 static bool
2156 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2157 machine_mode mode,
2158 const_tree type,
2159 bool named ATTRIBUTE_UNUSED)
2161 HOST_WIDE_INT size;
2162 machine_mode dummymode;
2163 int nregs;
2165 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2166 size = (mode == BLKmode && type)
2167 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2169 /* Aggregates are passed by reference based on their size. */
2170 if (type && AGGREGATE_TYPE_P (type))
2172 size = int_size_in_bytes (type);
2175 /* Variable sized arguments are always passed by reference. */
2176 if (size < 0)
2177 return true;
2179 /* Can this be a candidate to be passed in fp/simd register(s)? */
2180 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2181 &dummymode, &nregs,
2182 NULL))
2183 return false;
2185 /* Arguments which are variable sized or larger than 2 registers are
2186 passed by reference unless they are a homogeneous floating point
2187 aggregate. */
2188 return size > 2 * UNITS_PER_WORD;
2191 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2192 static bool
2193 aarch64_return_in_msb (const_tree valtype)
2195 machine_mode dummy_mode;
2196 int dummy_int;
2198 /* Never happens in little-endian mode. */
2199 if (!BYTES_BIG_ENDIAN)
2200 return false;
2202 /* Only composite types smaller than or equal to 16 bytes can
2203 be potentially returned in registers. */
2204 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2205 || int_size_in_bytes (valtype) <= 0
2206 || int_size_in_bytes (valtype) > 16)
2207 return false;
2209 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2210 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2211 is always passed/returned in the least significant bits of fp/simd
2212 register(s). */
2213 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2214 &dummy_mode, &dummy_int, NULL))
2215 return false;
2217 return true;
2220 /* Implement TARGET_FUNCTION_VALUE.
2221 Define how to find the value returned by a function. */
2223 static rtx
2224 aarch64_function_value (const_tree type, const_tree func,
2225 bool outgoing ATTRIBUTE_UNUSED)
2227 machine_mode mode;
2228 int unsignedp;
2229 int count;
2230 machine_mode ag_mode;
2232 mode = TYPE_MODE (type);
/* Integral values are promoted per the function-return promotion rules.  */
2233 if (INTEGRAL_TYPE_P (type))
2234 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
/* Big-endian MSB-padded composites: widen to a whole-word integer mode.  */
2236 if (aarch64_return_in_msb (type))
2238 HOST_WIDE_INT size = int_size_in_bytes (type);
2240 if (size % UNITS_PER_WORD != 0)
2242 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2243 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
/* FP/SIMD candidates: single V-register, or a PARALLEL spanning
   V0..V(count-1) for composites.  */
2247 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2248 &ag_mode, &count, NULL))
2250 if (!aarch64_composite_type_p (type, mode))
2252 gcc_assert (count == 1 && mode == ag_mode);
2253 return gen_rtx_REG (mode, V0_REGNUM);
2255 else
2257 int i;
2258 rtx par;
2260 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2261 for (i = 0; i < count; i++)
2263 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2264 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2265 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2266 XVECEXP (par, 0, i) = tmp;
2268 return par;
2271 else
2272 return gen_rtx_REG (mode, R0_REGNUM);
2275 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2276 Return true if REGNO is the number of a hard register in which the values
2277 of called function may come back. */
2279 static bool
2280 aarch64_function_value_regno_p (const unsigned int regno)
2282 /* Maximum of 16 bytes can be returned in the general registers. Examples
2283 of 16-byte return values are: 128-bit integers and 16-byte small
2284 structures (excluding homogeneous floating-point aggregates). */
2285 if (regno == R0_REGNUM || regno == R1_REGNUM)
2286 return true;
2288 /* Up to four fp/simd registers can return a function value, e.g. a
2289 homogeneous floating-point aggregate having four members. */
2290 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2291 return TARGET_FLOAT;
2293 return false;
2296 /* Implement TARGET_RETURN_IN_MEMORY.
2298 If the type T of the result of a function is such that
2299 void func (T arg)
2300 would require that arg be passed as a value in a register (or set of
2301 registers) according to the parameter passing rules, then the result
2302 is returned in the same registers as would be used for such an
2303 argument. */
2305 static bool
2306 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2308 HOST_WIDE_INT size;
2309 machine_mode ag_mode;
2310 int count;
2312 if (!AGGREGATE_TYPE_P (type)
2313 && TREE_CODE (type) != COMPLEX_TYPE
2314 && TREE_CODE (type) != VECTOR_TYPE)
2315 /* Simple scalar types always returned in registers. */
2316 return false;
2318 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2319 type,
2320 &ag_mode,
2321 &count,
2322 NULL))
2323 return false;
2325 /* Types larger than 2 registers returned in memory. */
2326 size = int_size_in_bytes (type);
2327 return (size < 0 || size > 2 * UNITS_PER_WORD);
2330 static bool
2331 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2332 const_tree type, int *nregs)
2334 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2335 return aarch64_vfp_is_call_or_return_candidate (mode,
2336 type,
2337 &pcum->aapcs_vfp_rmode,
2338 nregs,
2339 NULL);
2342 /* Given MODE and TYPE of a function argument, return the alignment in
2343 bits. The idea is to suppress any stronger alignment requested by
2344 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2345 This is a helper function for local use only. */
2347 static unsigned int
2348 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2350 if (!type)
2351 return GET_MODE_ALIGNMENT (mode);
2353 if (integer_zerop (TYPE_SIZE (type)))
2354 return 0;
2356 gcc_assert (TYPE_MODE (type) == mode);
2358 if (!AGGREGATE_TYPE_P (type))
2359 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2361 if (TREE_CODE (type) == ARRAY_TYPE)
2362 return TYPE_ALIGN (TREE_TYPE (type));
2364 unsigned int alignment = 0;
2365 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2366 if (TREE_CODE (field) == FIELD_DECL)
2367 alignment = std::max (alignment, DECL_ALIGN (field));
2369 return alignment;
2372 /* Layout a function argument according to the AAPCS64 rules. The rule
2373 numbers refer to the rule numbers in the AAPCS64. */
2375 static void
2376 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2377 const_tree type,
2378 bool named ATTRIBUTE_UNUSED)
2380 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2381 int ncrn, nvrn, nregs;
2382 bool allocate_ncrn, allocate_nvrn;
2383 HOST_WIDE_INT size;
2385 /* We need to do this once per argument. */
2386 if (pcum->aapcs_arg_processed)
2387 return;
2389 pcum->aapcs_arg_processed = true;
2391 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2392 size
2393 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2394 UNITS_PER_WORD);
2396 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2397 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2398 mode,
2399 type,
2400 &nregs);
2402 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2403 The following code thus handles passing by SIMD/FP registers first. */
2405 nvrn = pcum->aapcs_nvrn;
2407 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2408 and homogenous short-vector aggregates (HVA). */
2409 if (allocate_nvrn)
2411 if (!TARGET_FLOAT)
2412 aarch64_err_no_fpadvsimd (mode, "argument");
2414 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2416 pcum->aapcs_nextnvrn = nvrn + nregs;
2417 if (!aarch64_composite_type_p (type, mode))
2419 gcc_assert (nregs == 1);
2420 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2422 else
2424 rtx par;
2425 int i;
2426 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2427 for (i = 0; i < nregs; i++)
2429 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2430 V0_REGNUM + nvrn + i);
2431 tmp = gen_rtx_EXPR_LIST
2432 (VOIDmode, tmp,
2433 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2434 XVECEXP (par, 0, i) = tmp;
2436 pcum->aapcs_reg = par;
2438 return;
2440 else
2442 /* C.3 NSRN is set to 8. */
2443 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2444 goto on_stack;
2448 ncrn = pcum->aapcs_ncrn;
2449 nregs = size / UNITS_PER_WORD;
2451 /* C6 - C9. though the sign and zero extension semantics are
2452 handled elsewhere. This is the case where the argument fits
2453 entirely general registers. */
2454 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2457 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2459 /* C.8 if the argument has an alignment of 16 then the NGRN is
2460 rounded up to the next even number. */
2461 if (nregs == 2
2462 && ncrn % 2
2463 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2464 comparison is there because for > 16 * BITS_PER_UNIT
2465 alignment nregs should be > 2 and therefore it should be
2466 passed by reference rather than value. */
2467 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2469 ++ncrn;
2470 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2473 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2474 A reg is still generated for it, but the caller should be smart
2475 enough not to use it. */
2476 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2477 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2478 else
2480 rtx par;
2481 int i;
2483 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2484 for (i = 0; i < nregs; i++)
2486 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2487 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2488 GEN_INT (i * UNITS_PER_WORD));
2489 XVECEXP (par, 0, i) = tmp;
2491 pcum->aapcs_reg = par;
2494 pcum->aapcs_nextncrn = ncrn + nregs;
2495 return;
2498 /* C.11 */
2499 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2501 /* The argument is passed on stack; record the needed number of words for
2502 this argument and align the total size if necessary. */
2503 on_stack:
2504 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2506 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2507 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2508 16 / UNITS_PER_WORD);
2509 return;
2512 /* Implement TARGET_FUNCTION_ARG. */
2514 static rtx
2515 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2516 const_tree type, bool named)
2518 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2519 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2521 if (mode == VOIDmode)
2522 return NULL_RTX;
2524 aarch64_layout_arg (pcum_v, mode, type, named);
2525 return pcum->aapcs_reg;
2528 void
2529 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2530 const_tree fntype ATTRIBUTE_UNUSED,
2531 rtx libname ATTRIBUTE_UNUSED,
2532 const_tree fndecl ATTRIBUTE_UNUSED,
2533 unsigned n_named ATTRIBUTE_UNUSED)
2535 pcum->aapcs_ncrn = 0;
2536 pcum->aapcs_nvrn = 0;
2537 pcum->aapcs_nextncrn = 0;
2538 pcum->aapcs_nextnvrn = 0;
2539 pcum->pcs_variant = ARM_PCS_AAPCS64;
2540 pcum->aapcs_reg = NULL_RTX;
2541 pcum->aapcs_arg_processed = false;
2542 pcum->aapcs_stack_words = 0;
2543 pcum->aapcs_stack_size = 0;
2545 if (!TARGET_FLOAT
2546 && fndecl && TREE_PUBLIC (fndecl)
2547 && fntype && fntype != error_mark_node)
2549 const_tree type = TREE_TYPE (fntype);
2550 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2551 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2552 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2553 &mode, &nregs, NULL))
2554 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2556 return;
2559 static void
2560 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2561 machine_mode mode,
2562 const_tree type,
2563 bool named)
2565 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2566 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2568 aarch64_layout_arg (pcum_v, mode, type, named);
2569 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2570 != (pcum->aapcs_stack_words != 0));
2571 pcum->aapcs_arg_processed = false;
2572 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2573 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2574 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2575 pcum->aapcs_stack_words = 0;
2576 pcum->aapcs_reg = NULL_RTX;
2580 bool
2581 aarch64_function_arg_regno_p (unsigned regno)
2583 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2584 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2587 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2588 PARM_BOUNDARY bits of alignment, but will be given anything up
2589 to STACK_BOUNDARY bits if the type requires it. This makes sure
2590 that both before and after the layout of each argument, the Next
2591 Stacked Argument Address (NSAA) will have a minimum alignment of
2592 8 bytes. */
2594 static unsigned int
2595 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2597 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2598 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2601 /* Implement TARGET_FUNCTION_ARG_PADDING.
2603 Small aggregate types are placed in the lowest memory address.
2605 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2607 static pad_direction
2608 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2610 /* On little-endian targets, the least significant byte of every stack
2611 argument is passed at the lowest byte address of the stack slot. */
2612 if (!BYTES_BIG_ENDIAN)
2613 return PAD_UPWARD;
2615 /* Otherwise, integral, floating-point and pointer types are padded downward:
2616 the least significant byte of a stack argument is passed at the highest
2617 byte address of the stack slot. */
2618 if (type
2619 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2620 || POINTER_TYPE_P (type))
2621 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2622 return PAD_DOWNWARD;
2624 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2625 return PAD_UPWARD;
2628 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2630 It specifies padding for the last (may also be the only)
2631 element of a block move between registers and memory. If
2632 assuming the block is in the memory, padding upward means that
2633 the last element is padded after its highest significant byte,
2634 while in downward padding, the last element is padded at the
2635 its least significant byte side.
2637 Small aggregates and small complex types are always padded
2638 upwards.
2640 We don't need to worry about homogeneous floating-point or
2641 short-vector aggregates; their move is not affected by the
2642 padding direction determined here. Regardless of endianness,
2643 each element of such an aggregate is put in the least
2644 significant bits of a fp/simd register.
2646 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2647 register has useful data, and return the opposite if the most
2648 significant byte does. */
2650 bool
2651 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2652 bool first ATTRIBUTE_UNUSED)
2655 /* Small composite types are always padded upward. */
2656 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2658 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2659 : GET_MODE_SIZE (mode));
2660 if (size < 2 * UNITS_PER_WORD)
2661 return true;
2664 /* Otherwise, use the default padding. */
2665 return !BYTES_BIG_ENDIAN;
2668 static scalar_int_mode
2669 aarch64_libgcc_cmp_return_mode (void)
2671 return SImode;
2674 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2676 /* We use the 12-bit shifted immediate arithmetic instructions so values
2677 must be multiple of (1 << 12), i.e. 4096. */
2678 #define ARITH_FACTOR 4096
2680 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2681 #error Cannot use simple address calculation for stack probing
2682 #endif
2684 /* The pair of scratch registers used for stack probing. */
2685 #define PROBE_STACK_FIRST_REG 9
2686 #define PROBE_STACK_SECOND_REG 10
2688 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2689 inclusive. These are offsets from the current stack pointer. */
2691 static void
2692 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2694 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2696 /* See the same assertion on PROBE_INTERVAL above. */
2697 gcc_assert ((first % ARITH_FACTOR) == 0);
2699 /* See if we have a constant small number of probes to generate. If so,
2700 that's the easy case. */
2701 if (size <= PROBE_INTERVAL)
2703 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2705 emit_set_insn (reg1,
2706 plus_constant (Pmode,
2707 stack_pointer_rtx, -(first + base)));
2708 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2711 /* The run-time loop is made up of 8 insns in the generic case while the
2712 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2713 else if (size <= 4 * PROBE_INTERVAL)
2715 HOST_WIDE_INT i, rem;
2717 emit_set_insn (reg1,
2718 plus_constant (Pmode,
2719 stack_pointer_rtx,
2720 -(first + PROBE_INTERVAL)));
2721 emit_stack_probe (reg1);
2723 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2724 it exceeds SIZE. If only two probes are needed, this will not
2725 generate any code. Then probe at FIRST + SIZE. */
2726 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2728 emit_set_insn (reg1,
2729 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2730 emit_stack_probe (reg1);
2733 rem = size - (i - PROBE_INTERVAL);
2734 if (rem > 256)
2736 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2738 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2739 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2741 else
2742 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2745 /* Otherwise, do the same as above, but in a loop. Note that we must be
2746 extra careful with variables wrapping around because we might be at
2747 the very top (or the very bottom) of the address space and we have
2748 to be able to handle this case properly; in particular, we use an
2749 equality test for the loop condition. */
2750 else
2752 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2754 /* Step 1: round SIZE to the previous multiple of the interval. */
2756 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2759 /* Step 2: compute initial and final value of the loop counter. */
2761 /* TEST_ADDR = SP + FIRST. */
2762 emit_set_insn (reg1,
2763 plus_constant (Pmode, stack_pointer_rtx, -first));
2765 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2766 HOST_WIDE_INT adjustment = - (first + rounded_size);
2767 if (! aarch64_uimm12_shift (adjustment))
2769 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2770 true, Pmode);
2771 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2773 else
2775 emit_set_insn (reg2,
2776 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2779 /* Step 3: the loop
2783 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2784 probe at TEST_ADDR
2786 while (TEST_ADDR != LAST_ADDR)
2788 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2789 until it is equal to ROUNDED_SIZE. */
2791 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2794 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2795 that SIZE is equal to ROUNDED_SIZE. */
2797 if (size != rounded_size)
2799 HOST_WIDE_INT rem = size - rounded_size;
2801 if (rem > 256)
2803 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2805 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2806 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2808 else
2809 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2813 /* Make sure nothing is scheduled before we are done. */
2814 emit_insn (gen_blockage ());
2817 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2818 absolute addresses. */
2820 const char *
2821 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2823 static int labelno = 0;
2824 char loop_lab[32];
2825 rtx xops[2];
2827 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2829 /* Loop. */
2830 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2832 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2833 xops[0] = reg1;
2834 xops[1] = GEN_INT (PROBE_INTERVAL);
2835 output_asm_insn ("sub\t%0, %0, %1", xops);
2837 /* Probe at TEST_ADDR. */
2838 output_asm_insn ("str\txzr, [%0]", xops);
2840 /* Test if TEST_ADDR == LAST_ADDR. */
2841 xops[1] = reg2;
2842 output_asm_insn ("cmp\t%0, %1", xops);
2844 /* Branch. */
2845 fputs ("\tb.ne\t", asm_out_file);
2846 assemble_name_raw (asm_out_file, loop_lab);
2847 fputc ('\n', asm_out_file);
2849 return "";
2852 static bool
2853 aarch64_frame_pointer_required (void)
2855 /* In aarch64_override_options_after_change
2856 flag_omit_leaf_frame_pointer turns off the frame pointer by
2857 default. Turn it back on now if we've not got a leaf
2858 function. */
2859 if (flag_omit_leaf_frame_pointer
2860 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2861 return true;
2863 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2864 if (crtl->calls_eh_return)
2865 return true;
2867 return false;
2870 /* Mark the registers that need to be saved by the callee and calculate
2871 the size of the callee-saved registers area and frame record (both FP
2872 and LR may be omitted). */
2873 static void
2874 aarch64_layout_frame (void)
2876 HOST_WIDE_INT offset = 0;
2877 int regno, last_fp_reg = INVALID_REGNUM;
2879 if (reload_completed && cfun->machine->frame.laid_out)
2880 return;
2882 #define SLOT_NOT_REQUIRED (-2)
2883 #define SLOT_REQUIRED (-1)
2885 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2886 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2888 /* First mark all the registers that really need to be saved... */
2889 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2890 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2892 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2893 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2895 /* ... that includes the eh data registers (if needed)... */
2896 if (crtl->calls_eh_return)
2897 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2898 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2899 = SLOT_REQUIRED;
2901 /* ... and any callee saved register that dataflow says is live. */
2902 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2903 if (df_regs_ever_live_p (regno)
2904 && (regno == R30_REGNUM
2905 || !call_used_regs[regno]))
2906 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2908 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2909 if (df_regs_ever_live_p (regno)
2910 && !call_used_regs[regno])
2912 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2913 last_fp_reg = regno;
2916 if (frame_pointer_needed)
2918 /* FP and LR are placed in the linkage record. */
2919 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2920 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2921 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2922 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2923 offset += 2 * UNITS_PER_WORD;
2926 /* Now assign stack slots for them. */
2927 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2928 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2930 cfun->machine->frame.reg_offset[regno] = offset;
2931 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2932 cfun->machine->frame.wb_candidate1 = regno;
2933 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2934 cfun->machine->frame.wb_candidate2 = regno;
2935 offset += UNITS_PER_WORD;
2938 HOST_WIDE_INT max_int_offset = offset;
2939 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2940 bool has_align_gap = offset != max_int_offset;
2942 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2943 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2945 /* If there is an alignment gap between integer and fp callee-saves,
2946 allocate the last fp register to it if possible. */
2947 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2949 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2950 break;
2953 cfun->machine->frame.reg_offset[regno] = offset;
2954 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2955 cfun->machine->frame.wb_candidate1 = regno;
2956 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2957 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2958 cfun->machine->frame.wb_candidate2 = regno;
2959 offset += UNITS_PER_WORD;
2962 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2964 cfun->machine->frame.saved_regs_size = offset;
2966 HOST_WIDE_INT varargs_and_saved_regs_size
2967 = offset + cfun->machine->frame.saved_varargs_size;
2969 cfun->machine->frame.hard_fp_offset
2970 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2971 STACK_BOUNDARY / BITS_PER_UNIT);
2973 cfun->machine->frame.frame_size
2974 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2975 + crtl->outgoing_args_size,
2976 STACK_BOUNDARY / BITS_PER_UNIT);
2978 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2980 cfun->machine->frame.initial_adjust = 0;
2981 cfun->machine->frame.final_adjust = 0;
2982 cfun->machine->frame.callee_adjust = 0;
2983 cfun->machine->frame.callee_offset = 0;
2985 HOST_WIDE_INT max_push_offset = 0;
2986 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2987 max_push_offset = 512;
2988 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2989 max_push_offset = 256;
2991 if (cfun->machine->frame.frame_size < max_push_offset
2992 && crtl->outgoing_args_size == 0)
2994 /* Simple, small frame with no outgoing arguments:
2995 stp reg1, reg2, [sp, -frame_size]!
2996 stp reg3, reg4, [sp, 16] */
2997 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2999 else if ((crtl->outgoing_args_size
3000 + cfun->machine->frame.saved_regs_size < 512)
3001 && !(cfun->calls_alloca
3002 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3004 /* Frame with small outgoing arguments:
3005 sub sp, sp, frame_size
3006 stp reg1, reg2, [sp, outgoing_args_size]
3007 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3008 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3009 cfun->machine->frame.callee_offset
3010 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3012 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3014 /* Frame with large outgoing arguments but a small local area:
3015 stp reg1, reg2, [sp, -hard_fp_offset]!
3016 stp reg3, reg4, [sp, 16]
3017 sub sp, sp, outgoing_args_size */
3018 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3019 cfun->machine->frame.final_adjust
3020 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3022 else if (!frame_pointer_needed
3023 && varargs_and_saved_regs_size < max_push_offset)
3025 /* Frame with large local area and outgoing arguments (this pushes the
3026 callee-saves first, followed by the locals and outgoing area):
3027 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3028 stp reg3, reg4, [sp, 16]
3029 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3030 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3031 cfun->machine->frame.final_adjust
3032 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3033 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3034 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3036 else
3038 /* Frame with large local area and outgoing arguments using frame pointer:
3039 sub sp, sp, hard_fp_offset
3040 stp x29, x30, [sp, 0]
3041 add x29, sp, 0
3042 stp reg3, reg4, [sp, 16]
3043 sub sp, sp, outgoing_args_size */
3044 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3045 cfun->machine->frame.final_adjust
3046 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3049 cfun->machine->frame.laid_out = true;
3052 /* Return true if the register REGNO is saved on entry to
3053 the current function. */
3055 static bool
3056 aarch64_register_saved_on_entry (int regno)
3058 return cfun->machine->frame.reg_offset[regno] >= 0;
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  Returns LIMIT + 1 if no further register needs saving.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
3072 /* Push the register number REGNO of mode MODE to the stack with write-back
3073 adjusting the stack by ADJUSTMENT. */
3075 static void
3076 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3077 HOST_WIDE_INT adjustment)
3079 rtx base_rtx = stack_pointer_rtx;
3080 rtx insn, reg, mem;
3082 reg = gen_rtx_REG (mode, regno);
3083 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3084 plus_constant (Pmode, base_rtx, -adjustment));
3085 mem = gen_frame_mem (mode, mem);
3087 insn = emit_move_insn (mem, reg);
3088 RTX_FRAME_RELATED_P (insn) = 1;
3091 /* Generate and return an instruction to store the pair of registers
3092 REG and REG2 of mode MODE to location BASE with write-back adjusting
3093 the stack location BASE by ADJUSTMENT. */
3095 static rtx
3096 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3097 HOST_WIDE_INT adjustment)
3099 switch (mode)
3101 case E_DImode:
3102 return gen_storewb_pairdi_di (base, base, reg, reg2,
3103 GEN_INT (-adjustment),
3104 GEN_INT (UNITS_PER_WORD - adjustment));
3105 case E_DFmode:
3106 return gen_storewb_pairdf_di (base, base, reg, reg2,
3107 GEN_INT (-adjustment),
3108 GEN_INT (UNITS_PER_WORD - adjustment));
3109 default:
3110 gcc_unreachable ();
3114 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3115 stack pointer by ADJUSTMENT. */
3117 static void
3118 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3120 rtx_insn *insn;
3121 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3123 if (regno2 == INVALID_REGNUM)
3124 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3126 rtx reg1 = gen_rtx_REG (mode, regno1);
3127 rtx reg2 = gen_rtx_REG (mode, regno2);
3129 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3130 reg2, adjustment));
3131 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3132 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3133 RTX_FRAME_RELATED_P (insn) = 1;
3136 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3137 adjusting it by ADJUSTMENT afterwards. */
3139 static rtx
3140 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3141 HOST_WIDE_INT adjustment)
3143 switch (mode)
3145 case E_DImode:
3146 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3147 GEN_INT (UNITS_PER_WORD));
3148 case E_DFmode:
3149 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3150 GEN_INT (UNITS_PER_WORD));
3151 default:
3152 gcc_unreachable ();
3156 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3157 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3158 into CFI_OPS. */
3160 static void
3161 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3162 rtx *cfi_ops)
3164 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3165 rtx reg1 = gen_rtx_REG (mode, regno1);
3167 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3169 if (regno2 == INVALID_REGNUM)
3171 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3172 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3173 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3175 else
3177 rtx reg2 = gen_rtx_REG (mode, regno2);
3178 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3179 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3180 reg2, adjustment));
3184 /* Generate and return a store pair instruction of mode MODE to store
3185 register REG1 to MEM1 and register REG2 to MEM2. */
3187 static rtx
3188 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3189 rtx reg2)
3191 switch (mode)
3193 case E_DImode:
3194 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3196 case E_DFmode:
3197 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3199 default:
3200 gcc_unreachable ();
3204 /* Generate and regurn a load pair isntruction of mode MODE to load register
3205 REG1 from MEM1 and register REG2 from MEM2. */
3207 static rtx
3208 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3209 rtx mem2)
3211 switch (mode)
3213 case E_DImode:
3214 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3216 case E_DFmode:
3217 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3219 default:
3220 gcc_unreachable ();
3224 /* Return TRUE if return address signing should be enabled for the current
3225 function, otherwise return FALSE. */
3227 bool
3228 aarch64_return_address_signing_enabled (void)
3230 /* This function should only be called after frame laid out. */
3231 gcc_assert (cfun->machine->frame.laid_out);
3233 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3234 if it's LR is pushed onto stack. */
3235 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3236 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3237 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3240 /* Emit code to save the callee-saved registers from register number START
3241 to LIMIT to the stack at the location starting at offset START_OFFSET,
3242 skipping any write-back candidates if SKIP_WB is true. */
3244 static void
3245 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3246 unsigned start, unsigned limit, bool skip_wb)
3248 rtx_insn *insn;
3249 unsigned regno;
3250 unsigned regno2;
3252 for (regno = aarch64_next_callee_save (start, limit);
3253 regno <= limit;
3254 regno = aarch64_next_callee_save (regno + 1, limit))
3256 rtx reg, mem;
3257 HOST_WIDE_INT offset;
3259 if (skip_wb
3260 && (regno == cfun->machine->frame.wb_candidate1
3261 || regno == cfun->machine->frame.wb_candidate2))
3262 continue;
3264 if (cfun->machine->reg_is_wrapped_separately[regno])
3265 continue;
3267 reg = gen_rtx_REG (mode, regno);
3268 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3269 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3270 offset));
3272 regno2 = aarch64_next_callee_save (regno + 1, limit);
3274 if (regno2 <= limit
3275 && !cfun->machine->reg_is_wrapped_separately[regno2]
3276 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3277 == cfun->machine->frame.reg_offset[regno2]))
3280 rtx reg2 = gen_rtx_REG (mode, regno2);
3281 rtx mem2;
3283 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3284 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3285 offset));
3286 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3287 reg2));
3289 /* The first part of a frame-related parallel insn is
3290 always assumed to be relevant to the frame
3291 calculations; subsequent parts, are only
3292 frame-related if explicitly marked. */
3293 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3294 regno = regno2;
3296 else
3297 insn = emit_move_insn (mem, reg);
3299 RTX_FRAME_RELATED_P (insn) = 1;
3303 /* Emit code to restore the callee registers of mode MODE from register
3304 number START up to and including LIMIT. Restore from the stack offset
3305 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3306 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3308 static void
3309 aarch64_restore_callee_saves (machine_mode mode,
3310 HOST_WIDE_INT start_offset, unsigned start,
3311 unsigned limit, bool skip_wb, rtx *cfi_ops)
3313 rtx base_rtx = stack_pointer_rtx;
3314 unsigned regno;
3315 unsigned regno2;
3316 HOST_WIDE_INT offset;
3318 for (regno = aarch64_next_callee_save (start, limit);
3319 regno <= limit;
3320 regno = aarch64_next_callee_save (regno + 1, limit))
3322 if (cfun->machine->reg_is_wrapped_separately[regno])
3323 continue;
3325 rtx reg, mem;
3327 if (skip_wb
3328 && (regno == cfun->machine->frame.wb_candidate1
3329 || regno == cfun->machine->frame.wb_candidate2))
3330 continue;
3332 reg = gen_rtx_REG (mode, regno);
3333 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3334 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3336 regno2 = aarch64_next_callee_save (regno + 1, limit);
3338 if (regno2 <= limit
3339 && !cfun->machine->reg_is_wrapped_separately[regno2]
3340 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3341 == cfun->machine->frame.reg_offset[regno2]))
3343 rtx reg2 = gen_rtx_REG (mode, regno2);
3344 rtx mem2;
3346 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3347 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3348 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3350 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3351 regno = regno2;
3353 else
3354 emit_move_insn (reg, mem);
3355 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3359 static inline bool
3360 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3361 HOST_WIDE_INT offset)
3363 return offset >= -256 && offset < 256;
3366 static inline bool
3367 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3369 return (offset >= 0
3370 && offset < 4096 * GET_MODE_SIZE (mode)
3371 && offset % GET_MODE_SIZE (mode) == 0);
3374 bool
3375 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3377 return (offset >= -64 * GET_MODE_SIZE (mode)
3378 && offset < 64 * GET_MODE_SIZE (mode)
3379 && offset % GET_MODE_SIZE (mode) == 0);
3382 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3384 static sbitmap
3385 aarch64_get_separate_components (void)
3387 aarch64_layout_frame ();
3389 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3390 bitmap_clear (components);
3392 /* The registers we need saved to the frame. */
3393 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3394 if (aarch64_register_saved_on_entry (regno))
3396 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3397 if (!frame_pointer_needed)
3398 offset += cfun->machine->frame.frame_size
3399 - cfun->machine->frame.hard_fp_offset;
3400 /* Check that we can access the stack slot of the register with one
3401 direct load with no adjustments needed. */
3402 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3403 bitmap_set_bit (components, regno);
3406 /* Don't mess with the hard frame pointer. */
3407 if (frame_pointer_needed)
3408 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3410 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3411 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3412 /* If aarch64_layout_frame has chosen registers to store/restore with
3413 writeback don't interfere with them to avoid having to output explicit
3414 stack adjustment instructions. */
3415 if (reg2 != INVALID_REGNUM)
3416 bitmap_clear_bit (components, reg2);
3417 if (reg1 != INVALID_REGNUM)
3418 bitmap_clear_bit (components, reg1);
3420 bitmap_clear_bit (components, LR_REGNUM);
3421 bitmap_clear_bit (components, SP_REGNUM);
3423 return components;
3426 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3428 static sbitmap
3429 aarch64_components_for_bb (basic_block bb)
3431 bitmap in = DF_LIVE_IN (bb);
3432 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3433 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3435 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3436 bitmap_clear (components);
3438 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3439 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3440 if ((!call_used_regs[regno])
3441 && (bitmap_bit_p (in, regno)
3442 || bitmap_bit_p (gen, regno)
3443 || bitmap_bit_p (kill, regno)))
3444 bitmap_set_bit (components, regno);
3446 return components;
3449 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3450 Nothing to do for aarch64. */
3452 static void
3453 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3457 /* Return the next set bit in BMP from START onwards. Return the total number
3458 of bits in BMP if no set bit is found at or after START. */
3460 static unsigned int
3461 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3463 unsigned int nbits = SBITMAP_SIZE (bmp);
3464 if (start == nbits)
3465 return start;
3467 gcc_assert (start < nbits);
3468 for (unsigned int i = start; i < nbits; i++)
3469 if (bitmap_bit_p (bmp, i))
3470 return i;
3472 return nbits;
3475 /* Do the work for aarch64_emit_prologue_components and
3476 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3477 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3478 for these components or the epilogue sequence. That is, it determines
3479 whether we should emit stores or loads and what kind of CFA notes to attach
3480 to the insns. Otherwise the logic for the two sequences is very
3481 similar. */
3483 static void
3484 aarch64_process_components (sbitmap components, bool prologue_p)
3486 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3487 ? HARD_FRAME_POINTER_REGNUM
3488 : STACK_POINTER_REGNUM);
3490 unsigned last_regno = SBITMAP_SIZE (components);
3491 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3492 rtx_insn *insn = NULL;
3494 while (regno != last_regno)
3496 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3497 so DFmode for the vector registers is enough. */
3498 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3499 rtx reg = gen_rtx_REG (mode, regno);
3500 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3501 if (!frame_pointer_needed)
3502 offset += cfun->machine->frame.frame_size
3503 - cfun->machine->frame.hard_fp_offset;
3504 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3505 rtx mem = gen_frame_mem (mode, addr);
3507 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3508 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3509 /* No more registers to handle after REGNO.
3510 Emit a single save/restore and exit. */
3511 if (regno2 == last_regno)
3513 insn = emit_insn (set);
3514 RTX_FRAME_RELATED_P (insn) = 1;
3515 if (prologue_p)
3516 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3517 else
3518 add_reg_note (insn, REG_CFA_RESTORE, reg);
3519 break;
3522 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3523 /* The next register is not of the same class or its offset is not
3524 mergeable with the current one into a pair. */
3525 if (!satisfies_constraint_Ump (mem)
3526 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3527 || (offset2 - cfun->machine->frame.reg_offset[regno])
3528 != GET_MODE_SIZE (mode))
3530 insn = emit_insn (set);
3531 RTX_FRAME_RELATED_P (insn) = 1;
3532 if (prologue_p)
3533 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3534 else
3535 add_reg_note (insn, REG_CFA_RESTORE, reg);
3537 regno = regno2;
3538 continue;
3541 /* REGNO2 can be saved/restored in a pair with REGNO. */
3542 rtx reg2 = gen_rtx_REG (mode, regno2);
3543 if (!frame_pointer_needed)
3544 offset2 += cfun->machine->frame.frame_size
3545 - cfun->machine->frame.hard_fp_offset;
3546 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3547 rtx mem2 = gen_frame_mem (mode, addr2);
3548 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3549 : gen_rtx_SET (reg2, mem2);
3551 if (prologue_p)
3552 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3553 else
3554 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3556 RTX_FRAME_RELATED_P (insn) = 1;
3557 if (prologue_p)
3559 add_reg_note (insn, REG_CFA_OFFSET, set);
3560 add_reg_note (insn, REG_CFA_OFFSET, set2);
3562 else
3564 add_reg_note (insn, REG_CFA_RESTORE, reg);
3565 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3568 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3572 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3574 static void
3575 aarch64_emit_prologue_components (sbitmap components)
3577 aarch64_process_components (components, true);
3580 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3582 static void
3583 aarch64_emit_epilogue_components (sbitmap components)
3585 aarch64_process_components (components, false);
3588 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3590 static void
3591 aarch64_set_handled_components (sbitmap components)
3593 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3594 if (bitmap_bit_p (components, regno))
3595 cfun->machine->reg_is_wrapped_separately[regno] = true;
3598 /* AArch64 stack frames generated by this compiler look like:
3600 +-------------------------------+
3602 | incoming stack arguments |
3604 +-------------------------------+
3605 | | <-- incoming stack pointer (aligned)
3606 | callee-allocated save area |
3607 | for register varargs |
3609 +-------------------------------+
3610 | local variables | <-- frame_pointer_rtx
3612 +-------------------------------+
3613 | padding0 | \
3614 +-------------------------------+ |
3615 | callee-saved registers | | frame.saved_regs_size
3616 +-------------------------------+ |
3617 | LR' | |
3618 +-------------------------------+ |
3619 | FP' | / <- hard_frame_pointer_rtx (aligned)
3620 +-------------------------------+
3621 | dynamic allocation |
3622 +-------------------------------+
3623 | padding |
3624 +-------------------------------+
3625 | outgoing stack arguments | <-- arg_pointer
3627 +-------------------------------+
3628 | | <-- stack_pointer_rtx (aligned)
3630 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3631 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3632 unchanged. */
3634 /* Generate the prologue instructions for entry into a function.
3635 Establish the stack frame by decreasing the stack pointer with a
3636 properly calculated size and, if necessary, create a frame record
3637 filled with the values of LR and previous frame pointer. The
3638 current FP is also set up if it is in use. */
3640 void
3641 aarch64_expand_prologue (void)
3643 aarch64_layout_frame ();
3645 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3646 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3647 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3648 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3649 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3650 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3651 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3652 rtx_insn *insn;
3654 /* Sign return address for functions. */
3655 if (aarch64_return_address_signing_enabled ())
3657 insn = emit_insn (gen_pacisp ());
3658 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3659 RTX_FRAME_RELATED_P (insn) = 1;
3662 if (flag_stack_usage_info)
3663 current_function_static_stack_size = frame_size;
3665 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3667 if (crtl->is_leaf && !cfun->calls_alloca)
3669 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3670 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3671 frame_size - STACK_CHECK_PROTECT);
3673 else if (frame_size > 0)
3674 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3677 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3679 if (callee_adjust != 0)
3680 aarch64_push_regs (reg1, reg2, callee_adjust);
3682 if (frame_pointer_needed)
3684 if (callee_adjust == 0)
3685 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3686 R30_REGNUM, false);
3687 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3688 stack_pointer_rtx,
3689 GEN_INT (callee_offset)));
3690 RTX_FRAME_RELATED_P (insn) = 1;
3691 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3694 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3695 callee_adjust != 0 || frame_pointer_needed);
3696 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3697 callee_adjust != 0 || frame_pointer_needed);
3698 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3701 /* Return TRUE if we can use a simple_return insn.
3703 This function checks whether the callee saved stack is empty, which
3704 means no restore actions are need. The pro_and_epilogue will use
3705 this to check whether shrink-wrapping opt is feasible. */
3707 bool
3708 aarch64_use_return_insn_p (void)
3710 if (!reload_completed)
3711 return false;
3713 if (crtl->profile)
3714 return false;
3716 aarch64_layout_frame ();
3718 return cfun->machine->frame.frame_size == 0;
3721 /* Generate the epilogue instructions for returning from a function.
3722 This is almost exactly the reverse of the prolog sequence, except
3723 that we need to insert barriers to avoid scheduling loads that read
3724 from a deallocated stack, and we optimize the unwind records by
3725 emitting them all together if possible. */
3726 void
3727 aarch64_expand_epilogue (bool for_sibcall)
3729 aarch64_layout_frame ();
3731 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3732 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3733 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3734 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3735 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3736 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3737 rtx cfi_ops = NULL;
3738 rtx_insn *insn;
3740 /* We need to add memory barrier to prevent read from deallocated stack. */
3741 bool need_barrier_p = (get_frame_size ()
3742 + cfun->machine->frame.saved_varargs_size) != 0;
3744 /* Emit a barrier to prevent loads from a deallocated stack. */
3745 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3746 || crtl->calls_eh_return)
3748 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3749 need_barrier_p = false;
3752 /* Restore the stack pointer from the frame pointer if it may not
3753 be the same as the stack pointer. */
3754 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3756 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3757 hard_frame_pointer_rtx,
3758 GEN_INT (-callee_offset)));
3759 /* If writeback is used when restoring callee-saves, the CFA
3760 is restored on the instruction doing the writeback. */
3761 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3763 else
3764 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3766 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3767 callee_adjust != 0, &cfi_ops);
3768 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3769 callee_adjust != 0, &cfi_ops);
3771 if (need_barrier_p)
3772 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3774 if (callee_adjust != 0)
3775 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3777 if (callee_adjust != 0 || initial_adjust > 65536)
3779 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3780 insn = get_last_insn ();
3781 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3782 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3783 RTX_FRAME_RELATED_P (insn) = 1;
3784 cfi_ops = NULL;
3787 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3789 if (cfi_ops)
3791 /* Emit delayed restores and reset the CFA to be SP. */
3792 insn = get_last_insn ();
3793 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3794 REG_NOTES (insn) = cfi_ops;
3795 RTX_FRAME_RELATED_P (insn) = 1;
3798 /* We prefer to emit the combined return/authenticate instruction RETAA,
3799 however there are three cases in which we must instead emit an explicit
3800 authentication instruction.
3802 1) Sibcalls don't return in a normal way, so if we're about to call one
3803 we must authenticate.
3805 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3806 generating code for !TARGET_ARMV8_3 we can't use it and must
3807 explicitly authenticate.
3809 3) On an eh_return path we make extra stack adjustments to update the
3810 canonical frame address to be the exception handler's CFA. We want
3811 to authenticate using the CFA of the function which calls eh_return.
3813 if (aarch64_return_address_signing_enabled ()
3814 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3816 insn = emit_insn (gen_autisp ());
3817 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3818 RTX_FRAME_RELATED_P (insn) = 1;
3821 /* Stack adjustment for exception handler. */
3822 if (crtl->calls_eh_return)
3824 /* We need to unwind the stack by the offset computed by
3825 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3826 to be SP; letting the CFA move during this adjustment
3827 is just as correct as retaining the CFA from the body
3828 of the function. Therefore, do nothing special. */
3829 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3832 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3833 if (!for_sibcall)
3834 emit_jump_insn (ret_rtx);
3837 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3838 normally or return to a previous frame after unwinding.
3840 An EH return uses a single shared return sequence. The epilogue is
3841 exactly like a normal epilogue except that it has an extra input
3842 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3843 that must be applied after the frame has been destroyed. An extra label
3844 is inserted before the epilogue which initializes this register to zero,
3845 and this is the entry point for a normal return.
3847 An actual EH return updates the return address, initializes the stack
3848 adjustment and jumps directly into the epilogue (bypassing the zeroing
3849 of the adjustment). Since the return address is typically saved on the
3850 stack when a function makes a call, the saved LR must be updated outside
3851 the epilogue.
3853 This poses problems as the store is generated well before the epilogue,
3854 so the offset of LR is not known yet. Also optimizations will remove the
3855 store as it appears dead, even after the epilogue is generated (as the
3856 base or offset for loading LR is different in many cases).
3858 To avoid these problems this implementation forces the frame pointer
3859 in eh_return functions so that the location of LR is fixed and known early.
3860 It also marks the store volatile, so no optimization is permitted to
3861 remove the store. */
3863 aarch64_eh_return_handler_rtx (void)
3865 rtx tmp = gen_frame_mem (Pmode,
3866 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3868 /* Mark the store volatile, so no optimization is permitted to remove it. */
3869 MEM_VOLATILE_P (tmp) = true;
3870 return tmp;
3873 /* Output code to add DELTA to the first argument, and then jump
3874 to FUNCTION. Used for C++ multiple inheritance. */
3875 static void
3876 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3877 HOST_WIDE_INT delta,
3878 HOST_WIDE_INT vcall_offset,
3879 tree function)
3881 /* The this pointer is always in x0. Note that this differs from
3882 Arm where the this pointer maybe bumped to r1 if r0 is required
3883 to return a pointer to an aggregate. On AArch64 a result value
3884 pointer will be in x8. */
3885 int this_regno = R0_REGNUM;
3886 rtx this_rtx, temp0, temp1, addr, funexp;
3887 rtx_insn *insn;
3889 reload_completed = 1;
3890 emit_note (NOTE_INSN_PROLOGUE_END);
3892 if (vcall_offset == 0)
3893 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3894 else
3896 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3898 this_rtx = gen_rtx_REG (Pmode, this_regno);
3899 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3900 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3902 addr = this_rtx;
3903 if (delta != 0)
3905 if (delta >= -256 && delta < 256)
3906 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3907 plus_constant (Pmode, this_rtx, delta));
3908 else
3909 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3912 if (Pmode == ptr_mode)
3913 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3914 else
3915 aarch64_emit_move (temp0,
3916 gen_rtx_ZERO_EXTEND (Pmode,
3917 gen_rtx_MEM (ptr_mode, addr)));
3919 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3920 addr = plus_constant (Pmode, temp0, vcall_offset);
3921 else
3923 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3924 Pmode);
3925 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3928 if (Pmode == ptr_mode)
3929 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3930 else
3931 aarch64_emit_move (temp1,
3932 gen_rtx_SIGN_EXTEND (Pmode,
3933 gen_rtx_MEM (ptr_mode, addr)));
3935 emit_insn (gen_add2_insn (this_rtx, temp1));
3938 /* Generate a tail call to the target function. */
3939 if (!TREE_USED (function))
3941 assemble_external (function);
3942 TREE_USED (function) = 1;
3944 funexp = XEXP (DECL_RTL (function), 0);
3945 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3946 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3947 SIBLING_CALL_P (insn) = 1;
3949 insn = get_insns ();
3950 shorten_branches (insn);
3951 final_start_function (insn, file, 1);
3952 final (insn, file, 1);
3953 final_end_function ();
3955 /* Stop pretending to be a post-reload pass. */
3956 reload_completed = 0;
3959 static bool
3960 aarch64_tls_referenced_p (rtx x)
3962 if (!TARGET_HAVE_TLS)
3963 return false;
3964 subrtx_iterator::array_type array;
3965 FOR_EACH_SUBRTX (iter, array, x, ALL)
3967 const_rtx x = *iter;
3968 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3969 return true;
3970 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3971 TLS offsets, not real symbol references. */
3972 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3973 iter.skip_subrtxes ();
3975 return false;
3979 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3980 a left shift of 0 or 12 bits. */
3981 bool
3982 aarch64_uimm12_shift (HOST_WIDE_INT val)
3984 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3985 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3990 /* Return true if val is an immediate that can be loaded into a
3991 register by a MOVZ instruction. */
3992 static bool
3993 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
3995 if (GET_MODE_SIZE (mode) > 4)
3997 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3998 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3999 return 1;
4001 else
4003 /* Ignore sign extension. */
4004 val &= (HOST_WIDE_INT) 0xffffffff;
4006 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4007 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4010 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4012 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4014 0x0000000100000001ull,
4015 0x0001000100010001ull,
4016 0x0101010101010101ull,
4017 0x1111111111111111ull,
4018 0x5555555555555555ull,
4022 /* Return true if val is a valid bitmask immediate. */
4024 bool
4025 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4027 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4028 int bits;
4030 /* Check for a single sequence of one bits and return quickly if so.
4031 The special cases of all ones and all zeroes returns false. */
4032 val = (unsigned HOST_WIDE_INT) val_in;
4033 tmp = val + (val & -val);
4035 if (tmp == (tmp & -tmp))
4036 return (val + 1) > 1;
4038 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4039 if (mode == SImode)
4040 val = (val << 32) | (val & 0xffffffff);
4042 /* Invert if the immediate doesn't start with a zero bit - this means we
4043 only need to search for sequences of one bits. */
4044 if (val & 1)
4045 val = ~val;
4047 /* Find the first set bit and set tmp to val with the first sequence of one
4048 bits removed. Return success if there is a single sequence of ones. */
4049 first_one = val & -val;
4050 tmp = val & (val + first_one);
4052 if (tmp == 0)
4053 return true;
4055 /* Find the next set bit and compute the difference in bit position. */
4056 next_one = tmp & -tmp;
4057 bits = clz_hwi (first_one) - clz_hwi (next_one);
4058 mask = val ^ tmp;
4060 /* Check the bit position difference is a power of 2, and that the first
4061 sequence of one bits fits within 'bits' bits. */
4062 if ((mask >> bits) != 0 || bits != (bits & -bits))
4063 return false;
4065 /* Check the sequence of one bits is repeated 64/bits times. */
4066 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4069 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4070 Assumed precondition: VAL_IN Is not zero. */
4072 unsigned HOST_WIDE_INT
4073 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4075 int lowest_bit_set = ctz_hwi (val_in);
4076 int highest_bit_set = floor_log2 (val_in);
4077 gcc_assert (val_in != 0);
4079 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4080 (HOST_WIDE_INT_1U << lowest_bit_set));
4083 /* Create constant where bits outside of lowest bit set to highest bit set
4084 are set to 1. */
4086 unsigned HOST_WIDE_INT
4087 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4089 return val_in | ~aarch64_and_split_imm1 (val_in);
4092 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4094 bool
4095 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4097 scalar_int_mode int_mode;
4098 if (!is_a <scalar_int_mode> (mode, &int_mode))
4099 return false;
4101 if (aarch64_bitmask_imm (val_in, int_mode))
4102 return false;
4104 if (aarch64_move_imm (val_in, int_mode))
4105 return false;
4107 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4109 return aarch64_bitmask_imm (imm2, int_mode);
4112 /* Return true if val is an immediate that can be loaded into a
4113 register in a single instruction. */
4114 bool
4115 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4117 scalar_int_mode int_mode;
4118 if (!is_a <scalar_int_mode> (mode, &int_mode))
4119 return false;
4121 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4122 return 1;
4123 return aarch64_bitmask_imm (val, int_mode);
4126 static bool
4127 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4129 rtx base, offset;
4131 if (GET_CODE (x) == HIGH)
4132 return true;
4134 split_const (x, &base, &offset);
4135 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4137 if (aarch64_classify_symbol (base, offset)
4138 != SYMBOL_FORCE_TO_MEM)
4139 return true;
4140 else
4141 /* Avoid generating a 64-bit relocation in ILP32; leave
4142 to aarch64_expand_mov_immediate to handle it properly. */
4143 return mode != ptr_mode;
4146 return aarch64_tls_referenced_p (x);
4149 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4150 The expansion for a table switch is quite expensive due to the number
4151 of instructions, the table lookup and hard to predict indirect jump.
4152 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4153 set, otherwise use tables for > 16 cases as a tradeoff between size and
4154 performance. When optimizing for size, use the default setting. */
4156 static unsigned int
4157 aarch64_case_values_threshold (void)
4159 /* Use the specified limit for the number of cases before using jump
4160 tables at higher optimization levels. */
4161 if (optimize > 2
4162 && selected_cpu->tune->max_case_values != 0)
4163 return selected_cpu->tune->max_case_values;
4164 else
4165 return optimize_size ? default_case_values_threshold () : 17;
4168 /* Return true if register REGNO is a valid index register.
4169 STRICT_P is true if REG_OK_STRICT is in effect. */
4171 bool
4172 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4174 if (!HARD_REGISTER_NUM_P (regno))
4176 if (!strict_p)
4177 return true;
4179 if (!reg_renumber)
4180 return false;
4182 regno = reg_renumber[regno];
4184 return GP_REGNUM_P (regno);
4187 /* Return true if register REGNO is a valid base register for mode MODE.
4188 STRICT_P is true if REG_OK_STRICT is in effect. */
4190 bool
4191 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4193 if (!HARD_REGISTER_NUM_P (regno))
4195 if (!strict_p)
4196 return true;
4198 if (!reg_renumber)
4199 return false;
4201 regno = reg_renumber[regno];
4204 /* The fake registers will be eliminated to either the stack or
4205 hard frame pointer, both of which are usually valid base registers.
4206 Reload deals with the cases where the eliminated form isn't valid. */
4207 return (GP_REGNUM_P (regno)
4208 || regno == SP_REGNUM
4209 || regno == FRAME_POINTER_REGNUM
4210 || regno == ARG_POINTER_REGNUM);
4213 /* Return true if X is a valid base register for mode MODE.
4214 STRICT_P is true if REG_OK_STRICT is in effect. */
4216 static bool
4217 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4219 if (!strict_p
4220 && GET_CODE (x) == SUBREG
4221 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4222 x = SUBREG_REG (x);
4224 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4227 /* Return true if address offset is a valid index. If it is, fill in INFO
4228 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4230 static bool
4231 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4232 machine_mode mode, bool strict_p)
4234 enum aarch64_address_type type;
4235 rtx index;
4236 int shift;
4238 /* (reg:P) */
4239 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4240 && GET_MODE (x) == Pmode)
4242 type = ADDRESS_REG_REG;
4243 index = x;
4244 shift = 0;
4246 /* (sign_extend:DI (reg:SI)) */
4247 else if ((GET_CODE (x) == SIGN_EXTEND
4248 || GET_CODE (x) == ZERO_EXTEND)
4249 && GET_MODE (x) == DImode
4250 && GET_MODE (XEXP (x, 0)) == SImode)
4252 type = (GET_CODE (x) == SIGN_EXTEND)
4253 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4254 index = XEXP (x, 0);
4255 shift = 0;
4257 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4258 else if (GET_CODE (x) == MULT
4259 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4260 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4261 && GET_MODE (XEXP (x, 0)) == DImode
4262 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4263 && CONST_INT_P (XEXP (x, 1)))
4265 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4266 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267 index = XEXP (XEXP (x, 0), 0);
4268 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4270 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4271 else if (GET_CODE (x) == ASHIFT
4272 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274 && GET_MODE (XEXP (x, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x, 1)))
4278 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280 index = XEXP (XEXP (x, 0), 0);
4281 shift = INTVAL (XEXP (x, 1));
4283 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4284 else if ((GET_CODE (x) == SIGN_EXTRACT
4285 || GET_CODE (x) == ZERO_EXTRACT)
4286 && GET_MODE (x) == DImode
4287 && GET_CODE (XEXP (x, 0)) == MULT
4288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4289 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4291 type = (GET_CODE (x) == SIGN_EXTRACT)
4292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293 index = XEXP (XEXP (x, 0), 0);
4294 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4295 if (INTVAL (XEXP (x, 1)) != 32 + shift
4296 || INTVAL (XEXP (x, 2)) != 0)
4297 shift = -1;
4299 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4300 (const_int 0xffffffff<<shift)) */
4301 else if (GET_CODE (x) == AND
4302 && GET_MODE (x) == DImode
4303 && GET_CODE (XEXP (x, 0)) == MULT
4304 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4305 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4306 && CONST_INT_P (XEXP (x, 1)))
4308 type = ADDRESS_REG_UXTW;
4309 index = XEXP (XEXP (x, 0), 0);
4310 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4311 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4312 shift = -1;
4314 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4315 else if ((GET_CODE (x) == SIGN_EXTRACT
4316 || GET_CODE (x) == ZERO_EXTRACT)
4317 && GET_MODE (x) == DImode
4318 && GET_CODE (XEXP (x, 0)) == ASHIFT
4319 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4320 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4322 type = (GET_CODE (x) == SIGN_EXTRACT)
4323 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4324 index = XEXP (XEXP (x, 0), 0);
4325 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4326 if (INTVAL (XEXP (x, 1)) != 32 + shift
4327 || INTVAL (XEXP (x, 2)) != 0)
4328 shift = -1;
4330 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4331 (const_int 0xffffffff<<shift)) */
4332 else if (GET_CODE (x) == AND
4333 && GET_MODE (x) == DImode
4334 && GET_CODE (XEXP (x, 0)) == ASHIFT
4335 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4336 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4337 && CONST_INT_P (XEXP (x, 1)))
4339 type = ADDRESS_REG_UXTW;
4340 index = XEXP (XEXP (x, 0), 0);
4341 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4342 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4343 shift = -1;
4345 /* (mult:P (reg:P) (const_int scale)) */
4346 else if (GET_CODE (x) == MULT
4347 && GET_MODE (x) == Pmode
4348 && GET_MODE (XEXP (x, 0)) == Pmode
4349 && CONST_INT_P (XEXP (x, 1)))
4351 type = ADDRESS_REG_REG;
4352 index = XEXP (x, 0);
4353 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4355 /* (ashift:P (reg:P) (const_int shift)) */
4356 else if (GET_CODE (x) == ASHIFT
4357 && GET_MODE (x) == Pmode
4358 && GET_MODE (XEXP (x, 0)) == Pmode
4359 && CONST_INT_P (XEXP (x, 1)))
4361 type = ADDRESS_REG_REG;
4362 index = XEXP (x, 0);
4363 shift = INTVAL (XEXP (x, 1));
4365 else
4366 return false;
4368 if (!strict_p
4369 && GET_CODE (index) == SUBREG
4370 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4371 index = SUBREG_REG (index);
4373 if ((shift == 0 ||
4374 (shift > 0 && shift <= 3
4375 && (1 << shift) == GET_MODE_SIZE (mode)))
4376 && REG_P (index)
4377 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4379 info->type = type;
4380 info->offset = index;
4381 info->shift = shift;
4382 return true;
4385 return false;
4388 /* Return true if MODE is one of the modes for which we
4389 support LDP/STP operations. */
4391 static bool
4392 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4394 return mode == SImode || mode == DImode
4395 || mode == SFmode || mode == DFmode
4396 || (aarch64_vector_mode_supported_p (mode)
4397 && GET_MODE_SIZE (mode) == 8);
4400 /* Return true if REGNO is a virtual pointer register, or an eliminable
4401 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4402 include stack_pointer or hard_frame_pointer. */
4403 static bool
4404 virt_or_elim_regno_p (unsigned regno)
4406 return ((regno >= FIRST_VIRTUAL_REGISTER
4407 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4408 || regno == FRAME_POINTER_REGNUM
4409 || regno == ARG_POINTER_REGNUM);
/* Return true if X is a valid address for machine mode MODE.  If it is,
   fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
   effect.  OUTER_CODE is PARALLEL for a load/store pair.  */

static bool
aarch64_classify_address (struct aarch64_address_info *info,
			  rtx x, machine_mode mode,
			  RTX_CODE outer_code, bool strict_p)
{
  enum rtx_code code = GET_CODE (x);
  rtx op0, op1;

  /* On BE, we use load/store pair for all large int mode load/stores.
     TI/TFmode may also use a load/store pair.  */
  bool load_store_pair_p = (outer_code == PARALLEL
			    || mode == TImode
			    || mode == TFmode
			    || (BYTES_BIG_ENDIAN
				&& aarch64_vect_struct_mode_p (mode)));

  /* Register-offset (possibly scaled/extended) addressing is only valid
     for single-register accesses, never for pairs or structure modes.  */
  bool allow_reg_index_p =
    !load_store_pair_p
    && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
    && !aarch64_vect_struct_mode_p (mode);

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

  switch (code)
    {
    case REG:
    case SUBREG:
      /* A bare base register: offset is implicitly zero.  */
      info->type = ADDRESS_REG_IMM;
      info->base = x;
      info->offset = const0_rtx;
      return aarch64_base_register_rtx_p (x, strict_p);

    case PLUS:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      /* Before register allocation, accept virtual/eliminable registers
	 plus any constant offset; elimination will rewrite them later.  */
      if (! strict_p
	  && REG_P (op0)
	  && virt_or_elim_regno_p (REGNO (op0))
	  && CONST_INT_P (op1))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;

	  return true;
	}

      if (GET_MODE_SIZE (mode) != 0
	  && CONST_INT_P (op1)
	  && aarch64_base_register_rtx_p (op0, strict_p))
	{
	  HOST_WIDE_INT offset = INTVAL (op1);

	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;

	  /* TImode and TFmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X:   7-bit signed scaled offset
	     Q:     9-bit signed offset
	     We conservatively require an offset representable in either mode.
	     When performing the check for pairs of X registers i.e.  LDP/STP
	     pass down DImode since that is the natural size of the LDP/STP
	     instruction memory accesses.  */
	  if (mode == TImode || mode == TFmode)
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && (offset_9bit_signed_unscaled_p (mode, offset)
			|| offset_12bit_unsigned_scaled_p (mode, offset)));

	  /* A 7bit offset check because OImode will emit a ldp/stp
	     instruction (only big endian will get here).
	     For ldp/stp instructions, the offset is scaled for the size of a
	     single element of the pair.  */
	  if (mode == OImode)
	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);

	  /* Three 9/12 bit offsets checks because CImode will emit three
	     ldr/str instructions (only big endian will get here).  */
	  if (mode == CImode)
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
			|| offset_12bit_unsigned_scaled_p (V16QImode,
							   offset + 32)));

	  /* Two 7bit offsets checks because XImode will emit two ldp/stp
	     instructions (only big endian will get here).  */
	  if (mode == XImode)
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (TImode,
							    offset + 32));

	  if (load_store_pair_p)
	    return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return (offset_9bit_signed_unscaled_p (mode, offset)
		    || offset_12bit_unsigned_scaled_p (mode, offset));
	}

      if (allow_reg_index_p)
	{
	  /* Look for base + (scaled/extended) index register.
	     Try both operand orders, since PLUS is commutative.  */
	  if (aarch64_base_register_rtx_p (op0, strict_p)
	      && aarch64_classify_index (info, op1, mode, strict_p))
	    {
	      info->base = op0;
	      return true;
	    }
	  if (aarch64_base_register_rtx_p (op1, strict_p)
	      && aarch64_classify_index (info, op0, mode, strict_p))
	    {
	      info->base = op1;
	      return true;
	    }
	}

      return false;

    case POST_INC:
    case POST_DEC:
    case PRE_INC:
    case PRE_DEC:
      /* Simple writeback forms: the (implicit) offset is the mode size.  */
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      info->offset = NULL_RTX;
      return aarch64_base_register_rtx_p (info->base, strict_p);

    case POST_MODIFY:
    case PRE_MODIFY:
      /* Writeback with explicit offset: (pre|post_modify base
	 (plus base const)).  The inner base must match the outer one.  */
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      if (GET_CODE (XEXP (x, 1)) == PLUS
	  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  HOST_WIDE_INT offset;
	  info->offset = XEXP (XEXP (x, 1), 1);
	  offset = INTVAL (info->offset);

	  /* TImode and TFmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X:   7-bit signed scaled offset
	     Q:     9-bit signed offset
	     We conservatively require an offset representable in
	     either mode.  */
	  if (mode == TImode || mode == TFmode)
	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
		    && offset_9bit_signed_unscaled_p (mode, offset));

	  if (load_store_pair_p)
	    return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return offset_9bit_signed_unscaled_p (mode, offset);
	}
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      /* load literal: pc-relative constant pool entry.  Only supported
	 for SI mode or larger.  */
      info->type = ADDRESS_SYMBOLIC;

      if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
	{
	  rtx sym, addend;

	  split_const (x, &sym, &addend);
	  return ((GET_CODE (sym) == LABEL_REF
		   || (GET_CODE (sym) == SYMBOL_REF
		       && CONSTANT_POOL_ADDRESS_P (sym)
		       && aarch64_pcrelative_literal_loads)));
	}
      return false;

    case LO_SUM:
      /* base + :lo12: relocation of a small-absolute symbol.  */
      info->type = ADDRESS_LO_SUM;
      info->base = XEXP (x, 0);
      info->offset = XEXP (x, 1);
      if (allow_reg_index_p
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  rtx sym, offs;
	  split_const (info->offset, &sym, &offs);
	  if (GET_CODE (sym) == SYMBOL_REF
	      && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
	    {
	      /* The symbol and offset must be aligned to the access size.  */
	      unsigned int align;
	      unsigned int ref_size;

	      if (CONSTANT_POOL_ADDRESS_P (sym))
		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
		{
		  tree exp = SYMBOL_REF_DECL (sym);
		  align = TYPE_ALIGN (TREE_TYPE (exp));
		  align = CONSTANT_ALIGNMENT (exp, align);
		}
	      else if (SYMBOL_REF_DECL (sym))
		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
		       && SYMBOL_REF_BLOCK (sym) != NULL)
		align = SYMBOL_REF_BLOCK (sym)->alignment;
	      else
		align = BITS_PER_UNIT;

	      /* Structure modes have size 0 here; fall back to DImode.  */
	      ref_size = GET_MODE_SIZE (mode);
	      if (ref_size == 0)
		ref_size = GET_MODE_SIZE (DImode);

	      return ((INTVAL (offs) & (ref_size - 1)) == 0
		      && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
	    }
	}
      return false;

    default:
      return false;
    }
}
4648 /* Return true if the address X is valid for a PRFM instruction.
4649 STRICT_P is true if we should do strict checking with
4650 aarch64_classify_address. */
4652 bool
4653 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4655 struct aarch64_address_info addr;
4657 /* PRFM accepts the same addresses as DImode... */
4658 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4659 if (!res)
4660 return false;
4662 /* ... except writeback forms. */
4663 return addr.type != ADDRESS_REG_WB;
4666 bool
4667 aarch64_symbolic_address_p (rtx x)
4669 rtx offset;
4671 split_const (x, &x, &offset);
4672 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4675 /* Classify the base of symbolic expression X. */
4677 enum aarch64_symbol_type
4678 aarch64_classify_symbolic_expression (rtx x)
4680 rtx offset;
4682 split_const (x, &x, &offset);
4683 return aarch64_classify_symbol (x, offset);
4687 /* Return TRUE if X is a legitimate address for accessing memory in
4688 mode MODE. */
4689 static bool
4690 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4692 struct aarch64_address_info addr;
4694 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4697 /* Return TRUE if X is a legitimate address for accessing memory in
4698 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4699 pair operation. */
4700 bool
4701 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4702 RTX_CODE outer_code, bool strict_p)
4704 struct aarch64_address_info addr;
4706 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
/* Split an out-of-range address displacement into a base and offset.
   Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
   to increase opportunities for sharing the base address of different sizes.
   For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
static bool
aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
{
  HOST_WIDE_INT offset = INTVAL (*disp);
  /* Keep the low 12 bits (4KB) for byte/halfword accesses, otherwise the
     low bits of a 4-byte-scaled 12-bit range (16KB).  */
  HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);

  /* For unaligned offsets and TI/TF mode, round to the nearest 512-byte
     boundary instead, so the remainder fits the signed 9-bit range.  */
  if (mode == TImode || mode == TFmode
      || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
    base = (offset + 0x100) & ~0x1ff;

  /* *OFF gets the anchor part, *DISP the in-range remainder.  */
  *off = GEN_INT (base);
  *disp = GEN_INT (offset - base);
  return true;
}
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{

  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  machine_mode mode = GET_MODE (value);
  if (GET_CODE (value) != CONST_DOUBLE
      || !SCALAR_FLOAT_MODE_P (mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      /* real_to_target produces two 32-bit halves in target word order;
	 reassemble them into one 64-bit image.  */
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
      /* SF/HF images fit entirely in the first 32-bit word.  */
      ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      /* HFmode has no direct integer equivalent here; use SImode for the
	 integer image in that case.  */
      scalar_int_mode imode = (mode == HFmode
			       ? SImode
			       : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      /* Profitable only if the integer image needs fewer than three
	 move instructions before the FMOV.  */
      return num_instr < 3;
    }

  return false;
}
4798 /* Return TRUE if rtx X is immediate constant 0.0 */
4799 bool
4800 aarch64_float_const_zero_rtx_p (rtx x)
4802 if (GET_MODE (x) == VOIDmode)
4803 return false;
4805 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4806 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4807 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
/* Return TRUE if rtx X is immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  if (!TARGET_SIMD)
    return false;

  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode))
    {
      /* Work on the bit image of the float constant.  */
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (GET_CODE (x) == CONST_INT
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

   /* use a 64 bit mode for everything except for DI/DF mode, where we use
     a 128 bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  /* Duplicate the value across a vector and ask the SIMD immediate
     validator whether MOVI can materialize it.  */
  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
}
4851 /* Return the fixed registers used for condition codes. */
4853 static bool
4854 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4856 *p1 = CC_REGNUM;
4857 *p2 = INVALID_REGNUM;
4858 return true;
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
	 || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  /* Wrap the call in a SET when a return value is expected.  */
  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  /* A sibling call returns directly to the caller's caller; a normal
     call clobbers the link register.  */
  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  /* Emit the call and its return/clobber side effect as one PARALLEL.  */
  vec = gen_rtvec (2, call, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
4905 /* Emit call insn with PAT and do aarch64-specific handling. */
4907 void
4908 aarch64_emit_call_insn (rtx pat)
4910 rtx insn = emit_call_insn (pat);
4912 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4913 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4914 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
/* Select the CC mode to use for comparing X against Y with comparison
   code CODE, implementing SELECT_CC_MODE for AArch64.  */

machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
  /* All floating point compares return CCFP if it is an equality
     comparison, and CCFPE otherwise.  */
  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
    {
      switch (code)
	{
	case EQ:
	case NE:
	case UNORDERED:
	case ORDERED:
	case UNLT:
	case UNLE:
	case UNGT:
	case UNGE:
	case UNEQ:
	case LTGT:
	  return CCFPmode;

	case LT:
	case LE:
	case GT:
	case GE:
	  /* These trap on quiet NaNs, hence the "exception" CC mode.  */
	  return CCFPEmode;

	default:
	  gcc_unreachable ();
	}
    }

  /* Equality comparisons of short modes against zero can be performed
     using the TST instruction with the appropriate bitmask.  */
  if (y == const0_rtx && REG_P (x)
      && (code == EQ || code == NE)
      && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
    return CC_NZmode;

  /* Similarly, comparisons of zero_extends from shorter modes can
     be performed using an ANDS with an immediate mask.  */
  if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
      && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
      && (code == EQ || code == NE))
    return CC_NZmode;

  /* Comparisons of arithmetic results against zero can use the N and Z
     flags set by the flag-setting form of the operation itself.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && y == const0_rtx
      && (code == EQ || code == NE || code == LT || code == GE)
      && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
	  || GET_CODE (x) == NEG
	  || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
	      && CONST_INT_P (XEXP (x, 2)))))
    return CC_NZmode;

  /* A compare with a shifted operand.  Because of canonicalization,
     the comparison will have to be swapped when we emit the assembly
     code.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
      && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
	  || GET_CODE (x) == LSHIFTRT
	  || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
    return CC_SWPmode;

  /* Similarly for a negated operand, but we can only do this for
     equalities.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG)
      && (code == EQ || code == NE)
      && GET_CODE (x) == NEG)
    return CC_Zmode;

  /* A test for unsigned overflow.  */
  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
      && code == NE
      && GET_CODE (x) == PLUS
      && GET_CODE (y) == ZERO_EXTEND)
    return CC_Cmode;

  /* For everything else, return CCmode.  */
  return CCmode;
}
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

/* Return the AArch64 condition code (aarch64_cond_code value) for the
   comparison rtx X, or -1 if it cannot be represented.  If the operands
   are not already in a CC mode, derive the CC mode via SELECT_CC_MODE.  */
int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
/* Worker for aarch64_get_condition_code: map the rtx comparison code
   COMP_CODE, evaluated in CC mode MODE, to an AArch64 condition code,
   or -1 if the combination cannot be represented.  */
static int
aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
{
  switch (mode)
    {
    case E_CCFPmode:
    case E_CCFPEmode:
      /* FP compares: note LE/LT map to LS/MI so that unordered results
	 behave correctly.  */
      switch (comp_code)
	{
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LS;
	case LT: return AARCH64_MI;
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case ORDERED: return AARCH64_VC;
	case UNORDERED: return AARCH64_VS;
	case UNLT: return AARCH64_LT;
	case UNLE: return AARCH64_LE;
	case UNGT: return AARCH64_HI;
	case UNGE: return AARCH64_PL;
	default: return -1;
	}
      break;

    case E_CCmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LE;
	case LT: return AARCH64_LT;
	case GEU: return AARCH64_CS;
	case GTU: return AARCH64_HI;
	case LEU: return AARCH64_LS;
	case LTU: return AARCH64_CC;
	default: return -1;
	}
      break;

    case E_CC_SWPmode:
      /* The operands were swapped when the compare was emitted, so each
	 ordering condition maps to its mirror image.  */
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_LE;
	case GT: return AARCH64_LT;
	case LE: return AARCH64_GE;
	case LT: return AARCH64_GT;
	case GEU: return AARCH64_LS;
	case GTU: return AARCH64_CC;
	case LEU: return AARCH64_CS;
	case LTU: return AARCH64_HI;
	default: return -1;
	}
      break;

    case E_CC_NZmode:
      /* Only the N and Z flags are valid here.  */
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_PL;
	case LT: return AARCH64_MI;
	default: return -1;
	}
      break;

    case E_CC_Zmode:
      /* Only the Z flag is valid here.  */
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	default: return -1;
	}
      break;

    case E_CC_Cmode:
      /* Carry flag tests for unsigned overflow.  */
      switch (comp_code)
	{
	case NE: return AARCH64_CS;
	case EQ: return AARCH64_CC;
	default: return -1;
	}
      break;

    default:
      return -1;
    }

  return -1;
}
5111 bool
5112 aarch64_const_vec_all_same_in_range_p (rtx x,
5113 HOST_WIDE_INT minval,
5114 HOST_WIDE_INT maxval)
5116 HOST_WIDE_INT firstval;
5117 int count, i;
5119 if (GET_CODE (x) != CONST_VECTOR
5120 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5121 return false;
5123 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5124 if (firstval < minval || firstval > maxval)
5125 return false;
5127 count = CONST_VECTOR_NUNITS (x);
5128 for (i = 1; i < count; i++)
5129 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5130 return false;
5132 return true;
5135 bool
5136 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5138 return aarch64_const_vec_all_same_in_range_p (x, val, val);
/* Bit positions of the condition flags in the NZCV immediate:
   N Z C V.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
5169 /* Print operand X to file F in a target specific manner according to CODE.
5170 The acceptable formatting commands given by CODE are:
5171 'c': An integer or symbol address without a preceding #
5172 sign.
5173 'e': Print the sign/zero-extend size as a character 8->b,
5174 16->h, 32->w.
5175 'p': Prints N such that 2^N == X (X must be power of 2 and
5176 const int).
5177 'P': Print the number of non-zero bits in X (a const_int).
5178 'H': Print the higher numbered register of a pair (TImode)
5179 of regs.
5180 'm': Print a condition (eq, ne, etc).
5181 'M': Same as 'm', but invert condition.
5182 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5183 'S/T/U/V': Print a FP/SIMD register name for a register list.
5184 The register printed is the FP/SIMD register name
5185 of X + 0/1/2/3 for S/T/U/V.
5186 'R': Print a scalar FP/SIMD register name + 1.
5187 'X': Print bottom 16 bits of integer constant in hex.
5188 'w/x': Print a general register name or the zero register
5189 (32-bit or 64-bit).
5190 '0': Print a normal operand, if it's a general register,
5191 then we assume DImode.
5192 'k': Print NZCV for conditional compare instructions.
5193 'A': Output address constant representing the first
5194 argument of X, specifying a relocation offset
5195 if appropriate.
5196 'L': Output constant address specified by X
5197 with a relocation offset if appropriate.
5198 'G': Prints address of X, specifying a PC relative
5199 relocation mode if appropriate. */
5201 static void
5202 aarch64_print_operand (FILE *f, rtx x, int code)
5204 switch (code)
5206 case 'c':
5207 switch (GET_CODE (x))
5209 case CONST_INT:
5210 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5211 break;
5213 case SYMBOL_REF:
5214 output_addr_const (f, x);
5215 break;
5217 case CONST:
5218 if (GET_CODE (XEXP (x, 0)) == PLUS
5219 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5221 output_addr_const (f, x);
5222 break;
5224 /* Fall through. */
5226 default:
5227 output_operand_lossage ("Unsupported operand for code '%c'", code);
5229 break;
5231 case 'e':
5233 int n;
5235 if (!CONST_INT_P (x)
5236 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5238 output_operand_lossage ("invalid operand for '%%%c'", code);
5239 return;
5242 switch (n)
5244 case 3:
5245 fputc ('b', f);
5246 break;
5247 case 4:
5248 fputc ('h', f);
5249 break;
5250 case 5:
5251 fputc ('w', f);
5252 break;
5253 default:
5254 output_operand_lossage ("invalid operand for '%%%c'", code);
5255 return;
5258 break;
5260 case 'p':
5262 int n;
5264 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5266 output_operand_lossage ("invalid operand for '%%%c'", code);
5267 return;
5270 asm_fprintf (f, "%d", n);
5272 break;
5274 case 'P':
5275 if (!CONST_INT_P (x))
5277 output_operand_lossage ("invalid operand for '%%%c'", code);
5278 return;
5281 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5282 break;
5284 case 'H':
5285 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5287 output_operand_lossage ("invalid operand for '%%%c'", code);
5288 return;
5291 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5292 break;
5294 case 'M':
5295 case 'm':
5297 int cond_code;
5298 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5299 if (x == const_true_rtx)
5301 if (code == 'M')
5302 fputs ("nv", f);
5303 return;
5306 if (!COMPARISON_P (x))
5308 output_operand_lossage ("invalid operand for '%%%c'", code);
5309 return;
5312 cond_code = aarch64_get_condition_code (x);
5313 gcc_assert (cond_code >= 0);
5314 if (code == 'M')
5315 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5316 fputs (aarch64_condition_codes[cond_code], f);
5318 break;
5320 case 'b':
5321 case 'h':
5322 case 's':
5323 case 'd':
5324 case 'q':
5325 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5327 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5328 return;
5330 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5331 break;
5333 case 'S':
5334 case 'T':
5335 case 'U':
5336 case 'V':
5337 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5339 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5340 return;
5342 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5343 break;
5345 case 'R':
5346 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5348 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5349 return;
5351 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5352 break;
5354 case 'X':
5355 if (!CONST_INT_P (x))
5357 output_operand_lossage ("invalid operand for '%%%c'", code);
5358 return;
5360 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5361 break;
5363 case 'w':
5364 case 'x':
5365 if (x == const0_rtx
5366 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5368 asm_fprintf (f, "%czr", code);
5369 break;
5372 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5374 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5375 break;
5378 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5380 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5381 break;
5384 /* Fall through */
5386 case 0:
5387 if (x == NULL)
5389 output_operand_lossage ("missing operand");
5390 return;
5393 switch (GET_CODE (x))
5395 case REG:
5396 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5397 break;
5399 case MEM:
5400 output_address (GET_MODE (x), XEXP (x, 0));
5401 /* Check all memory references are Pmode - even with ILP32. */
5402 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5403 break;
5405 case CONST:
5406 case LABEL_REF:
5407 case SYMBOL_REF:
5408 output_addr_const (asm_out_file, x);
5409 break;
5411 case CONST_INT:
5412 asm_fprintf (f, "%wd", INTVAL (x));
5413 break;
5415 case CONST_VECTOR:
5416 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5418 gcc_assert (
5419 aarch64_const_vec_all_same_in_range_p (x,
5420 HOST_WIDE_INT_MIN,
5421 HOST_WIDE_INT_MAX));
5422 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5424 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5426 fputc ('0', f);
5428 else
5429 gcc_unreachable ();
5430 break;
5432 case CONST_DOUBLE:
5433 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5434 be getting CONST_DOUBLEs holding integers. */
5435 gcc_assert (GET_MODE (x) != VOIDmode);
5436 if (aarch64_float_const_zero_rtx_p (x))
5438 fputc ('0', f);
5439 break;
5441 else if (aarch64_float_const_representable_p (x))
5443 #define buf_size 20
5444 char float_buf[buf_size] = {'\0'};
5445 real_to_decimal_for_mode (float_buf,
5446 CONST_DOUBLE_REAL_VALUE (x),
5447 buf_size, buf_size,
5448 1, GET_MODE (x));
5449 asm_fprintf (asm_out_file, "%s", float_buf);
5450 break;
5451 #undef buf_size
5453 output_operand_lossage ("invalid constant");
5454 return;
5455 default:
5456 output_operand_lossage ("invalid operand");
5457 return;
5459 break;
5461 case 'A':
5462 if (GET_CODE (x) == HIGH)
5463 x = XEXP (x, 0);
5465 switch (aarch64_classify_symbolic_expression (x))
5467 case SYMBOL_SMALL_GOT_4G:
5468 asm_fprintf (asm_out_file, ":got:");
5469 break;
5471 case SYMBOL_SMALL_TLSGD:
5472 asm_fprintf (asm_out_file, ":tlsgd:");
5473 break;
5475 case SYMBOL_SMALL_TLSDESC:
5476 asm_fprintf (asm_out_file, ":tlsdesc:");
5477 break;
5479 case SYMBOL_SMALL_TLSIE:
5480 asm_fprintf (asm_out_file, ":gottprel:");
5481 break;
5483 case SYMBOL_TLSLE24:
5484 asm_fprintf (asm_out_file, ":tprel:");
5485 break;
5487 case SYMBOL_TINY_GOT:
5488 gcc_unreachable ();
5489 break;
5491 default:
5492 break;
5494 output_addr_const (asm_out_file, x);
5495 break;
5497 case 'L':
5498 switch (aarch64_classify_symbolic_expression (x))
5500 case SYMBOL_SMALL_GOT_4G:
5501 asm_fprintf (asm_out_file, ":lo12:");
5502 break;
5504 case SYMBOL_SMALL_TLSGD:
5505 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5506 break;
5508 case SYMBOL_SMALL_TLSDESC:
5509 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5510 break;
5512 case SYMBOL_SMALL_TLSIE:
5513 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5514 break;
5516 case SYMBOL_TLSLE12:
5517 asm_fprintf (asm_out_file, ":tprel_lo12:");
5518 break;
5520 case SYMBOL_TLSLE24:
5521 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5522 break;
5524 case SYMBOL_TINY_GOT:
5525 asm_fprintf (asm_out_file, ":got:");
5526 break;
5528 case SYMBOL_TINY_TLSIE:
5529 asm_fprintf (asm_out_file, ":gottprel:");
5530 break;
5532 default:
5533 break;
5535 output_addr_const (asm_out_file, x);
5536 break;
5538 case 'G':
5539 switch (aarch64_classify_symbolic_expression (x))
5541 case SYMBOL_TLSLE24:
5542 asm_fprintf (asm_out_file, ":tprel_hi12:");
5543 break;
5544 default:
5545 break;
5547 output_addr_const (asm_out_file, x);
5548 break;
5550 case 'k':
5552 HOST_WIDE_INT cond_code;
5554 if (!CONST_INT_P (x))
5556 output_operand_lossage ("invalid operand for '%%%c'", code);
5557 return;
5560 cond_code = INTVAL (x);
5561 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5562 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5564 break;
5566 default:
5567 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5568 return;
/* Print address X of a memory access with mode MODE to file F, in the
   assembler syntax matching the addressing form recognized by
   aarch64_classify_address.  An unclassifiable address falls through
   to output_addr_const.  */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
{
  struct aarch64_address_info addr;

  if (aarch64_classify_address (&addr, x, mode, MEM, true))
    switch (addr.type)
      {
      case ADDRESS_REG_IMM:
	/* [base] or [base, #imm].  */
	if (addr.offset == const0_rtx)
	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
	else
	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
		       INTVAL (addr.offset));
	return;

      case ADDRESS_REG_REG:
	/* [base, Xm] or [base, Xm, lsl #shift].  */
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)]);
	else
	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)], addr.shift);
	return;

      case ADDRESS_REG_UXTW:
	/* Zero-extended 32-bit index: [base, Wm, uxtw {#shift}].  */
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return;

      case ADDRESS_REG_SXTW:
	/* Sign-extended 32-bit index: [base, Wm, sxtw {#shift}].  */
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return;

      case ADDRESS_REG_WB:
	/* Writeback forms; the RTX code of X selects pre/post
	   increment, decrement or modify.  The inc/dec amount is the
	   access size of MODE.  */
	switch (GET_CODE (x))
	  {
	  case PRE_INC:
	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
			 GET_MODE_SIZE (mode));
	    return;
	  case POST_INC:
	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
			 GET_MODE_SIZE (mode));
	    return;
	  case PRE_DEC:
	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
			 GET_MODE_SIZE (mode));
	    return;
	  case POST_DEC:
	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
			 GET_MODE_SIZE (mode));
	    return;
	  case PRE_MODIFY:
	    asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return;
	  case POST_MODIFY:
	    asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return;
	  default:
	    break;
	  }
	break;

      case ADDRESS_LO_SUM:
	/* Low 12 bits of a symbol: [base, #:lo12:sym].  */
	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
	output_addr_const (f, addr.offset);
	asm_fprintf (f, "]");
	return;

      case ADDRESS_SYMBOLIC:
	break;
      }

  output_addr_const (f, x);
}
5660 bool
5661 aarch64_label_mentioned_p (rtx x)
5663 const char *fmt;
5664 int i;
5666 if (GET_CODE (x) == LABEL_REF)
5667 return true;
5669 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5670 referencing instruction, but they are constant offsets, not
5671 symbols. */
5672 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5673 return false;
5675 fmt = GET_RTX_FORMAT (GET_CODE (x));
5676 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5678 if (fmt[i] == 'E')
5680 int j;
5682 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5683 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5684 return 1;
5686 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5687 return 1;
5690 return 0;
5693 /* Implement REGNO_REG_CLASS. */
5695 enum reg_class
5696 aarch64_regno_regclass (unsigned regno)
5698 if (GP_REGNUM_P (regno))
5699 return GENERAL_REGS;
5701 if (regno == SP_REGNUM)
5702 return STACK_REG;
5704 if (regno == FRAME_POINTER_REGNUM
5705 || regno == ARG_POINTER_REGNUM)
5706 return POINTER_REGS;
5708 if (FP_REGNUM_P (regno))
5709 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5711 return NO_REGS;
/* Implement TARGET_LEGITIMIZE_ADDRESS for AArch64.  */
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
	{
	  rtx op0 = XEXP (base, 0);
	  rtx op1 = XEXP (base, 1);

	  /* Force any scaling into a temp for CSE.  */
	  op0 = force_reg (Pmode, op0);
	  op1 = force_reg (Pmode, op1);

	  /* Let the pointer register be in op0.  */
	  if (REG_POINTER (op1))
	    std::swap (op0, op1);

	  /* If the pointer is virtual or frame related, then we know that
	     virtual register instantiation or register elimination is going
	     to apply a second constant.  We want the two constants folded
	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
	  if (virt_or_elim_regno_p (REGNO (op0)))
	    {
	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
				   NULL_RTX, true, OPTAB_DIRECT);
	      return gen_rtx_PLUS (Pmode, base, op1);
	    }

	  /* Otherwise, in order to encourage CSE (and thence loop strength
	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
	  base = expand_binop (Pmode, add_optab, op0, op1,
			       NULL_RTX, true, OPTAB_DIRECT);
	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
	}

      /* Does it look like we'll need a 16-byte load/store-pair operation?  */
      HOST_WIDE_INT base_offset;
      if (GET_MODE_SIZE (mode) > 16)
	base_offset = (offset + 0x400) & ~0x7f0;
      /* For offsets that aren't a multiple of the access size, the limit is
	 -256...255.  */
      else if (offset & (GET_MODE_SIZE (mode) - 1))
	{
	  base_offset = (offset + 0x100) & ~0x1ff;

	  /* BLKmode typically uses LDP of X-registers.  */
	  if (mode == BLKmode)
	    base_offset = (offset + 512) & ~0x3ff;
	}
      /* Small negative offsets are supported.  */
      else if (IN_RANGE (offset, -256, 0))
	base_offset = 0;
      else if (mode == TImode || mode == TFmode)
	base_offset = (offset + 0x100) & ~0x1ff;
      /* Use 12-bit offset by access size.  */
      else
	base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));

      if (base_offset != 0)
	{
	  base = plus_constant (Pmode, base, base_offset);
	  base = force_operand (base, NULL_RTX);
	  return plus_constant (Pmode, base, offset - base_offset);
	}
    }

  return x;
}
/* Return the reload icode required for a constant pool in mode.
   Only the modes that can legitimately appear in a literal pool
   (scalar FP, TF, and the SIMD vector modes) are handled; any other
   mode is a caller bug.  */
static enum insn_code
aarch64_constant_pool_reload_icode (machine_mode mode)
{
  switch (mode)
    {
    case E_SFmode:
      return CODE_FOR_aarch64_reload_movcpsfdi;

    case E_DFmode:
      return CODE_FOR_aarch64_reload_movcpdfdi;

    case E_TFmode:
      return CODE_FOR_aarch64_reload_movcptfdi;

    case E_V8QImode:
      return CODE_FOR_aarch64_reload_movcpv8qidi;

    case E_V16QImode:
      return CODE_FOR_aarch64_reload_movcpv16qidi;

    case E_V4HImode:
      return CODE_FOR_aarch64_reload_movcpv4hidi;

    case E_V8HImode:
      return CODE_FOR_aarch64_reload_movcpv8hidi;

    case E_V2SImode:
      return CODE_FOR_aarch64_reload_movcpv2sidi;

    case E_V4SImode:
      return CODE_FOR_aarch64_reload_movcpv4sidi;

    case E_V2DImode:
      return CODE_FOR_aarch64_reload_movcpv2didi;

    case E_V2DFmode:
      return CODE_FOR_aarch64_reload_movcpv2dfdi;

    default:
      gcc_unreachable ();
    }

  gcc_unreachable ();
}
/* Implement TARGET_SECONDARY_RELOAD.  Return a register class (and
   optionally set SRI->icode) when moving X of mode MODE into RCLASS
   needs a scratch register or an intermediate class.  */
static reg_class_t
aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
			  reg_class_t rclass,
			  machine_mode mode,
			  secondary_reload_info *sri)
{
  /* If we have to disable direct literal pool loads and stores because the
     function is too big, then we need a scratch register.
     NOTE(review): MEM_P (x) and GET_CODE (x) == SYMBOL_REF test for two
     different RTX codes on the same object, so this condition looks
     unsatisfiable as written -- confirm whether the SYMBOL_REF /
     CONSTANT_POOL_ADDRESS_P tests were meant for XEXP (x, 0).  */
  if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
      && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
	  || targetm.vector_mode_supported_p (GET_MODE (x)))
      && !aarch64_pcrelative_literal_loads)
    {
      sri->icode = aarch64_constant_pool_reload_icode (mode);
      return NO_REGS;
    }

  /* Without the TARGET_SIMD instructions we cannot move a Q register
     to a Q register directly.  We need a scratch.  */
  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
      && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
      && reg_class_subset_p (rclass, FP_REGS))
    {
      if (mode == TFmode)
	sri->icode = CODE_FOR_aarch64_reload_movtf;
      else if (mode == TImode)
	sri->icode = CODE_FOR_aarch64_reload_movti;
      return NO_REGS;
    }

  /* A TFmode or TImode memory access should be handled via an FP_REGS
     because AArch64 has richer addressing modes for LDR/STR instructions
     than LDP/STP instructions.  */
  if (TARGET_FLOAT && rclass == GENERAL_REGS
      && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
    return FP_REGS;

  /* 128-bit constants cannot be loaded into FP_REGS directly; go via
     the general registers.  */
  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
      return GENERAL_REGS;

  return NO_REGS;
}
5886 static bool
5887 aarch64_can_eliminate (const int from, const int to)
5889 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5890 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5892 if (frame_pointer_needed)
5894 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5895 return true;
5896 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5897 return false;
5898 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5899 && !cfun->calls_alloca)
5900 return true;
5901 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5902 return true;
5904 return false;
5906 else
5908 /* If we decided that we didn't need a leaf frame pointer but then used
5909 LR in the function, then we'll want a frame pointer after all, so
5910 prevent this elimination to ensure a frame pointer is used. */
5911 if (to == STACK_POINTER_REGNUM
5912 && flag_omit_leaf_frame_pointer
5913 && df_regs_ever_live_p (LR_REGNUM))
5914 return false;
5917 return true;
/* Implement INITIAL_ELIMINATION_OFFSET.  Return the offset to add to
   eliminable register FROM when replacing it by register TO, based on
   the function's frame layout.  */
HOST_WIDE_INT
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  /* Make sure cfun->machine->frame is laid out before reading it.  */
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  /* Remaining case: ARG_POINTER_REGNUM into STACK_POINTER_REGNUM.  */
  return cfun->machine->frame.frame_size;
}
5945 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5946 previous frame. */
5949 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5951 if (count != 0)
5952 return const0_rtx;
5953 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  Emit the code part of the
   trampoline: two PC-relative literal loads (function address into
   IP1, static chain into the static-chain register) and an indirect
   branch, followed by zeroed data words that aarch64_trampoline_init
   later overwrites with the real values.  */
static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      /* ILP32: pointers are 32-bit, load into W registers.  */
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  /* Padding word plus the two pointer-sized data slots.  */
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
/* Implement TARGET_TRAMPOLINE_INIT.  Copy the code template into
   M_TRAMP and fill in its data words: the entry address of FNDECL and
   CHAIN_VALUE.  Finally flush the instruction cache over the whole
   trampoline via __clear_cache.  */
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  /* Size of the code portion emitted by the trampoline template.  */
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
		     ptr_mode);
}
6003 static unsigned char
6004 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6006 switch (regclass)
6008 case CALLER_SAVE_REGS:
6009 case POINTER_REGS:
6010 case GENERAL_REGS:
6011 case ALL_REGS:
6012 case FP_REGS:
6013 case FP_LO_REGS:
6014 return
6015 aarch64_vector_mode_p (mode)
6016 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6017 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6018 case STACK_REG:
6019 return 1;
6021 case NO_REGS:
6022 return 0;
6024 default:
6025 break;
6027 gcc_unreachable ();
/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  /* POINTER_REGS exists for register elimination; prefer allocating
     from the general registers.  */
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  /* Only pointer-class registers may be reloaded into the stack
     pointer class.  */
  if (regclass == STACK_REG)
    {
      if (REG_P(x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	  return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     right now.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (GET_CODE (lhs) == SUBREG)
	lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
				      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
/* Implement ASM_OUTPUT_LABELREF.  Print NAME to F with the
   user-label prefix (the %U asm_fprintf directive) applied.  */
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
6072 static void
6073 aarch64_elf_asm_constructor (rtx symbol, int priority)
6075 if (priority == DEFAULT_INIT_PRIORITY)
6076 default_ctor_section_asm_out_constructor (symbol, priority);
6077 else
6079 section *s;
6080 /* While priority is known to be in range [0, 65535], so 18 bytes
6081 would be enough, the compiler might not know that. To avoid
6082 -Wformat-truncation false positive, use a larger size. */
6083 char buf[23];
6084 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6085 s = get_section (buf, SECTION_WRITE, NULL);
6086 switch_to_section (s);
6087 assemble_align (POINTER_SIZE);
6088 assemble_aligned_integer (POINTER_BYTES, symbol);
6092 static void
6093 aarch64_elf_asm_destructor (rtx symbol, int priority)
6095 if (priority == DEFAULT_INIT_PRIORITY)
6096 default_dtor_section_asm_out_destructor (symbol, priority);
6097 else
6099 section *s;
6100 /* While priority is known to be in range [0, 65535], so 18 bytes
6101 would be enough, the compiler might not know that. To avoid
6102 -Wformat-truncation false positive, use a larger size. */
6103 char buf[23];
6104 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6105 s = get_section (buf, SECTION_WRITE, NULL);
6106 switch_to_section (s);
6107 assemble_align (POINTER_SIZE);
6108 assemble_aligned_integer (POINTER_BYTES, symbol);
/* Output the assembly for a casesi jump-table dispatch.  OPERANDS are
   the operands of the casesi pattern; operands[2] is the jump-table
   label, whose following ADDR_DIFF_VEC determines the element size
   used to index the table.  */
const char*
aarch64_output_casesi (rtx *operands)
{
  char buf[100];
  char label[100];
  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
  int index;
  /* Load/add pairs indexed by log2 of the table element size.  */
  static const char *const patterns[4][2] =
  {
    {
      "ldrb\t%w3, [%0,%w1,uxtw]",
      "add\t%3, %4, %w3, sxtb #2"
    },
    {
      "ldrh\t%w3, [%0,%w1,uxtw #1]",
      "add\t%3, %4, %w3, sxth #2"
    },
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    },
    /* We assume that DImode is only generated when not optimizing and
       that we don't really need 64-bit address offsets.  That would
       imply an object file with 8GB of code in a single function!  */
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    }
  };

  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);

  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
  index = exact_log2 (GET_MODE_SIZE (mode));

  gcc_assert (index >= 0 && index <= 3);

  /* Need to implement table size reduction, by changing the code below.  */
  output_asm_insn (patterns[index][0], operands);
  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
  snprintf (buf, sizeof (buf),
	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
  output_asm_insn (buf, operands);
  output_asm_insn (patterns[index][1], operands);
  output_asm_insn ("br\t%3", operands);
  assemble_label (asm_out_file, label);
  return "";
}
6162 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6163 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6164 operator. */
6167 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6169 if (shift >= 0 && shift <= 3)
6171 int size;
6172 for (size = 8; size <= 32; size *= 2)
6174 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6175 if (mask == bits << shift)
6176 return size;
6179 return 0;
6182 /* Constant pools are per function only when PC relative
6183 literal loads are true or we are in the large memory
6184 model. */
6186 static inline bool
6187 aarch64_can_use_per_function_literal_pools_p (void)
6189 return (aarch64_pcrelative_literal_loads
6190 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
/* Implement TARGET_USE_BLOCKS_FOR_CONSTANT_P.  */
static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* Fixme:: In an ideal world this would work similar
     to the logic in aarch64_select_rtx_section but this
     breaks bootstrap in gcc go.  For now we workaround
     this by returning false here.  */
  return false;
}
6203 /* Select appropriate section for constants depending
6204 on where we place literal pools. */
6206 static section *
6207 aarch64_select_rtx_section (machine_mode mode,
6208 rtx x,
6209 unsigned HOST_WIDE_INT align)
6211 if (aarch64_can_use_per_function_literal_pools_p ())
6212 return function_section (current_function_decl);
6214 return default_elf_select_rtx_section (mode, x, align);
/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
6229 /* Costs. */
6231 /* Helper function for rtx cost calculation. Strip a shift expression
6232 from X. Returns the inner operand if successful, or the original
6233 expression on failure. */
6234 static rtx
6235 aarch64_strip_shift (rtx x)
6237 rtx op = x;
6239 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6240 we can convert both to ROR during final output. */
6241 if ((GET_CODE (op) == ASHIFT
6242 || GET_CODE (op) == ASHIFTRT
6243 || GET_CODE (op) == LSHIFTRT
6244 || GET_CODE (op) == ROTATERT
6245 || GET_CODE (op) == ROTATE)
6246 && CONST_INT_P (XEXP (op, 1)))
6247 return XEXP (op, 0);
6249 if (GET_CODE (op) == MULT
6250 && CONST_INT_P (XEXP (op, 1))
6251 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6252 return XEXP (op, 0);
6254 return x;
/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  /* Only scalar integer modes are handled.  */
  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  /* Zero and sign extraction of a widened value.  */
  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
      && XEXP (op, 2) == const0_rtx
      && GET_CODE (XEXP (op, 0)) == MULT
      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
					 XEXP (op, 1)))
    return XEXP (XEXP (op, 0), 0);

  /* It can also be represented (for zero-extend) as an AND with an
     immediate.  */
  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  /* Return the stripped operand only if something was stripped.  */
  if (op != x)
    return op;

  return x;
}
6307 /* Return true iff CODE is a shift supported in combination
6308 with arithmetic instructions. */
6310 static bool
6311 aarch64_shift_p (enum rtx_code code)
6313 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6317 /* Return true iff X is a cheap shift without a sign extend. */
6319 static bool
6320 aarch64_cheap_mult_shift_p (rtx x)
6322 rtx op0, op1;
6324 op0 = XEXP (x, 0);
6325 op1 = XEXP (x, 1);
6327 if (!(aarch64_tune_params.extra_tuning_flags
6328 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6329 return false;
6331 if (GET_CODE (op0) == SIGN_EXTEND)
6332 return false;
6334 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6335 && UINTVAL (op1) <= 4)
6336 return true;
6338 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6339 return false;
6341 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6343 if (l2 > 0 && l2 <= 4)
6344 return true;
6346 return false;
/* Helper function for rtx cost calculation.  Calculate the cost of
   a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
   Return the calculated cost of the expression, recursing manually in to
   operands where needed.  */

static int
aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
{
  rtx op0, op1;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int cost = 0;
  /* A MULT/shift nested in a PLUS or MINUS may fuse into a single
     compound instruction (MADD, ARITH+shift, ...).  */
  bool compound_p = (outer == PLUS || outer == MINUS);
  machine_mode mode = GET_MODE (x);

  gcc_checking_assert (code == MULT);

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  /* Cost vector operations using their element mode.  */
  if (VECTOR_MODE_P (mode))
    mode = GET_MODE_INNER (mode);

  /* Integer multiply/fma.  */
  if (GET_MODE_CLASS (mode) == MODE_INT)
    {
      /* The multiply will be canonicalized as a shift, cost it as such.  */
      if (aarch64_shift_p (GET_CODE (x))
	  || (CONST_INT_P (op1)
	      && exact_log2 (INTVAL (op1)) > 0))
	{
	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
	                   || GET_CODE (op0) == SIGN_EXTEND;
	  if (speed)
	    {
	      if (compound_p)
	        {
		  /* If the shift is considered cheap,
		     then don't add any cost.  */
		  if (aarch64_cheap_mult_shift_p (x))
		    ;
		  else if (REG_P (op1))
		    /* ARITH + shift-by-register.  */
		    cost += extra_cost->alu.arith_shift_reg;
		  else if (is_extend)
		    /* ARITH + extended register.  We don't have a cost field
		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
		    cost += extra_cost->alu.extend_arith;
		  else
		    /* ARITH + shift-by-immediate.  */
		    cost += extra_cost->alu.arith_shift;
		}
	      else
		/* LSL (immediate).  */
		cost += extra_cost->alu.shift;
	    }

	  /* Strip extends as we will have costed them in the case above.  */
	  if (is_extend)
	    op0 = aarch64_strip_extend (op0, true);

	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);

	  return cost;
	}

      /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
	 compound and let the below cases handle it.  After all, MNEG is a
	 special-case alias of MSUB.  */
      if (GET_CODE (op0) == NEG)
	{
	  op0 = XEXP (op0, 0);
	  compound_p = true;
	}

      /* Integer multiplies or FMAs have zero/sign extending variants.  */
      if ((GET_CODE (op0) == ZERO_EXTEND
	   && GET_CODE (op1) == ZERO_EXTEND)
	  || (GET_CODE (op0) == SIGN_EXTEND
	      && GET_CODE (op1) == SIGN_EXTEND))
	{
	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);

	  if (speed)
	    {
	      if (compound_p)
		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
		cost += extra_cost->mult[0].extend_add;
	      else
		/* MUL/SMULL/UMULL.  */
		cost += extra_cost->mult[0].extend;
	    }

	  return cost;
	}

      /* This is either an integer multiply or a MADD.  In both cases
	 we want to recurse and cost the operands.  */
      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);

      if (speed)
	{
	  if (compound_p)
	    /* MADD/MSUB.  */
	    cost += extra_cost->mult[mode == DImode].add;
	  else
	    /* MUL.  */
	    cost += extra_cost->mult[mode == DImode].simple;
	}

      return cost;
    }
  else
    {
      if (speed)
	{
	  /* Floating-point FMA/FMUL can also support negations of the
	     operands, unless the rounding mode is upward or downward in
	     which case FNMUL is different than FMUL with operand negation.  */
	  bool neg0 = GET_CODE (op0) == NEG;
	  bool neg1 = GET_CODE (op1) == NEG;
	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
	    {
	      if (neg0)
		op0 = XEXP (op0, 0);
	      if (neg1)
		op1 = XEXP (op1, 0);
	    }

	  if (compound_p)
	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
	    cost += extra_cost->fp[mode == DFmode].fma;
	  else
	    /* FMUL/FNMUL.  */
	    cost += extra_cost->fp[mode == DFmode].mult;
	}

      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);
      return cost;
    }
}
/* Implement TARGET_ADDRESS_COST.  Compute the cost of address X used
   with an access of mode MODE, using the per-core address cost
   tables.  */
static int
aarch64_address_cost (rtx x,
		      machine_mode mode,
		      addr_space_t as ATTRIBUTE_UNUSED,
		      bool speed)
{
  enum rtx_code c = GET_CODE (x);
  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
  struct aarch64_address_info info;
  int cost = 0;
  info.shift = 0;

  if (!aarch64_classify_address (&info, x, mode, c, false))
    {
      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
	{
	  /* This is a CONST or SYMBOL ref which will be split
	     in a different way depending on the code model in use.
	     Cost it through the generic infrastructure.  */
	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
	  /* Divide through by the cost of one instruction to
	     bring it to the same units as the address costs.  */
	  cost_symbol_ref /= COSTS_N_INSNS (1);
	  /* The cost is then the cost of preparing the address,
	     followed by an immediate (possibly 0) offset.  */
	  return cost_symbol_ref + addr_cost->imm_offset;
	}
      else
	{
	  /* This is most likely a jump table from a case
	     statement.  */
	  return addr_cost->register_offset;
	}
    }

  /* Base cost depends on the addressing form.  */
  switch (info.type)
    {
    case ADDRESS_LO_SUM:
    case ADDRESS_SYMBOLIC:
    case ADDRESS_REG_IMM:
      cost += addr_cost->imm_offset;
      break;

    case ADDRESS_REG_WB:
      if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
	cost += addr_cost->pre_modify;
      else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
	cost += addr_cost->post_modify;
      else
	gcc_unreachable ();

      break;

    case ADDRESS_REG_REG:
      cost += addr_cost->register_offset;
      break;

    case ADDRESS_REG_SXTW:
      cost += addr_cost->register_sextend;
      break;

    case ADDRESS_REG_UXTW:
      cost += addr_cost->register_zextend;
      break;

    default:
      gcc_unreachable ();
    }

  if (info.shift > 0)
    {
      /* For the sake of calculating the cost of the shifted register
	 component, we can treat same sized modes in the same way.  */
      switch (GET_MODE_BITSIZE (mode))
	{
	case 16:
	  cost += addr_cost->addr_scale_costs.hi;
	  break;

	case 32:
	  cost += addr_cost->addr_scale_costs.si;
	  break;

	case 64:
	  cost += addr_cost->addr_scale_costs.di;
	  break;

	/* We can't tell, or this is a 128-bit vector.  */
	default:
	  cost += addr_cost->addr_scale_costs.ti;
	  break;
	}
    }

  return cost;
}
6592 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6593 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6594 to be taken. */
6597 aarch64_branch_cost (bool speed_p, bool predictable_p)
6599 /* When optimizing for speed, use the cost of unpredictable branches. */
6600 const struct cpu_branch_cost *branch_costs =
6601 aarch64_tune_params.branch_costs;
6603 if (!speed_p || predictable_p)
6604 return branch_costs->predictable;
6605 else
6606 return branch_costs->unpredictable;
/* Return true if the RTX X in mode MODE is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
{
  /* Catch add with a sign extract.
     This is add_<optab><mode>_multp2.  */
  if (GET_CODE (x) == SIGN_EXTRACT
      || GET_CODE (x) == ZERO_EXTRACT)
    {
      rtx op0 = XEXP (x, 0);
      rtx op1 = XEXP (x, 1);
      rtx op2 = XEXP (x, 2);

      /* The extract must start at bit 0 of a (reg * 2^n) and match
	 one of the extend-from-extract forms.  */
      if (GET_CODE (op0) == MULT
	  && CONST_INT_P (op1)
	  && op2 == const0_rtx
	  && CONST_INT_P (XEXP (op0, 1))
	  && aarch64_is_extend_from_extract (mode,
					     XEXP (op0, 1),
					     op1))
	return true;
    }
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  else if (GET_CODE (x) == SIGN_EXTEND
	   || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}
6643 static bool
6644 aarch64_frint_unspec_p (unsigned int u)
6646 switch (u)
6648 case UNSPEC_FRINTZ:
6649 case UNSPEC_FRINTP:
6650 case UNSPEC_FRINTM:
6651 case UNSPEC_FRINTA:
6652 case UNSPEC_FRINTN:
6653 case UNSPEC_FRINTX:
6654 case UNSPEC_FRINTI:
6655 return true;
6657 default:
6658 return false;
6662 /* Return true iff X is an rtx that will match an extr instruction
6663 i.e. as described in the *extr<mode>5_insn family of patterns.
6664 OP0 and OP1 will be set to the operands of the shifts involved
6665 on success and will be NULL_RTX otherwise. */
6667 static bool
6668 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6670 rtx op0, op1;
6671 scalar_int_mode mode;
6672 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6673 return false;
6675 *res_op0 = NULL_RTX;
6676 *res_op1 = NULL_RTX;
6678 if (GET_CODE (x) != IOR)
6679 return false;
6681 op0 = XEXP (x, 0);
6682 op1 = XEXP (x, 1);
6684 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6685 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6687 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6688 if (GET_CODE (op1) == ASHIFT)
6689 std::swap (op0, op1);
6691 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6692 return false;
6694 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6695 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6697 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6698 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6700 *res_op0 = XEXP (op0, 0);
6701 *res_op1 = XEXP (op1, 0);
6702 return true;
6706 return false;
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated.  SPEED selects the speed-tuned cost tables.
   Returning false means the caller should fall back to costing all the
   operands itself.  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;

  /* Split the condition into the value being tested, the value it is
     compared against, and the comparison code.  */
  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      /* Not a comparison: treat OP0 as an implicit test against zero.  */
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
	/* Branching on already-computed condition flags: the branch
	   itself is covered by the baseline cost, so add nothing.  */
	return true;
      else
	{
	  if (cmpcode == NE || cmpcode == EQ)
	    {
	      if (comparator == const0_rtx)
		{
		  /* TBZ/TBNZ/CBZ/CBNZ.  */
		  if (GET_CODE (inner) == ZERO_EXTRACT)
		    /* TBZ/TBNZ.  */
		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
				       ZERO_EXTRACT, 0, speed);
		  else
		    /* CBZ/CBNZ.  */
		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

		  return true;
		}
	    }
	  else if (cmpcode == LT || cmpcode == GE)
	    {
	      /* TBZ/TBNZ.  A signed comparison against zero is a test of
		 the sign bit, which the branch instruction encodes for
		 free; no extra cost.  */
	      if (comparator == const0_rtx)
		return true;
	    }
	  /* Other comparison shapes fall through to the "unknown" return
	     below so the caller costs the operands itself.  */
	}
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
	{
	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
	  if (XEXP (op1, 1) == const0_rtx)
	    *cost += 1;
	  if (speed)
	    {
	      machine_mode mode = GET_MODE (XEXP (op1, 0));
	      const struct cpu_cost_table *extra_cost
		= aarch64_tune_params.insn_extra_cost;

	      /* Integer CCMP costs like an ALU op; FP like a compare.  */
	      if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->alu.arith;
	      else
		*cost += extra_cost->fp[mode == DFmode].compare;
	    }
	  return true;
	}

      /* It's a conditional operation based on the status flags,
	 so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
	  || GET_CODE (op1) == NOT
	  || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
	op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
	{
	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
	  op1 = XEXP (op1, 0);
	  op2 = XEXP (op2, 0);
	}

      /* Cost the two (possibly stripped) value operands of the CSEL.  */
      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
6809 /* Check whether X is a bitfield operation of the form shift + extend that
6810 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6811 operand to which the bitfield operation is applied. Otherwise return
6812 NULL_RTX. */
6814 static rtx
6815 aarch64_extend_bitfield_pattern_p (rtx x)
6817 rtx_code outer_code = GET_CODE (x);
6818 machine_mode outer_mode = GET_MODE (x);
6820 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6821 && outer_mode != SImode && outer_mode != DImode)
6822 return NULL_RTX;
6824 rtx inner = XEXP (x, 0);
6825 rtx_code inner_code = GET_CODE (inner);
6826 machine_mode inner_mode = GET_MODE (inner);
6827 rtx op = NULL_RTX;
6829 switch (inner_code)
6831 case ASHIFT:
6832 if (CONST_INT_P (XEXP (inner, 1))
6833 && (inner_mode == QImode || inner_mode == HImode))
6834 op = XEXP (inner, 0);
6835 break;
6836 case LSHIFTRT:
6837 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6838 && (inner_mode == QImode || inner_mode == HImode))
6839 op = XEXP (inner, 0);
6840 break;
6841 case ASHIFTRT:
6842 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6843 && (inner_mode == QImode || inner_mode == HImode))
6844 op = XEXP (inner, 0);
6845 break;
6846 default:
6847 break;
6850 return op;
6853 /* Return true if the mask and a shift amount from an RTX of the form
6854 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6855 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6857 bool
6858 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6859 rtx shft_amnt)
6861 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6862 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6863 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6864 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6867 /* Calculate the cost of calculating X, storing it in *COST. Result
6868 is true if the total cost of the operation has now been calculated. */
6869 static bool
6870 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6871 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6873 rtx op0, op1, op2;
6874 const struct cpu_cost_table *extra_cost
6875 = aarch64_tune_params.insn_extra_cost;
6876 int code = GET_CODE (x);
6877 scalar_int_mode int_mode;
6879 /* By default, assume that everything has equivalent cost to the
6880 cheapest instruction. Any additional costs are applied as a delta
6881 above this default. */
6882 *cost = COSTS_N_INSNS (1);
6884 switch (code)
6886 case SET:
6887 /* The cost depends entirely on the operands to SET. */
6888 *cost = 0;
6889 op0 = SET_DEST (x);
6890 op1 = SET_SRC (x);
6892 switch (GET_CODE (op0))
6894 case MEM:
6895 if (speed)
6897 rtx address = XEXP (op0, 0);
6898 if (VECTOR_MODE_P (mode))
6899 *cost += extra_cost->ldst.storev;
6900 else if (GET_MODE_CLASS (mode) == MODE_INT)
6901 *cost += extra_cost->ldst.store;
6902 else if (mode == SFmode)
6903 *cost += extra_cost->ldst.storef;
6904 else if (mode == DFmode)
6905 *cost += extra_cost->ldst.stored;
6907 *cost +=
6908 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6909 0, speed));
6912 *cost += rtx_cost (op1, mode, SET, 1, speed);
6913 return true;
6915 case SUBREG:
6916 if (! REG_P (SUBREG_REG (op0)))
6917 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6919 /* Fall through. */
6920 case REG:
6921 /* The cost is one per vector-register copied. */
6922 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6924 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6925 / GET_MODE_SIZE (V4SImode);
6926 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6928 /* const0_rtx is in general free, but we will use an
6929 instruction to set a register to 0. */
6930 else if (REG_P (op1) || op1 == const0_rtx)
6932 /* The cost is 1 per register copied. */
6933 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6934 / UNITS_PER_WORD;
6935 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6937 else
6938 /* Cost is just the cost of the RHS of the set. */
6939 *cost += rtx_cost (op1, mode, SET, 1, speed);
6940 return true;
6942 case ZERO_EXTRACT:
6943 case SIGN_EXTRACT:
6944 /* Bit-field insertion. Strip any redundant widening of
6945 the RHS to meet the width of the target. */
6946 if (GET_CODE (op1) == SUBREG)
6947 op1 = SUBREG_REG (op1);
6948 if ((GET_CODE (op1) == ZERO_EXTEND
6949 || GET_CODE (op1) == SIGN_EXTEND)
6950 && CONST_INT_P (XEXP (op0, 1))
6951 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6952 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6953 op1 = XEXP (op1, 0);
6955 if (CONST_INT_P (op1))
6957 /* MOV immediate is assumed to always be cheap. */
6958 *cost = COSTS_N_INSNS (1);
6960 else
6962 /* BFM. */
6963 if (speed)
6964 *cost += extra_cost->alu.bfi;
6965 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6968 return true;
6970 default:
6971 /* We can't make sense of this, assume default cost. */
6972 *cost = COSTS_N_INSNS (1);
6973 return false;
6975 return false;
6977 case CONST_INT:
6978 /* If an instruction can incorporate a constant within the
6979 instruction, the instruction's expression avoids calling
6980 rtx_cost() on the constant. If rtx_cost() is called on a
6981 constant, then it is usually because the constant must be
6982 moved into a register by one or more instructions.
6984 The exception is constant 0, which can be expressed
6985 as XZR/WZR and is therefore free. The exception to this is
6986 if we have (set (reg) (const0_rtx)) in which case we must cost
6987 the move. However, we can catch that when we cost the SET, so
6988 we don't need to consider that here. */
6989 if (x == const0_rtx)
6990 *cost = 0;
6991 else
6993 /* To an approximation, building any other constant is
6994 proportionally expensive to the number of instructions
6995 required to build that constant. This is true whether we
6996 are compiling for SPEED or otherwise. */
6997 if (!is_a <scalar_int_mode> (mode, &int_mode))
6998 int_mode = word_mode;
6999 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7000 (NULL_RTX, x, false, int_mode));
7002 return true;
7004 case CONST_DOUBLE:
7006 /* First determine number of instructions to do the move
7007 as an integer constant. */
7008 if (!aarch64_float_const_representable_p (x)
7009 && !aarch64_can_const_movi_rtx_p (x, mode)
7010 && aarch64_float_const_rtx_p (x))
7012 unsigned HOST_WIDE_INT ival;
7013 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7014 gcc_assert (succeed);
7016 scalar_int_mode imode = (mode == HFmode
7017 ? SImode
7018 : int_mode_for_mode (mode).require ());
7019 int ncost = aarch64_internal_mov_immediate
7020 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7021 *cost += COSTS_N_INSNS (ncost);
7022 return true;
7025 if (speed)
7027 /* mov[df,sf]_aarch64. */
7028 if (aarch64_float_const_representable_p (x))
7029 /* FMOV (scalar immediate). */
7030 *cost += extra_cost->fp[mode == DFmode].fpconst;
7031 else if (!aarch64_float_const_zero_rtx_p (x))
7033 /* This will be a load from memory. */
7034 if (mode == DFmode)
7035 *cost += extra_cost->ldst.loadd;
7036 else
7037 *cost += extra_cost->ldst.loadf;
7039 else
7040 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7041 or MOV v0.s[0], wzr - neither of which are modeled by the
7042 cost tables. Just use the default cost. */
7047 return true;
7049 case MEM:
7050 if (speed)
7052 /* For loads we want the base cost of a load, plus an
7053 approximation for the additional cost of the addressing
7054 mode. */
7055 rtx address = XEXP (x, 0);
7056 if (VECTOR_MODE_P (mode))
7057 *cost += extra_cost->ldst.loadv;
7058 else if (GET_MODE_CLASS (mode) == MODE_INT)
7059 *cost += extra_cost->ldst.load;
7060 else if (mode == SFmode)
7061 *cost += extra_cost->ldst.loadf;
7062 else if (mode == DFmode)
7063 *cost += extra_cost->ldst.loadd;
7065 *cost +=
7066 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7067 0, speed));
7070 return true;
7072 case NEG:
7073 op0 = XEXP (x, 0);
7075 if (VECTOR_MODE_P (mode))
7077 if (speed)
7079 /* FNEG. */
7080 *cost += extra_cost->vect.alu;
7082 return false;
7085 if (GET_MODE_CLASS (mode) == MODE_INT)
7087 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7088 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7090 /* CSETM. */
7091 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7092 return true;
7095 /* Cost this as SUB wzr, X. */
7096 op0 = CONST0_RTX (mode);
7097 op1 = XEXP (x, 0);
7098 goto cost_minus;
7101 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7103 /* Support (neg(fma...)) as a single instruction only if
7104 sign of zeros is unimportant. This matches the decision
7105 making in aarch64.md. */
7106 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7108 /* FNMADD. */
7109 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7110 return true;
7112 if (GET_CODE (op0) == MULT)
7114 /* FNMUL. */
7115 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7116 return true;
7118 if (speed)
7119 /* FNEG. */
7120 *cost += extra_cost->fp[mode == DFmode].neg;
7121 return false;
7124 return false;
7126 case CLRSB:
7127 case CLZ:
7128 if (speed)
7130 if (VECTOR_MODE_P (mode))
7131 *cost += extra_cost->vect.alu;
7132 else
7133 *cost += extra_cost->alu.clz;
7136 return false;
7138 case COMPARE:
7139 op0 = XEXP (x, 0);
7140 op1 = XEXP (x, 1);
7142 if (op1 == const0_rtx
7143 && GET_CODE (op0) == AND)
7145 x = op0;
7146 mode = GET_MODE (op0);
7147 goto cost_logic;
7150 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7152 /* TODO: A write to the CC flags possibly costs extra, this
7153 needs encoding in the cost tables. */
7155 mode = GET_MODE (op0);
7156 /* ANDS. */
7157 if (GET_CODE (op0) == AND)
7159 x = op0;
7160 goto cost_logic;
7163 if (GET_CODE (op0) == PLUS)
7165 /* ADDS (and CMN alias). */
7166 x = op0;
7167 goto cost_plus;
7170 if (GET_CODE (op0) == MINUS)
7172 /* SUBS. */
7173 x = op0;
7174 goto cost_minus;
7177 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7178 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7179 && CONST_INT_P (XEXP (op0, 2)))
7181 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7182 Handle it here directly rather than going to cost_logic
7183 since we know the immediate generated for the TST is valid
7184 so we can avoid creating an intermediate rtx for it only
7185 for costing purposes. */
7186 if (speed)
7187 *cost += extra_cost->alu.logical;
7189 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7190 ZERO_EXTRACT, 0, speed);
7191 return true;
7194 if (GET_CODE (op1) == NEG)
7196 /* CMN. */
7197 if (speed)
7198 *cost += extra_cost->alu.arith;
7200 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7201 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7202 return true;
7205 /* CMP.
7207 Compare can freely swap the order of operands, and
7208 canonicalization puts the more complex operation first.
7209 But the integer MINUS logic expects the shift/extend
7210 operation in op1. */
7211 if (! (REG_P (op0)
7212 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7214 op0 = XEXP (x, 1);
7215 op1 = XEXP (x, 0);
7217 goto cost_minus;
7220 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7222 /* FCMP. */
7223 if (speed)
7224 *cost += extra_cost->fp[mode == DFmode].compare;
7226 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7228 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7229 /* FCMP supports constant 0.0 for no extra cost. */
7230 return true;
7232 return false;
7235 if (VECTOR_MODE_P (mode))
7237 /* Vector compare. */
7238 if (speed)
7239 *cost += extra_cost->vect.alu;
7241 if (aarch64_float_const_zero_rtx_p (op1))
7243 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7244 cost. */
7245 return true;
7247 return false;
7249 return false;
7251 case MINUS:
7253 op0 = XEXP (x, 0);
7254 op1 = XEXP (x, 1);
7256 cost_minus:
7257 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7259 /* Detect valid immediates. */
7260 if ((GET_MODE_CLASS (mode) == MODE_INT
7261 || (GET_MODE_CLASS (mode) == MODE_CC
7262 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7263 && CONST_INT_P (op1)
7264 && aarch64_uimm12_shift (INTVAL (op1)))
7266 if (speed)
7267 /* SUB(S) (immediate). */
7268 *cost += extra_cost->alu.arith;
7269 return true;
7272 /* Look for SUB (extended register). */
7273 if (is_a <scalar_int_mode> (mode, &int_mode)
7274 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7276 if (speed)
7277 *cost += extra_cost->alu.extend_arith;
7279 op1 = aarch64_strip_extend (op1, true);
7280 *cost += rtx_cost (op1, VOIDmode,
7281 (enum rtx_code) GET_CODE (op1), 0, speed);
7282 return true;
7285 rtx new_op1 = aarch64_strip_extend (op1, false);
7287 /* Cost this as an FMA-alike operation. */
7288 if ((GET_CODE (new_op1) == MULT
7289 || aarch64_shift_p (GET_CODE (new_op1)))
7290 && code != COMPARE)
7292 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7293 (enum rtx_code) code,
7294 speed);
7295 return true;
7298 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7300 if (speed)
7302 if (VECTOR_MODE_P (mode))
7304 /* Vector SUB. */
7305 *cost += extra_cost->vect.alu;
7307 else if (GET_MODE_CLASS (mode) == MODE_INT)
7309 /* SUB(S). */
7310 *cost += extra_cost->alu.arith;
7312 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7314 /* FSUB. */
7315 *cost += extra_cost->fp[mode == DFmode].addsub;
7318 return true;
7321 case PLUS:
7323 rtx new_op0;
7325 op0 = XEXP (x, 0);
7326 op1 = XEXP (x, 1);
7328 cost_plus:
7329 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7330 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7332 /* CSINC. */
7333 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7334 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7335 return true;
7338 if (GET_MODE_CLASS (mode) == MODE_INT
7339 && CONST_INT_P (op1)
7340 && aarch64_uimm12_shift (INTVAL (op1)))
7342 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7344 if (speed)
7345 /* ADD (immediate). */
7346 *cost += extra_cost->alu.arith;
7347 return true;
7350 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7352 /* Look for ADD (extended register). */
7353 if (is_a <scalar_int_mode> (mode, &int_mode)
7354 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7356 if (speed)
7357 *cost += extra_cost->alu.extend_arith;
7359 op0 = aarch64_strip_extend (op0, true);
7360 *cost += rtx_cost (op0, VOIDmode,
7361 (enum rtx_code) GET_CODE (op0), 0, speed);
7362 return true;
7365 /* Strip any extend, leave shifts behind as we will
7366 cost them through mult_cost. */
7367 new_op0 = aarch64_strip_extend (op0, false);
7369 if (GET_CODE (new_op0) == MULT
7370 || aarch64_shift_p (GET_CODE (new_op0)))
7372 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7373 speed);
7374 return true;
7377 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7379 if (speed)
7381 if (VECTOR_MODE_P (mode))
7383 /* Vector ADD. */
7384 *cost += extra_cost->vect.alu;
7386 else if (GET_MODE_CLASS (mode) == MODE_INT)
7388 /* ADD. */
7389 *cost += extra_cost->alu.arith;
7391 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7393 /* FADD. */
7394 *cost += extra_cost->fp[mode == DFmode].addsub;
7397 return true;
7400 case BSWAP:
7401 *cost = COSTS_N_INSNS (1);
7403 if (speed)
7405 if (VECTOR_MODE_P (mode))
7406 *cost += extra_cost->vect.alu;
7407 else
7408 *cost += extra_cost->alu.rev;
7410 return false;
7412 case IOR:
7413 if (aarch_rev16_p (x))
7415 *cost = COSTS_N_INSNS (1);
7417 if (speed)
7419 if (VECTOR_MODE_P (mode))
7420 *cost += extra_cost->vect.alu;
7421 else
7422 *cost += extra_cost->alu.rev;
7424 return true;
7427 if (aarch64_extr_rtx_p (x, &op0, &op1))
7429 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7430 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7431 if (speed)
7432 *cost += extra_cost->alu.shift;
7434 return true;
7436 /* Fall through. */
7437 case XOR:
7438 case AND:
7439 cost_logic:
7440 op0 = XEXP (x, 0);
7441 op1 = XEXP (x, 1);
7443 if (VECTOR_MODE_P (mode))
7445 if (speed)
7446 *cost += extra_cost->vect.alu;
7447 return true;
7450 if (code == AND
7451 && GET_CODE (op0) == MULT
7452 && CONST_INT_P (XEXP (op0, 1))
7453 && CONST_INT_P (op1)
7454 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7455 INTVAL (op1)) != 0)
7457 /* This is a UBFM/SBFM. */
7458 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7459 if (speed)
7460 *cost += extra_cost->alu.bfx;
7461 return true;
7464 if (is_int_mode (mode, &int_mode))
7466 if (CONST_INT_P (op1))
7468 /* We have a mask + shift version of a UBFIZ
7469 i.e. the *andim_ashift<mode>_bfiz pattern. */
7470 if (GET_CODE (op0) == ASHIFT
7471 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7472 XEXP (op0, 1)))
7474 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7475 (enum rtx_code) code, 0, speed);
7476 if (speed)
7477 *cost += extra_cost->alu.bfx;
7479 return true;
7481 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7483 /* We possibly get the immediate for free, this is not
7484 modelled. */
7485 *cost += rtx_cost (op0, int_mode,
7486 (enum rtx_code) code, 0, speed);
7487 if (speed)
7488 *cost += extra_cost->alu.logical;
7490 return true;
7493 else
7495 rtx new_op0 = op0;
7497 /* Handle ORN, EON, or BIC. */
7498 if (GET_CODE (op0) == NOT)
7499 op0 = XEXP (op0, 0);
7501 new_op0 = aarch64_strip_shift (op0);
7503 /* If we had a shift on op0 then this is a logical-shift-
7504 by-register/immediate operation. Otherwise, this is just
7505 a logical operation. */
7506 if (speed)
7508 if (new_op0 != op0)
7510 /* Shift by immediate. */
7511 if (CONST_INT_P (XEXP (op0, 1)))
7512 *cost += extra_cost->alu.log_shift;
7513 else
7514 *cost += extra_cost->alu.log_shift_reg;
7516 else
7517 *cost += extra_cost->alu.logical;
7520 /* In both cases we want to cost both operands. */
7521 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7522 0, speed);
7523 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7524 1, speed);
7526 return true;
7529 return false;
7531 case NOT:
7532 x = XEXP (x, 0);
7533 op0 = aarch64_strip_shift (x);
7535 if (VECTOR_MODE_P (mode))
7537 /* Vector NOT. */
7538 *cost += extra_cost->vect.alu;
7539 return false;
7542 /* MVN-shifted-reg. */
7543 if (op0 != x)
7545 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7547 if (speed)
7548 *cost += extra_cost->alu.log_shift;
7550 return true;
7552 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7553 Handle the second form here taking care that 'a' in the above can
7554 be a shift. */
7555 else if (GET_CODE (op0) == XOR)
7557 rtx newop0 = XEXP (op0, 0);
7558 rtx newop1 = XEXP (op0, 1);
7559 rtx op0_stripped = aarch64_strip_shift (newop0);
7561 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7562 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7564 if (speed)
7566 if (op0_stripped != newop0)
7567 *cost += extra_cost->alu.log_shift;
7568 else
7569 *cost += extra_cost->alu.logical;
7572 return true;
7574 /* MVN. */
7575 if (speed)
7576 *cost += extra_cost->alu.logical;
7578 return false;
7580 case ZERO_EXTEND:
7582 op0 = XEXP (x, 0);
7583 /* If a value is written in SI mode, then zero extended to DI
7584 mode, the operation will in general be free as a write to
7585 a 'w' register implicitly zeroes the upper bits of an 'x'
7586 register. However, if this is
7588 (set (reg) (zero_extend (reg)))
7590 we must cost the explicit register move. */
7591 if (mode == DImode
7592 && GET_MODE (op0) == SImode
7593 && outer == SET)
7595 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7597 /* If OP_COST is non-zero, then the cost of the zero extend
7598 is effectively the cost of the inner operation. Otherwise
7599 we have a MOV instruction and we take the cost from the MOV
7600 itself. This is true independently of whether we are
7601 optimizing for space or time. */
7602 if (op_cost)
7603 *cost = op_cost;
7605 return true;
7607 else if (MEM_P (op0))
7609 /* All loads can zero extend to any size for free. */
7610 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7611 return true;
7614 op0 = aarch64_extend_bitfield_pattern_p (x);
7615 if (op0)
7617 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7618 if (speed)
7619 *cost += extra_cost->alu.bfx;
7620 return true;
7623 if (speed)
7625 if (VECTOR_MODE_P (mode))
7627 /* UMOV. */
7628 *cost += extra_cost->vect.alu;
7630 else
7632 /* We generate an AND instead of UXTB/UXTH. */
7633 *cost += extra_cost->alu.logical;
7636 return false;
7638 case SIGN_EXTEND:
7639 if (MEM_P (XEXP (x, 0)))
7641 /* LDRSH. */
7642 if (speed)
7644 rtx address = XEXP (XEXP (x, 0), 0);
7645 *cost += extra_cost->ldst.load_sign_extend;
7647 *cost +=
7648 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7649 0, speed));
7651 return true;
7654 op0 = aarch64_extend_bitfield_pattern_p (x);
7655 if (op0)
7657 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7658 if (speed)
7659 *cost += extra_cost->alu.bfx;
7660 return true;
7663 if (speed)
7665 if (VECTOR_MODE_P (mode))
7666 *cost += extra_cost->vect.alu;
7667 else
7668 *cost += extra_cost->alu.extend;
7670 return false;
7672 case ASHIFT:
7673 op0 = XEXP (x, 0);
7674 op1 = XEXP (x, 1);
7676 if (CONST_INT_P (op1))
7678 if (speed)
7680 if (VECTOR_MODE_P (mode))
7682 /* Vector shift (immediate). */
7683 *cost += extra_cost->vect.alu;
7685 else
7687 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7688 aliases. */
7689 *cost += extra_cost->alu.shift;
7693 /* We can incorporate zero/sign extend for free. */
7694 if (GET_CODE (op0) == ZERO_EXTEND
7695 || GET_CODE (op0) == SIGN_EXTEND)
7696 op0 = XEXP (op0, 0);
7698 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7699 return true;
7701 else
7703 if (VECTOR_MODE_P (mode))
7705 if (speed)
7706 /* Vector shift (register). */
7707 *cost += extra_cost->vect.alu;
7709 else
7711 if (speed)
7712 /* LSLV. */
7713 *cost += extra_cost->alu.shift_reg;
7715 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7716 && CONST_INT_P (XEXP (op1, 1))
7717 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7719 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7720 /* We already demanded XEXP (op1, 0) to be REG_P, so
7721 don't recurse into it. */
7722 return true;
7725 return false; /* All arguments need to be in registers. */
7728 case ROTATE:
7729 case ROTATERT:
7730 case LSHIFTRT:
7731 case ASHIFTRT:
7732 op0 = XEXP (x, 0);
7733 op1 = XEXP (x, 1);
7735 if (CONST_INT_P (op1))
7737 /* ASR (immediate) and friends. */
7738 if (speed)
7740 if (VECTOR_MODE_P (mode))
7741 *cost += extra_cost->vect.alu;
7742 else
7743 *cost += extra_cost->alu.shift;
7746 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7747 return true;
7749 else
7751 if (VECTOR_MODE_P (mode))
7753 if (speed)
7754 /* Vector shift (register). */
7755 *cost += extra_cost->vect.alu;
7757 else
7759 if (speed)
7760 /* ASR (register) and friends. */
7761 *cost += extra_cost->alu.shift_reg;
7763 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7764 && CONST_INT_P (XEXP (op1, 1))
7765 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7767 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7768 /* We already demanded XEXP (op1, 0) to be REG_P, so
7769 don't recurse into it. */
7770 return true;
7773 return false; /* All arguments need to be in registers. */
7776 case SYMBOL_REF:
7778 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7779 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7781 /* LDR. */
7782 if (speed)
7783 *cost += extra_cost->ldst.load;
7785 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7786 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7788 /* ADRP, followed by ADD. */
7789 *cost += COSTS_N_INSNS (1);
7790 if (speed)
7791 *cost += 2 * extra_cost->alu.arith;
7793 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7794 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7796 /* ADR. */
7797 if (speed)
7798 *cost += extra_cost->alu.arith;
7801 if (flag_pic)
7803 /* One extra load instruction, after accessing the GOT. */
7804 *cost += COSTS_N_INSNS (1);
7805 if (speed)
7806 *cost += extra_cost->ldst.load;
7808 return true;
7810 case HIGH:
7811 case LO_SUM:
7812 /* ADRP/ADD (immediate). */
7813 if (speed)
7814 *cost += extra_cost->alu.arith;
7815 return true;
7817 case ZERO_EXTRACT:
7818 case SIGN_EXTRACT:
7819 /* UBFX/SBFX. */
7820 if (speed)
7822 if (VECTOR_MODE_P (mode))
7823 *cost += extra_cost->vect.alu;
7824 else
7825 *cost += extra_cost->alu.bfx;
7828 /* We can trust that the immediates used will be correct (there
7829 are no by-register forms), so we need only cost op0. */
7830 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7831 return true;
7833 case MULT:
7834 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7835 /* aarch64_rtx_mult_cost always handles recursion to its
7836 operands. */
7837 return true;
7839 case MOD:
7840 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7841 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7842 an unconditional negate. This case should only ever be reached through
7843 the set_smod_pow2_cheap check in expmed.c. */
7844 if (CONST_INT_P (XEXP (x, 1))
7845 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7846 && (mode == SImode || mode == DImode))
7848 /* We expand to 4 instructions. Reset the baseline. */
7849 *cost = COSTS_N_INSNS (4);
7851 if (speed)
7852 *cost += 2 * extra_cost->alu.logical
7853 + 2 * extra_cost->alu.arith;
7855 return true;
7858 /* Fall-through. */
7859 case UMOD:
7860 if (speed)
7862 /* Slighly prefer UMOD over SMOD. */
7863 if (VECTOR_MODE_P (mode))
7864 *cost += extra_cost->vect.alu;
7865 else if (GET_MODE_CLASS (mode) == MODE_INT)
7866 *cost += (extra_cost->mult[mode == DImode].add
7867 + extra_cost->mult[mode == DImode].idiv
7868 + (code == MOD ? 1 : 0));
7870 return false; /* All arguments need to be in registers. */
7872 case DIV:
7873 case UDIV:
7874 case SQRT:
7875 if (speed)
7877 if (VECTOR_MODE_P (mode))
7878 *cost += extra_cost->vect.alu;
7879 else if (GET_MODE_CLASS (mode) == MODE_INT)
7880 /* There is no integer SQRT, so only DIV and UDIV can get
7881 here. */
7882 *cost += (extra_cost->mult[mode == DImode].idiv
7883 /* Slighly prefer UDIV over SDIV. */
7884 + (code == DIV ? 1 : 0));
7885 else
7886 *cost += extra_cost->fp[mode == DFmode].div;
7888 return false; /* All arguments need to be in registers. */
7890 case IF_THEN_ELSE:
7891 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7892 XEXP (x, 2), cost, speed);
7894 case EQ:
7895 case NE:
7896 case GT:
7897 case GTU:
7898 case LT:
7899 case LTU:
7900 case GE:
7901 case GEU:
7902 case LE:
7903 case LEU:
7905 return false; /* All arguments must be in registers. */
7907 case FMA:
7908 op0 = XEXP (x, 0);
7909 op1 = XEXP (x, 1);
7910 op2 = XEXP (x, 2);
7912 if (speed)
7914 if (VECTOR_MODE_P (mode))
7915 *cost += extra_cost->vect.alu;
7916 else
7917 *cost += extra_cost->fp[mode == DFmode].fma;
7920 /* FMSUB, FNMADD, and FNMSUB are free. */
7921 if (GET_CODE (op0) == NEG)
7922 op0 = XEXP (op0, 0);
7924 if (GET_CODE (op2) == NEG)
7925 op2 = XEXP (op2, 0);
7927 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7928 and the by-element operand as operand 0. */
7929 if (GET_CODE (op1) == NEG)
7930 op1 = XEXP (op1, 0);
7932 /* Catch vector-by-element operations. The by-element operand can
7933 either be (vec_duplicate (vec_select (x))) or just
7934 (vec_select (x)), depending on whether we are multiplying by
7935 a vector or a scalar.
7937 Canonicalization is not very good in these cases, FMA4 will put the
7938 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7939 if (GET_CODE (op0) == VEC_DUPLICATE)
7940 op0 = XEXP (op0, 0);
7941 else if (GET_CODE (op1) == VEC_DUPLICATE)
7942 op1 = XEXP (op1, 0);
7944 if (GET_CODE (op0) == VEC_SELECT)
7945 op0 = XEXP (op0, 0);
7946 else if (GET_CODE (op1) == VEC_SELECT)
7947 op1 = XEXP (op1, 0);
7949 /* If the remaining parameters are not registers,
7950 get the cost to put them into registers. */
7951 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7952 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7953 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7954 return true;
7956 case FLOAT:
7957 case UNSIGNED_FLOAT:
7958 if (speed)
7959 *cost += extra_cost->fp[mode == DFmode].fromint;
7960 return false;
7962 case FLOAT_EXTEND:
7963 if (speed)
7965 if (VECTOR_MODE_P (mode))
7967 /*Vector truncate. */
7968 *cost += extra_cost->vect.alu;
7970 else
7971 *cost += extra_cost->fp[mode == DFmode].widen;
7973 return false;
7975 case FLOAT_TRUNCATE:
7976 if (speed)
7978 if (VECTOR_MODE_P (mode))
7980 /*Vector conversion. */
7981 *cost += extra_cost->vect.alu;
7983 else
7984 *cost += extra_cost->fp[mode == DFmode].narrow;
7986 return false;
7988 case FIX:
7989 case UNSIGNED_FIX:
7990 x = XEXP (x, 0);
7991 /* Strip the rounding part. They will all be implemented
7992 by the fcvt* family of instructions anyway. */
7993 if (GET_CODE (x) == UNSPEC)
7995 unsigned int uns_code = XINT (x, 1);
7997 if (uns_code == UNSPEC_FRINTA
7998 || uns_code == UNSPEC_FRINTM
7999 || uns_code == UNSPEC_FRINTN
8000 || uns_code == UNSPEC_FRINTP
8001 || uns_code == UNSPEC_FRINTZ)
8002 x = XVECEXP (x, 0, 0);
8005 if (speed)
8007 if (VECTOR_MODE_P (mode))
8008 *cost += extra_cost->vect.alu;
8009 else
8010 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8013 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8014 fixed-point fcvt. */
8015 if (GET_CODE (x) == MULT
8016 && ((VECTOR_MODE_P (mode)
8017 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8018 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8020 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8021 0, speed);
8022 return true;
8025 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8026 return true;
8028 case ABS:
8029 if (VECTOR_MODE_P (mode))
8031 /* ABS (vector). */
8032 if (speed)
8033 *cost += extra_cost->vect.alu;
8035 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8037 op0 = XEXP (x, 0);
8039 /* FABD, which is analogous to FADD. */
8040 if (GET_CODE (op0) == MINUS)
8042 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8043 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8044 if (speed)
8045 *cost += extra_cost->fp[mode == DFmode].addsub;
8047 return true;
8049 /* Simple FABS is analogous to FNEG. */
8050 if (speed)
8051 *cost += extra_cost->fp[mode == DFmode].neg;
8053 else
8055 /* Integer ABS will either be split to
8056 two arithmetic instructions, or will be an ABS
8057 (scalar), which we don't model. */
8058 *cost = COSTS_N_INSNS (2);
8059 if (speed)
8060 *cost += 2 * extra_cost->alu.arith;
8062 return false;
8064 case SMAX:
8065 case SMIN:
8066 if (speed)
8068 if (VECTOR_MODE_P (mode))
8069 *cost += extra_cost->vect.alu;
8070 else
8072 /* FMAXNM/FMINNM/FMAX/FMIN.
8073 TODO: This may not be accurate for all implementations, but
8074 we do not model this in the cost tables. */
8075 *cost += extra_cost->fp[mode == DFmode].addsub;
8078 return false;
8080 case UNSPEC:
8081 /* The floating point round to integer frint* instructions. */
8082 if (aarch64_frint_unspec_p (XINT (x, 1)))
8084 if (speed)
8085 *cost += extra_cost->fp[mode == DFmode].roundint;
8087 return false;
8090 if (XINT (x, 1) == UNSPEC_RBIT)
8092 if (speed)
8093 *cost += extra_cost->alu.rev;
8095 return false;
8097 break;
8099 case TRUNCATE:
8101 /* Decompose <su>muldi3_highpart. */
8102 if (/* (truncate:DI */
8103 mode == DImode
8104 /* (lshiftrt:TI */
8105 && GET_MODE (XEXP (x, 0)) == TImode
8106 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8107 /* (mult:TI */
8108 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8109 /* (ANY_EXTEND:TI (reg:DI))
8110 (ANY_EXTEND:TI (reg:DI))) */
8111 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8112 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8113 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8114 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8115 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8116 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8117 /* (const_int 64) */
8118 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8119 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8121 /* UMULH/SMULH. */
8122 if (speed)
8123 *cost += extra_cost->mult[mode == DImode].extend;
8124 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8125 mode, MULT, 0, speed);
8126 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8127 mode, MULT, 1, speed);
8128 return true;
8131 /* Fall through. */
8132 default:
8133 break;
8136 if (dump_file
8137 && flag_aarch64_verbose_cost)
8138 fprintf (dump_file,
8139 "\nFailed to cost RTX. Assuming default cost.\n");
8141 return true;
8144 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8145 calculated for X. This cost is stored in *COST. Returns true
8146 if the total cost of X was calculated. */
8147 static bool
8148 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8149 int param, int *cost, bool speed)
8151 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8153 if (dump_file
8154 && flag_aarch64_verbose_cost)
8156 print_rtl_single (dump_file, x);
8157 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8158 speed ? "Hot" : "Cold",
8159 *cost, result ? "final" : "partial");
8162 return result;
8165 static int
8166 aarch64_register_move_cost (machine_mode mode,
8167 reg_class_t from_i, reg_class_t to_i)
8169 enum reg_class from = (enum reg_class) from_i;
8170 enum reg_class to = (enum reg_class) to_i;
8171 const struct cpu_regmove_cost *regmove_cost
8172 = aarch64_tune_params.regmove_cost;
8174 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8175 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8176 to = GENERAL_REGS;
8178 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8179 from = GENERAL_REGS;
8181 /* Moving between GPR and stack cost is the same as GP2GP. */
8182 if ((from == GENERAL_REGS && to == STACK_REG)
8183 || (to == GENERAL_REGS && from == STACK_REG))
8184 return regmove_cost->GP2GP;
8186 /* To/From the stack register, we move via the gprs. */
8187 if (to == STACK_REG || from == STACK_REG)
8188 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8189 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8191 if (GET_MODE_SIZE (mode) == 16)
8193 /* 128-bit operations on general registers require 2 instructions. */
8194 if (from == GENERAL_REGS && to == GENERAL_REGS)
8195 return regmove_cost->GP2GP * 2;
8196 else if (from == GENERAL_REGS)
8197 return regmove_cost->GP2FP * 2;
8198 else if (to == GENERAL_REGS)
8199 return regmove_cost->FP2GP * 2;
8201 /* When AdvSIMD instructions are disabled it is not possible to move
8202 a 128-bit value directly between Q registers. This is handled in
8203 secondary reload. A general register is used as a scratch to move
8204 the upper DI value and the lower DI value is moved directly,
8205 hence the cost is the sum of three moves. */
8206 if (! TARGET_SIMD)
8207 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8209 return regmove_cost->FP2FP;
8212 if (from == GENERAL_REGS && to == GENERAL_REGS)
8213 return regmove_cost->GP2GP;
8214 else if (from == GENERAL_REGS)
8215 return regmove_cost->GP2FP;
8216 else if (to == GENERAL_REGS)
8217 return regmove_cost->FP2GP;
8219 return regmove_cost->FP2FP;
8222 static int
8223 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8224 reg_class_t rclass ATTRIBUTE_UNUSED,
8225 bool in ATTRIBUTE_UNUSED)
8227 return aarch64_tune_params.memmov_cost;
8230 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8231 to optimize 1.0/sqrt. */
8233 static bool
8234 use_rsqrt_p (machine_mode mode)
8236 return (!flag_trapping_math
8237 && flag_unsafe_math_optimizations
8238 && ((aarch64_tune_params.approx_modes->recip_sqrt
8239 & AARCH64_APPROX_MODE (mode))
8240 || flag_mrecip_low_precision_sqrt));
8243 /* Function to decide when to use the approximate reciprocal square root
8244 builtin. */
8246 static tree
8247 aarch64_builtin_reciprocal (tree fndecl)
8249 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8251 if (!use_rsqrt_p (mode))
8252 return NULL_TREE;
8253 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8256 typedef rtx (*rsqrte_type) (rtx, rtx);
8258 /* Select reciprocal square root initial estimate insn depending on machine
8259 mode. */
8261 static rsqrte_type
8262 get_rsqrte_type (machine_mode mode)
8264 switch (mode)
8266 case E_DFmode: return gen_aarch64_rsqrtedf;
8267 case E_SFmode: return gen_aarch64_rsqrtesf;
8268 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8269 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8270 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8271 default: gcc_unreachable ();
8275 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8277 /* Select reciprocal square root series step insn depending on machine mode. */
8279 static rsqrts_type
8280 get_rsqrts_type (machine_mode mode)
8282 switch (mode)
8284 case E_DFmode: return gen_aarch64_rsqrtsdf;
8285 case E_SFmode: return gen_aarch64_rsqrtssf;
8286 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8287 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8288 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8289 default: gcc_unreachable ();
8293 /* Emit instruction sequence to compute either the approximate square root
8294 or its approximate reciprocal, depending on the flag RECP, and return
8295 whether the sequence was emitted or not. */
8297 bool
8298 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8300 machine_mode mode = GET_MODE (dst);
8302 if (GET_MODE_INNER (mode) == HFmode)
8304 gcc_assert (!recp);
8305 return false;
8308 if (!recp)
8310 if (!(flag_mlow_precision_sqrt
8311 || (aarch64_tune_params.approx_modes->sqrt
8312 & AARCH64_APPROX_MODE (mode))))
8313 return false;
8315 if (flag_finite_math_only
8316 || flag_trapping_math
8317 || !flag_unsafe_math_optimizations
8318 || optimize_function_for_size_p (cfun))
8319 return false;
8321 else
8322 /* Caller assumes we cannot fail. */
8323 gcc_assert (use_rsqrt_p (mode));
8325 machine_mode mmsk = mode_for_int_vector (mode).require ();
8326 rtx xmsk = gen_reg_rtx (mmsk);
8327 if (!recp)
8328 /* When calculating the approximate square root, compare the
8329 argument with 0.0 and create a mask. */
8330 emit_insn (gen_rtx_SET (xmsk,
8331 gen_rtx_NEG (mmsk,
8332 gen_rtx_EQ (mmsk, src,
8333 CONST0_RTX (mode)))));
8335 /* Estimate the approximate reciprocal square root. */
8336 rtx xdst = gen_reg_rtx (mode);
8337 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8339 /* Iterate over the series twice for SF and thrice for DF. */
8340 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8342 /* Optionally iterate over the series once less for faster performance
8343 while sacrificing the accuracy. */
8344 if ((recp && flag_mrecip_low_precision_sqrt)
8345 || (!recp && flag_mlow_precision_sqrt))
8346 iterations--;
8348 /* Iterate over the series to calculate the approximate reciprocal square
8349 root. */
8350 rtx x1 = gen_reg_rtx (mode);
8351 while (iterations--)
8353 rtx x2 = gen_reg_rtx (mode);
8354 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8356 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8358 if (iterations > 0)
8359 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8362 if (!recp)
8364 /* Qualify the approximate reciprocal square root when the argument is
8365 0.0 by squashing the intermediary result to 0.0. */
8366 rtx xtmp = gen_reg_rtx (mmsk);
8367 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8368 gen_rtx_SUBREG (mmsk, xdst, 0)));
8369 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8371 /* Calculate the approximate square root. */
8372 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8375 /* Finalize the approximation. */
8376 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8378 return true;
8381 typedef rtx (*recpe_type) (rtx, rtx);
8383 /* Select reciprocal initial estimate insn depending on machine mode. */
8385 static recpe_type
8386 get_recpe_type (machine_mode mode)
8388 switch (mode)
8390 case E_SFmode: return (gen_aarch64_frecpesf);
8391 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8392 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8393 case E_DFmode: return (gen_aarch64_frecpedf);
8394 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8395 default: gcc_unreachable ();
8399 typedef rtx (*recps_type) (rtx, rtx, rtx);
8401 /* Select reciprocal series step insn depending on machine mode. */
8403 static recps_type
8404 get_recps_type (machine_mode mode)
8406 switch (mode)
8408 case E_SFmode: return (gen_aarch64_frecpssf);
8409 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8410 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8411 case E_DFmode: return (gen_aarch64_frecpsdf);
8412 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8413 default: gcc_unreachable ();
8417 /* Emit the instruction sequence to compute the approximation for the division
8418 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8420 bool
8421 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8423 machine_mode mode = GET_MODE (quo);
8425 if (GET_MODE_INNER (mode) == HFmode)
8426 return false;
8428 bool use_approx_division_p = (flag_mlow_precision_div
8429 || (aarch64_tune_params.approx_modes->division
8430 & AARCH64_APPROX_MODE (mode)));
8432 if (!flag_finite_math_only
8433 || flag_trapping_math
8434 || !flag_unsafe_math_optimizations
8435 || optimize_function_for_size_p (cfun)
8436 || !use_approx_division_p)
8437 return false;
8439 /* Estimate the approximate reciprocal. */
8440 rtx xrcp = gen_reg_rtx (mode);
8441 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8443 /* Iterate over the series twice for SF and thrice for DF. */
8444 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8446 /* Optionally iterate over the series once less for faster performance,
8447 while sacrificing the accuracy. */
8448 if (flag_mlow_precision_div)
8449 iterations--;
8451 /* Iterate over the series to calculate the approximate reciprocal. */
8452 rtx xtmp = gen_reg_rtx (mode);
8453 while (iterations--)
8455 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8457 if (iterations > 0)
8458 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8461 if (num != CONST1_RTX (mode))
8463 /* As the approximate reciprocal of DEN is already calculated, only
8464 calculate the approximate division when NUM is not 1.0. */
8465 rtx xnum = force_reg (mode, num);
8466 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8469 /* Finalize the approximation. */
8470 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8471 return true;
8474 /* Return the number of instructions that can be issued per cycle. */
8475 static int
8476 aarch64_sched_issue_rate (void)
8478 return aarch64_tune_params.issue_rate;
8481 static int
8482 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8484 int issue_rate = aarch64_sched_issue_rate ();
8486 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8490 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8491 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8492 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8494 static int
8495 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8496 int ready_index)
8498 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8502 /* Vectorizer cost model target hooks. */
8504 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8505 static int
8506 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8507 tree vectype,
8508 int misalign ATTRIBUTE_UNUSED)
8510 unsigned elements;
8511 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8512 bool fp = false;
8514 if (vectype != NULL)
8515 fp = FLOAT_TYPE_P (vectype);
8517 switch (type_of_cost)
8519 case scalar_stmt:
8520 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8522 case scalar_load:
8523 return costs->scalar_load_cost;
8525 case scalar_store:
8526 return costs->scalar_store_cost;
8528 case vector_stmt:
8529 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8531 case vector_load:
8532 return costs->vec_align_load_cost;
8534 case vector_store:
8535 return costs->vec_store_cost;
8537 case vec_to_scalar:
8538 return costs->vec_to_scalar_cost;
8540 case scalar_to_vec:
8541 return costs->scalar_to_vec_cost;
8543 case unaligned_load:
8544 return costs->vec_unalign_load_cost;
8546 case unaligned_store:
8547 return costs->vec_unalign_store_cost;
8549 case cond_branch_taken:
8550 return costs->cond_taken_branch_cost;
8552 case cond_branch_not_taken:
8553 return costs->cond_not_taken_branch_cost;
8555 case vec_perm:
8556 return costs->vec_permute_cost;
8558 case vec_promote_demote:
8559 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8561 case vec_construct:
8562 elements = TYPE_VECTOR_SUBPARTS (vectype);
8563 return elements / 2 + 1;
8565 default:
8566 gcc_unreachable ();
8570 /* Implement targetm.vectorize.add_stmt_cost. */
8571 static unsigned
8572 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8573 struct _stmt_vec_info *stmt_info, int misalign,
8574 enum vect_cost_model_location where)
8576 unsigned *cost = (unsigned *) data;
8577 unsigned retval = 0;
8579 if (flag_vect_cost_model)
8581 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8582 int stmt_cost =
8583 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8585 /* Statements in an inner loop relative to the loop being
8586 vectorized are weighted more heavily. The value here is
8587 arbitrary and could potentially be improved with analysis. */
8588 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8589 count *= 50; /* FIXME */
8591 retval = (unsigned) (count * stmt_cost);
8592 cost[where] += retval;
8595 return retval;
8598 static void initialize_aarch64_code_model (struct gcc_options *);
8600 /* Parse the TO_PARSE string and put the architecture struct that it
8601 selects into RES and the architectural features into ISA_FLAGS.
8602 Return an aarch64_parse_opt_result describing the parse result.
8603 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8605 static enum aarch64_parse_opt_result
8606 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8607 unsigned long *isa_flags)
8609 char *ext;
8610 const struct processor *arch;
8611 char *str = (char *) alloca (strlen (to_parse) + 1);
8612 size_t len;
8614 strcpy (str, to_parse);
8616 ext = strchr (str, '+');
8618 if (ext != NULL)
8619 len = ext - str;
8620 else
8621 len = strlen (str);
8623 if (len == 0)
8624 return AARCH64_PARSE_MISSING_ARG;
8627 /* Loop through the list of supported ARCHes to find a match. */
8628 for (arch = all_architectures; arch->name != NULL; arch++)
8630 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8632 unsigned long isa_temp = arch->flags;
8634 if (ext != NULL)
8636 /* TO_PARSE string contains at least one extension. */
8637 enum aarch64_parse_opt_result ext_res
8638 = aarch64_parse_extension (ext, &isa_temp);
8640 if (ext_res != AARCH64_PARSE_OK)
8641 return ext_res;
8643 /* Extension parsing was successful. Confirm the result
8644 arch and ISA flags. */
8645 *res = arch;
8646 *isa_flags = isa_temp;
8647 return AARCH64_PARSE_OK;
8651 /* ARCH name not found in list. */
8652 return AARCH64_PARSE_INVALID_ARG;
8655 /* Parse the TO_PARSE string and put the result tuning in RES and the
8656 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8657 describing the parse result. If there is an error parsing, RES and
8658 ISA_FLAGS are left unchanged. */
8660 static enum aarch64_parse_opt_result
8661 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8662 unsigned long *isa_flags)
8664 char *ext;
8665 const struct processor *cpu;
8666 char *str = (char *) alloca (strlen (to_parse) + 1);
8667 size_t len;
8669 strcpy (str, to_parse);
8671 ext = strchr (str, '+');
8673 if (ext != NULL)
8674 len = ext - str;
8675 else
8676 len = strlen (str);
8678 if (len == 0)
8679 return AARCH64_PARSE_MISSING_ARG;
8682 /* Loop through the list of supported CPUs to find a match. */
8683 for (cpu = all_cores; cpu->name != NULL; cpu++)
8685 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8687 unsigned long isa_temp = cpu->flags;
8690 if (ext != NULL)
8692 /* TO_PARSE string contains at least one extension. */
8693 enum aarch64_parse_opt_result ext_res
8694 = aarch64_parse_extension (ext, &isa_temp);
8696 if (ext_res != AARCH64_PARSE_OK)
8697 return ext_res;
8699 /* Extension parsing was successfull. Confirm the result
8700 cpu and ISA flags. */
8701 *res = cpu;
8702 *isa_flags = isa_temp;
8703 return AARCH64_PARSE_OK;
8707 /* CPU name not found in list. */
8708 return AARCH64_PARSE_INVALID_ARG;
8711 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8712 Return an aarch64_parse_opt_result describing the parse result.
8713 If the parsing fails the RES does not change. */
8715 static enum aarch64_parse_opt_result
8716 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8718 const struct processor *cpu;
8719 char *str = (char *) alloca (strlen (to_parse) + 1);
8721 strcpy (str, to_parse);
8723 /* Loop through the list of supported CPUs to find a match. */
8724 for (cpu = all_cores; cpu->name != NULL; cpu++)
8726 if (strcmp (cpu->name, str) == 0)
8728 *res = cpu;
8729 return AARCH64_PARSE_OK;
8733 /* CPU name not found in list. */
8734 return AARCH64_PARSE_INVALID_ARG;
8737 /* Parse TOKEN, which has length LENGTH to see if it is an option
8738 described in FLAG. If it is, return the index bit for that fusion type.
8739 If not, error (printing OPTION_NAME) and return zero. */
8741 static unsigned int
8742 aarch64_parse_one_option_token (const char *token,
8743 size_t length,
8744 const struct aarch64_flag_desc *flag,
8745 const char *option_name)
8747 for (; flag->name != NULL; flag++)
8749 if (length == strlen (flag->name)
8750 && !strncmp (flag->name, token, length))
8751 return flag->flag;
8754 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8755 return 0;
8758 /* Parse OPTION which is a comma-separated list of flags to enable.
8759 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8760 default state we inherit from the CPU tuning structures. OPTION_NAME
8761 gives the top-level option we are parsing in the -moverride string,
8762 for use in error messages. */
8764 static unsigned int
8765 aarch64_parse_boolean_options (const char *option,
8766 const struct aarch64_flag_desc *flags,
8767 unsigned int initial_state,
8768 const char *option_name)
8770 const char separator = '.';
8771 const char* specs = option;
8772 const char* ntoken = option;
8773 unsigned int found_flags = initial_state;
8775 while ((ntoken = strchr (specs, separator)))
8777 size_t token_length = ntoken - specs;
8778 unsigned token_ops = aarch64_parse_one_option_token (specs,
8779 token_length,
8780 flags,
8781 option_name);
8782 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8783 in the token stream, reset the supported operations. So:
8785 adrp+add.cmp+branch.none.adrp+add
8787 would have the result of turning on only adrp+add fusion. */
8788 if (!token_ops)
8789 found_flags = 0;
8791 found_flags |= token_ops;
8792 specs = ++ntoken;
8795 /* We ended with a comma, print something. */
8796 if (!(*specs))
8798 error ("%s string ill-formed\n", option_name);
8799 return 0;
8802 /* We still have one more token to parse. */
8803 size_t token_length = strlen (specs);
8804 unsigned token_ops = aarch64_parse_one_option_token (specs,
8805 token_length,
8806 flags,
8807 option_name);
8808 if (!token_ops)
8809 found_flags = 0;
8811 found_flags |= token_ops;
8812 return found_flags;
8815 /* Support for overriding instruction fusion. */
8817 static void
8818 aarch64_parse_fuse_string (const char *fuse_string,
8819 struct tune_params *tune)
8821 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8822 aarch64_fusible_pairs,
8823 tune->fusible_ops,
8824 "fuse=");
8827 /* Support for overriding other tuning flags. */
8829 static void
8830 aarch64_parse_tune_string (const char *tune_string,
8831 struct tune_params *tune)
8833 tune->extra_tuning_flags
8834 = aarch64_parse_boolean_options (tune_string,
8835 aarch64_tuning_flags,
8836 tune->extra_tuning_flags,
8837 "tune=");
8840 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8841 we understand. If it is, extract the option string and handoff to
8842 the appropriate function. */
8844 void
8845 aarch64_parse_one_override_token (const char* token,
8846 size_t length,
8847 struct tune_params *tune)
8849 const struct aarch64_tuning_override_function *fn
8850 = aarch64_tuning_override_functions;
8852 const char *option_part = strchr (token, '=');
8853 if (!option_part)
8855 error ("tuning string missing in option (%s)", token);
8856 return;
8859 /* Get the length of the option name. */
8860 length = option_part - token;
8861 /* Skip the '=' to get to the option string. */
8862 option_part++;
8864 for (; fn->name != NULL; fn++)
8866 if (!strncmp (fn->name, token, length))
8868 fn->parse_override (option_part, tune);
8869 return;
8873 error ("unknown tuning option (%s)",token);
8874 return;
8877 /* A checking mechanism for the implementation of the tls size. */
8879 static void
8880 initialize_aarch64_tls_size (struct gcc_options *opts)
8882 if (aarch64_tls_size == 0)
8883 aarch64_tls_size = 24;
8885 switch (opts->x_aarch64_cmodel_var)
8887 case AARCH64_CMODEL_TINY:
8888 /* Both the default and maximum TLS size allowed under tiny is 1M which
8889 needs two instructions to address, so we clamp the size to 24. */
8890 if (aarch64_tls_size > 24)
8891 aarch64_tls_size = 24;
8892 break;
8893 case AARCH64_CMODEL_SMALL:
8894 /* The maximum TLS size allowed under small is 4G. */
8895 if (aarch64_tls_size > 32)
8896 aarch64_tls_size = 32;
8897 break;
8898 case AARCH64_CMODEL_LARGE:
8899 /* The maximum TLS size allowed under large is 16E.
8900 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8901 if (aarch64_tls_size > 48)
8902 aarch64_tls_size = 48;
8903 break;
8904 default:
8905 gcc_unreachable ();
8908 return;
8911 /* Parse STRING looking for options in the format:
8912 string :: option:string
8913 option :: name=substring
8914 name :: {a-z}
8915 substring :: defined by option. */
8917 static void
8918 aarch64_parse_override_string (const char* input_string,
8919 struct tune_params* tune)
8921 const char separator = ':';
8922 size_t string_length = strlen (input_string) + 1;
8923 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8924 char *string = string_root;
8925 strncpy (string, input_string, string_length);
8926 string[string_length - 1] = '\0';
8928 char* ntoken = string;
8930 while ((ntoken = strchr (string, separator)))
8932 size_t token_length = ntoken - string;
8933 /* Make this substring look like a string. */
8934 *ntoken = '\0';
8935 aarch64_parse_one_override_token (string, token_length, tune);
8936 string = ++ntoken;
8939 /* One last option to parse. */
8940 aarch64_parse_one_override_token (string, strlen (string), tune);
8941 free (string_root);
8945 static void
8946 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8948 /* The logic here is that if we are disabling all frame pointer generation
8949 then we do not need to disable leaf frame pointer generation as a
8950 separate operation. But if we are *only* disabling leaf frame pointer
8951 generation then we set flag_omit_frame_pointer to true, but in
8952 aarch64_frame_pointer_required we return false only for leaf functions.
8954 PR 70044: We have to be careful about being called multiple times for the
8955 same function. Once we have decided to set flag_omit_frame_pointer just
8956 so that we can omit leaf frame pointers, we must then not interpret a
8957 second call as meaning that all frame pointer generation should be
8958 omitted. We do this by setting flag_omit_frame_pointer to a special,
8959 non-zero value. */
8960 if (opts->x_flag_omit_frame_pointer == 2)
8961 opts->x_flag_omit_frame_pointer = 0;
8963 if (opts->x_flag_omit_frame_pointer)
8964 opts->x_flag_omit_leaf_frame_pointer = false;
8965 else if (opts->x_flag_omit_leaf_frame_pointer)
8966 opts->x_flag_omit_frame_pointer = 2;
8968 /* If not optimizing for size, set the default
8969 alignment to what the target wants. */
8970 if (!opts->x_optimize_size)
8972 if (opts->x_align_loops <= 0)
8973 opts->x_align_loops = aarch64_tune_params.loop_align;
8974 if (opts->x_align_jumps <= 0)
8975 opts->x_align_jumps = aarch64_tune_params.jump_align;
8976 if (opts->x_align_functions <= 0)
8977 opts->x_align_functions = aarch64_tune_params.function_align;
8980 /* We default to no pc-relative literal loads. */
8982 aarch64_pcrelative_literal_loads = false;
8984 /* If -mpc-relative-literal-loads is set on the command line, this
8985 implies that the user asked for PC relative literal loads. */
8986 if (opts->x_pcrelative_literal_loads == 1)
8987 aarch64_pcrelative_literal_loads = true;
8989 /* In the tiny memory model it makes no sense to disallow PC relative
8990 literal pool loads. */
8991 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8992 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8993 aarch64_pcrelative_literal_loads = true;
8995 /* When enabling the lower precision Newton series for the square root, also
8996 enable it for the reciprocal square root, since the latter is an
8997 intermediary step for the former. */
8998 if (flag_mlow_precision_sqrt)
8999 flag_mrecip_low_precision_sqrt = true;
9002 /* 'Unpack' the internal tuning structs and update the options
9003 in OPTS. The caller must have set up selected_tune and selected_arch
9004 as all the other target-specific codegen decisions are
9005 derived from them. */
/* Also called when processing target attributes/pragmas, so it must be
   safe to run more than once per compilation.  */
9007 void
9008 aarch64_override_options_internal (struct gcc_options *opts)
9010 aarch64_tune_flags = selected_tune->flags;
9011 aarch64_tune = selected_tune->sched_core;
9012 /* Make a copy of the tuning parameters attached to the core, which
9013 we may later overwrite. */
9014 aarch64_tune_params = *(selected_tune->tune);
9015 aarch64_architecture_version = selected_arch->architecture_version;
/* -moverride=... tweaks individual fields of the copied tuning struct.  */
9017 if (opts->x_aarch64_override_tune_string)
9018 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9019 &aarch64_tune_params);
9021 /* This target defaults to strict volatile bitfields. */
9022 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9023 opts->x_flag_strict_volatile_bitfields = 1;
9025 initialize_aarch64_code_model (opts);
9026 initialize_aarch64_tls_size (opts);
/* Map the tuning struct's autoprefetcher model onto the scheduler's
   autopref queue depth param: -1 disables, 0 is the weak model and a
   full queue enables the strong model.  */
9028 int queue_depth = 0;
9029 switch (aarch64_tune_params.autoprefetcher_model)
9031 case tune_params::AUTOPREFETCHER_OFF:
9032 queue_depth = -1;
9033 break;
9034 case tune_params::AUTOPREFETCHER_WEAK:
9035 queue_depth = 0;
9036 break;
9037 case tune_params::AUTOPREFETCHER_STRONG:
9038 queue_depth = max_insn_queue_index + 1;
9039 break;
9040 default:
9041 gcc_unreachable ();
9044 /* We don't mind passing in global_options_set here as we don't use
9045 the *options_set structs anyway. */
9046 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9047 queue_depth,
9048 opts->x_param_values,
9049 global_options_set.x_param_values);
9051 /* Set up parameters to be used in prefetching algorithm. Do not
9052 override the defaults unless we are tuning for a core we have
9053 researched values for. */
/* NOTE(review): aarch64_tune_params.prefetch is dereferenced
   unconditionally below — presumably every tuning struct supplies a
   non-NULL prefetch table; confirm against the tuning definitions.  */
9054 if (aarch64_tune_params.prefetch->num_slots > 0)
9055 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9056 aarch64_tune_params.prefetch->num_slots,
9057 opts->x_param_values,
9058 global_options_set.x_param_values);
9059 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9060 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9061 aarch64_tune_params.prefetch->l1_cache_size,
9062 opts->x_param_values,
9063 global_options_set.x_param_values);
9064 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9065 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9066 aarch64_tune_params.prefetch->l1_cache_line_size,
9067 opts->x_param_values,
9068 global_options_set.x_param_values);
9069 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9070 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9071 aarch64_tune_params.prefetch->l2_cache_size,
9072 opts->x_param_values,
9073 global_options_set.x_param_values);
9075 /* Enable sw prefetching at specified optimization level for
9076 CPUS that have prefetch. Lower optimization level threshold by 1
9077 when profiling is enabled. */
9078 if (opts->x_flag_prefetch_loop_arrays < 0
9079 && !opts->x_optimize_size
9080 && aarch64_tune_params.prefetch->default_opt_level >= 0
9081 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9082 opts->x_flag_prefetch_loop_arrays = 1;
9084 aarch64_override_options_after_change_1 (opts);
9087 /* Print a hint with a suggestion for a core or architecture name that
9088 most closely resembles what the user passed in STR. ARCH is true if
9089 the user is asking for an architecture name. ARCH is false if the user
9090 is asking for a core name. */
9092 static void
9093 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
/* Collect every known name from the appropriate table; the tables are
   terminated by an entry with a NULL name.  */
9095 auto_vec<const char *> candidates;
9096 const struct processor *entry = arch ? all_architectures : all_cores;
9097 for (; entry->name != NULL; entry++)
9098 candidates.safe_push (entry->name);
9099 char *s;
9100 const char *hint = candidates_list_and_hint (str, s, candidates);
9101 if (hint)
9102 inform (input_location, "valid arguments are: %s;"
9103 " did you mean %qs?", s, hint);
/* S is allocated by candidates_list_and_hint; release it whether or
   not a close match was found.  */
9104 XDELETEVEC (s);
9107 /* Print a hint with a suggestion for a core name that most closely resembles
9108 what the user passed in STR. */
/* Thin wrapper over aarch64_print_hint_for_core_or_arch with ARCH false.  */
9110 inline static void
9111 aarch64_print_hint_for_core (const char *str)
9113 aarch64_print_hint_for_core_or_arch (str, false);
9116 /* Print a hint with a suggestion for an architecture name that most closely
9117 resembles what the user passed in STR. */
/* Thin wrapper over aarch64_print_hint_for_core_or_arch with ARCH true.  */
9119 inline static void
9120 aarch64_print_hint_for_arch (const char *str)
9122 aarch64_print_hint_for_core_or_arch (str, true);
9125 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9126 specified in STR and throw errors if appropriate. Put the results if
9127 they are valid in RES and ISA_FLAGS. Return whether the option is
9128 valid. */
9130 static bool
9131 aarch64_validate_mcpu (const char *str, const struct processor **res,
9132 unsigned long *isa_flags)
9134 enum aarch64_parse_opt_result parse_res
9135 = aarch64_parse_cpu (str, res, isa_flags);
9137 if (parse_res == AARCH64_PARSE_OK)
9138 return true;
/* Parsing failed: diagnose the specific failure mode.  For an unknown
   cpu name also print a spelling hint.  */
9140 switch (parse_res)
9142 case AARCH64_PARSE_MISSING_ARG:
9143 error ("missing cpu name in %<-mcpu=%s%>", str);
9144 break;
9145 case AARCH64_PARSE_INVALID_ARG:
9146 error ("unknown value %qs for -mcpu", str);
9147 aarch64_print_hint_for_core (str);
9148 break;
9149 case AARCH64_PARSE_INVALID_FEATURE:
9150 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9151 break;
9152 default:
9153 gcc_unreachable ();
9156 return false;
9159 /* Validate a command-line -march option. Parse the arch and extensions
9160 (if any) specified in STR and throw errors if appropriate. Put the
9161 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9162 option is valid. */
/* Mirrors aarch64_validate_mcpu, but for architecture names.  */
9164 static bool
9165 aarch64_validate_march (const char *str, const struct processor **res,
9166 unsigned long *isa_flags)
9168 enum aarch64_parse_opt_result parse_res
9169 = aarch64_parse_arch (str, res, isa_flags);
9171 if (parse_res == AARCH64_PARSE_OK)
9172 return true;
9174 switch (parse_res)
9176 case AARCH64_PARSE_MISSING_ARG:
9177 error ("missing arch name in %<-march=%s%>", str);
9178 break;
9179 case AARCH64_PARSE_INVALID_ARG:
9180 error ("unknown value %qs for -march", str);
9181 aarch64_print_hint_for_arch (str);
9182 break;
9183 case AARCH64_PARSE_INVALID_FEATURE:
9184 error ("invalid feature modifier in %<-march=%s%>", str);
9185 break;
9186 default:
9187 gcc_unreachable ();
9190 return false;
9193 /* Validate a command-line -mtune option. Parse the cpu
9194 specified in STR and throw errors if appropriate. Put the
9195 result, if it is valid, in RES. Return whether the option is
9196 valid. */
/* Unlike -mcpu/-march, -mtune takes no feature modifiers, so there is
   no AARCH64_PARSE_INVALID_FEATURE case here.  */
9198 static bool
9199 aarch64_validate_mtune (const char *str, const struct processor **res)
9201 enum aarch64_parse_opt_result parse_res
9202 = aarch64_parse_tune (str, res);
9204 if (parse_res == AARCH64_PARSE_OK)
9205 return true;
9207 switch (parse_res)
9209 case AARCH64_PARSE_MISSING_ARG:
9210 error ("missing cpu name in %<-mtune=%s%>", str);
9211 break;
9212 case AARCH64_PARSE_INVALID_ARG:
9213 error ("unknown value %qs for -mtune", str);
9214 aarch64_print_hint_for_core (str);
9215 break;
9216 default:
9217 gcc_unreachable ();
9219 return false;
9222 /* Return the CPU corresponding to the enum CPU.
9223 If it doesn't specify a cpu, return the default. */
9225 static const struct processor *
9226 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9228 if (cpu != aarch64_none)
9229 return &all_cores[cpu];
9231 /* The & 0x3f is to extract the bottom 6 bits that encode the
9232 default cpu as selected by the --with-cpu GCC configure option
9233 in config.gcc.
9234 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9235 flags mechanism should be reworked to make it more sane. */
9236 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9239 /* Return the architecture corresponding to the enum ARCH.
9240 If it doesn't specify a valid architecture, return the default. */
9242 static const struct processor *
9243 aarch64_get_arch (enum aarch64_arch arch)
9245 if (arch != aarch64_no_arch)
9246 return &all_architectures[arch];
/* Fall back to the architecture of the configure-time default CPU
   (bottom 6 bits of TARGET_CPU_DEFAULT — see aarch64_get_tune_cpu).  */
9248 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9250 return &all_architectures[cpu->arch];
9253 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9254 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9255 tuning structs. In particular it must set selected_tune and
9256 aarch64_isa_flags that define the available ISA features and tuning
9257 decisions. It must also set selected_arch as this will be used to
9258 output the .arch asm tags for each function. */
9260 static void
9261 aarch64_override_options (void)
9263 unsigned long cpu_isa = 0;
9264 unsigned long arch_isa = 0;
9265 aarch64_isa_flags = 0;
9267 bool valid_cpu = true;
9268 bool valid_tune = true;
9269 bool valid_arch = true;
9271 selected_cpu = NULL;
9272 selected_arch = NULL;
9273 selected_tune = NULL;
9275 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9276 If either of -march or -mtune is given, they override their
9277 respective component of -mcpu. */
9278 if (aarch64_cpu_string)
9279 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9280 &cpu_isa);
9282 if (aarch64_arch_string)
9283 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9284 &arch_isa);
9286 if (aarch64_tune_string)
9287 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9289 /* If the user did not specify a processor, choose the default
9290 one for them. This will be the CPU set during configuration using
9291 --with-cpu, otherwise it is "generic". */
9292 if (!selected_cpu)
9294 if (selected_arch)
9296 selected_cpu = &all_cores[selected_arch->ident];
9297 aarch64_isa_flags = arch_isa;
9298 explicit_arch = selected_arch->arch;
9300 else
9302 /* Get default configure-time CPU. */
9303 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
/* The upper bits of TARGET_CPU_DEFAULT carry the default ISA flags;
   the low 6 bits encode the CPU itself (see aarch64_get_tune_cpu).  */
9304 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9307 if (selected_tune)
9308 explicit_tune_core = selected_tune->ident;
9310 /* If both -mcpu and -march are specified check that they are architecturally
9311 compatible, warn if they're not and prefer the -march ISA flags. */
9312 else if (selected_arch)
9314 if (selected_arch->arch != selected_cpu->arch)
9316 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9317 all_architectures[selected_cpu->arch].name,
9318 selected_arch->name);
9320 aarch64_isa_flags = arch_isa;
9321 explicit_arch = selected_arch->arch;
9322 explicit_tune_core = selected_tune ? selected_tune->ident
9323 : selected_cpu->ident;
9325 else
9327 /* -mcpu but no -march. */
9328 aarch64_isa_flags = cpu_isa;
9329 explicit_tune_core = selected_tune ? selected_tune->ident
9330 : selected_cpu->ident;
9331 gcc_assert (selected_cpu);
9332 selected_arch = &all_architectures[selected_cpu->arch];
9333 explicit_arch = selected_arch->arch;
9336 /* Set the arch as well as we will need it when outputting
9337 the .arch directive in assembly. */
9338 if (!selected_arch)
9340 gcc_assert (selected_cpu);
9341 selected_arch = &all_architectures[selected_cpu->arch];
9344 if (!selected_tune)
9345 selected_tune = selected_cpu;
9347 #ifndef HAVE_AS_MABI_OPTION
9348 /* The compiler may have been configured with 2.23.* binutils, which does
9349 not have support for ILP32. */
9350 if (TARGET_ILP32)
9351 error ("Assembler does not support -mabi=ilp32");
9352 #endif
9354 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9355 sorry ("Return address signing is only supported for -mabi=lp64");
9357 /* Make sure we properly set up the explicit options. */
9358 if ((aarch64_cpu_string && valid_cpu)
9359 || (aarch64_tune_string && valid_tune))
9360 gcc_assert (explicit_tune_core != aarch64_none);
9362 if ((aarch64_cpu_string && valid_cpu)
9363 || (aarch64_arch_string && valid_arch))
9364 gcc_assert (explicit_arch != aarch64_no_arch);
9366 aarch64_override_options_internal (&global_options);
9368 /* Save these options as the default ones in case we push and pop them later
9369 while processing functions with potential target attributes. */
9370 target_option_default_node = target_option_current_node
9371 = build_target_option_node (&global_options);
9374 /* Implement targetm.override_options_after_change. */
/* Re-run only the option fix-ups that depend on optimization level;
   the full re-parse is not needed here.  */
9376 static void
9377 aarch64_override_options_after_change (void)
9379 aarch64_override_options_after_change_1 (&global_options);
/* Allocate a zero-initialized, GC-managed machine_function for the
   current function.  Installed as init_machine_status below.  */
9382 static struct machine_function *
9383 aarch64_init_machine_status (void)
9385 struct machine_function *machine;
9386 machine = ggc_cleared_alloc<machine_function> ();
9387 return machine;
/* Set up the per-function machine status allocation hook.  */
9390 void
9391 aarch64_init_expanders (void)
9393 init_machine_status = aarch64_init_machine_status;
9396 /* A checking mechanism for the implementation of the various code models. */
/* Resolve the user-requested code model in OPTS into aarch64_cmodel,
   selecting the PIC variant of the model when -fpic/-fPIC is given.  */
9397 static void
9398 initialize_aarch64_code_model (struct gcc_options *opts)
9400 if (opts->x_flag_pic)
9402 switch (opts->x_aarch64_cmodel_var)
9404 case AARCH64_CMODEL_TINY:
9405 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9406 break;
9407 case AARCH64_CMODEL_SMALL:
9408 #ifdef HAVE_AS_SMALL_PIC_RELOCS
/* flag_pic == 2 means -fPIC (large GOT); otherwise -fpic maps to the
   small-GOT "SPIC" variant when the assembler supports its relocs.  */
9409 aarch64_cmodel = (flag_pic == 2
9410 ? AARCH64_CMODEL_SMALL_PIC
9411 : AARCH64_CMODEL_SMALL_SPIC);
9412 #else
9413 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9414 #endif
9415 break;
9416 case AARCH64_CMODEL_LARGE:
/* There is no PIC variant of the large model.  */
9417 sorry ("code model %qs with -f%s", "large",
9418 opts->x_flag_pic > 1 ? "PIC" : "pic");
9419 break;
9420 default:
9421 gcc_unreachable ();
9424 else
9425 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9428 /* Implement TARGET_OPTION_SAVE. */
/* Only the -moverride string needs explicit saving here; the other
   target state is handled by the generic option-save machinery.  */
9430 static void
9431 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9433 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9436 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9437 using the information saved in PTR. */
9439 static void
9440 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9442 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9443 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9444 opts->x_explicit_arch = ptr->x_explicit_arch;
9445 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9446 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
/* Re-derive all the dependent tuning state from the restored values.  */
9448 aarch64_override_options_internal (opts);
9451 /* Implement TARGET_OPTION_PRINT. */
/* Dump the selected tune core and the arch (with its "+feature" extension
   string) to FILE, indented by INDENT spaces.  */
9453 static void
9454 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9456 const struct processor *cpu
9457 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core)
9458 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9459 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9460 std::string extension
9461 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9463 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9464 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9465 arch->name, extension.c_str ());
/* Cache of the last FNDECL processed by aarch64_set_current_function,
   used to skip redundant target-state switches.  GTY so the GC sees it.  */
9468 static GTY(()) tree aarch64_previous_fndecl;
/* Invalidate the cached fndecl, forcing the next call to
   aarch64_set_current_function to redo its work.  */
9470 void
9471 aarch64_reset_previous_fndecl (void)
9473 aarch64_previous_fndecl = NULL;
9476 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9477 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9478 make sure optab availability predicates are recomputed when necessary. */
9480 void
9481 aarch64_save_restore_target_globals (tree new_tree)
/* Prefer globals already cached on the node; fall back to the default
   globals for the default option node; otherwise build and cache a
   fresh set for this option combination.  */
9483 if (TREE_TARGET_GLOBALS (new_tree))
9484 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9485 else if (new_tree == target_option_default_node)
9486 restore_target_globals (&default_target_globals);
9487 else
9488 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9491 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9492 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9493 of the function, if such exists. This function may be called multiple
9494 times on a single function so use aarch64_previous_fndecl to avoid
9495 setting up identical state. */
9497 static void
9498 aarch64_set_current_function (tree fndecl)
9500 if (!fndecl || fndecl == aarch64_previous_fndecl)
9501 return;
9503 tree old_tree = (aarch64_previous_fndecl
9504 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9505 : NULL_TREE);
9507 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9509 /* If current function has no attributes but the previous one did,
9510 use the default node. */
9511 if (!new_tree && old_tree)
9512 new_tree = target_option_default_node;
9514 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9515 the default have been handled by aarch64_save_restore_target_globals from
9516 aarch64_pragma_target_parse. */
9517 if (old_tree == new_tree)
9518 return;
9520 aarch64_previous_fndecl = fndecl;
9522 /* First set the target options. */
9523 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
/* Then switch the cached target_globals to match, rebuilding them if
   this option combination has not been seen before.  */
9525 aarch64_save_restore_target_globals (new_tree);
9528 /* Enum describing the various ways we can handle attributes.
9529 In many cases we can reuse the generic option handling machinery. */
9531 enum aarch64_attr_opt_type
9533 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9534 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9535 aarch64_attr_enum, /* Attribute sets an enum variable. */
9536 aarch64_attr_custom /* Attribute requires a custom handling function. */
9539 /* All the information needed to handle a target attribute.
9540 NAME is the name of the attribute.
9541 ATTR_TYPE specifies the type of behavior of the attribute as described
9542 in the definition of enum aarch64_attr_opt_type.
9543 ALLOW_NEG is true if the attribute supports a "no-" form.
9544 HANDLER is the function that takes the attribute string and whether
9545 it is a pragma or attribute and handles the option. It is needed only
9546 when the ATTR_TYPE is aarch64_attr_custom.
9547 OPT_NUM is the enum specifying the option that the attribute modifies.
9548 This is needed for attributes that mirror the behavior of a command-line
9549 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9550 aarch64_attr_enum. */
9552 struct aarch64_attribute_info
9554 const char *name;
9555 enum aarch64_attr_opt_type attr_type;
9556 bool allow_neg;
9557 bool (*handler) (const char *, const char *);
9558 enum opt_code opt_num;
9561 /* Handle the ARCH_STR argument to the arch= target attribute.
9562 PRAGMA_OR_ATTR is used in potential error messages. */
/* On success updates the global selected_arch/explicit_arch and, via
   aarch64_parse_arch, the global aarch64_isa_flags.  Returns false and
   diagnoses on any parse failure.  */
9564 static bool
9565 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9567 const struct processor *tmp_arch = NULL;
9568 enum aarch64_parse_opt_result parse_res
9569 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9571 if (parse_res == AARCH64_PARSE_OK)
9573 gcc_assert (tmp_arch);
9574 selected_arch = tmp_arch;
9575 explicit_arch = selected_arch->arch;
9576 return true;
9579 switch (parse_res)
9581 case AARCH64_PARSE_MISSING_ARG:
9582 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9583 break;
9584 case AARCH64_PARSE_INVALID_ARG:
9585 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9586 aarch64_print_hint_for_arch (str);
9587 break;
9588 case AARCH64_PARSE_INVALID_FEATURE:
9589 error ("invalid feature modifier %qs for 'arch' target %s",
9590 str, pragma_or_attr);
9591 break;
9592 default:
9593 gcc_unreachable ();
9596 return false;
9599 /* Handle the argument CPU_STR to the cpu= target attribute.
9600 PRAGMA_OR_ATTR is used in potential error messages. */
/* Like the command-line -mcpu, cpu= sets both the tuning core and the
   architecture (plus ISA flags via aarch64_parse_cpu).  */
9602 static bool
9603 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9605 const struct processor *tmp_cpu = NULL;
9606 enum aarch64_parse_opt_result parse_res
9607 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9609 if (parse_res == AARCH64_PARSE_OK)
9611 gcc_assert (tmp_cpu);
9612 selected_tune = tmp_cpu;
9613 explicit_tune_core = selected_tune->ident;
9615 selected_arch = &all_architectures[tmp_cpu->arch];
9616 explicit_arch = selected_arch->arch;
9617 return true;
9620 switch (parse_res)
9622 case AARCH64_PARSE_MISSING_ARG:
9623 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9624 break;
9625 case AARCH64_PARSE_INVALID_ARG:
9626 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9627 aarch64_print_hint_for_core (str);
9628 break;
9629 case AARCH64_PARSE_INVALID_FEATURE:
9630 error ("invalid feature modifier %qs for 'cpu' target %s",
9631 str, pragma_or_attr);
9632 break;
9633 default:
9634 gcc_unreachable ();
9637 return false;
9640 /* Handle the argument STR to the tune= target attribute.
9641 PRAGMA_OR_ATTR is used in potential error messages. */
/* tune= only changes the tuning core; arch and ISA flags are untouched.
   aarch64_parse_tune cannot return MISSING_ARG or INVALID_FEATURE, hence
   the single error case below.  */
9643 static bool
9644 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9646 const struct processor *tmp_tune = NULL;
9647 enum aarch64_parse_opt_result parse_res
9648 = aarch64_parse_tune (str, &tmp_tune);
9650 if (parse_res == AARCH64_PARSE_OK)
9652 gcc_assert (tmp_tune);
9653 selected_tune = tmp_tune;
9654 explicit_tune_core = selected_tune->ident;
9655 return true;
9658 switch (parse_res)
9660 case AARCH64_PARSE_INVALID_ARG:
9661 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9662 aarch64_print_hint_for_core (str);
9663 break;
9664 default:
9665 gcc_unreachable ();
9668 return false;
9671 /* Parse an architecture extensions target attribute string specified in STR.
9672 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9673 if successful. Update aarch64_isa_flags to reflect the ISA features
9674 modified.
9675 PRAGMA_OR_ATTR is used in potential error messages. */
9677 static bool
9678 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
/* Work on a local copy of the flags so aarch64_isa_flags is only
   modified once parsing has fully succeeded.  */
9680 enum aarch64_parse_opt_result parse_res;
9681 unsigned long isa_flags = aarch64_isa_flags;
9683 /* We allow "+nothing" in the beginning to clear out all architectural
9684 features if the user wants to handpick specific features. */
9685 if (strncmp ("+nothing", str, 8) == 0)
9687 isa_flags = 0;
9688 str += 8;
9691 parse_res = aarch64_parse_extension (str, &isa_flags);
9693 if (parse_res == AARCH64_PARSE_OK)
9695 aarch64_isa_flags = isa_flags;
9696 return true;
9699 switch (parse_res)
9701 case AARCH64_PARSE_MISSING_ARG:
9702 error ("missing feature modifier in target %s %qs",
9703 pragma_or_attr, str);
9704 break;
9706 case AARCH64_PARSE_INVALID_FEATURE:
9707 error ("invalid feature modifier in target %s %qs",
9708 pragma_or_attr, str);
9709 break;
9711 default:
9712 gcc_unreachable ();
9715 return false;
9718 /* The target attributes that we support. On top of these we also support just
9719 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9720 handled explicitly in aarch64_process_one_target_attr. */
/* Table terminated by a NULL-name sentinel; scanned linearly by
   aarch64_process_one_target_attr.  */
9722 static const struct aarch64_attribute_info aarch64_attributes[] =
9724 { "general-regs-only", aarch64_attr_mask, false, NULL,
9725 OPT_mgeneral_regs_only },
9726 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9727 OPT_mfix_cortex_a53_835769 },
9728 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9729 OPT_mfix_cortex_a53_843419 },
9730 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9731 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9732 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9733 OPT_momit_leaf_frame_pointer },
9734 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9735 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9736 OPT_march_ },
9737 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9738 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9739 OPT_mtune_ },
9740 { "sign-return-address", aarch64_attr_enum, false, NULL,
9741 OPT_msign_return_address_ },
9742 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9745 /* Parse ARG_STR which contains the definition of one target attribute.
9746 Show appropriate errors if any or return true if the attribute is valid.
9747 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9748 we're processing a target attribute or pragma. */
9750 static bool
9751 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9753 bool invert = false;
9755 size_t len = strlen (arg_str);
9757 if (len == 0)
9759 error ("malformed target %s", pragma_or_attr);
9760 return false;
/* Work on a stack copy so we can mutate it (NUL-terminating at '=').
   Stack allocation is fine: attribute strings are short.  */
9763 char *str_to_check = (char *) alloca (len + 1);
9764 strcpy (str_to_check, arg_str);
9766 /* Skip leading whitespace. */
9767 while (*str_to_check == ' ' || *str_to_check == '\t')
9768 str_to_check++;
9770 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9771 It is easier to detect and handle it explicitly here rather than going
9772 through the machinery for the rest of the target attributes in this
9773 function. */
9774 if (*str_to_check == '+')
9775 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
/* A "no-" prefix selects the negated form of a boolean attribute.  */
9777 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9779 invert = true;
9780 str_to_check += 3;
9782 char *arg = strchr (str_to_check, '=');
9784 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9785 and point ARG to "foo". */
9786 if (arg)
9788 *arg = '\0';
9789 arg++;
/* Linear scan of the attribute table; the table is small.  */
9791 const struct aarch64_attribute_info *p_attr;
9792 bool found = false;
9793 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9795 /* If the names don't match up, or the user has given an argument
9796 to an attribute that doesn't accept one, or didn't give an argument
9797 to an attribute that expects one, fail to match. */
9798 if (strcmp (str_to_check, p_attr->name) != 0)
9799 continue;
9801 found = true;
9802 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9803 || p_attr->attr_type == aarch64_attr_enum;
/* XOR: exactly one of "needs an argument" / "has an argument" holds.  */
9805 if (attr_need_arg_p ^ (arg != NULL))
9807 error ("target %s %qs does not accept an argument",
9808 pragma_or_attr, str_to_check);
9809 return false;
9812 /* If the name matches but the attribute does not allow "no-" versions
9813 then we can't match. */
9814 if (invert && !p_attr->allow_neg)
9816 error ("target %s %qs does not allow a negated form",
9817 pragma_or_attr, str_to_check);
9818 return false;
9821 switch (p_attr->attr_type)
9823 /* Has a custom handler registered.
9824 For example, cpu=, arch=, tune=. */
9825 case aarch64_attr_custom:
9826 gcc_assert (p_attr->handler);
9827 if (!p_attr->handler (arg, pragma_or_attr))
9828 return false;
9829 break;
9831 /* Either set or unset a boolean option. */
9832 case aarch64_attr_bool:
9834 struct cl_decoded_option decoded;
9836 generate_option (p_attr->opt_num, NULL, !invert,
9837 CL_TARGET, &decoded);
9838 aarch64_handle_option (&global_options, &global_options_set,
9839 &decoded, input_location);
9840 break;
9842 /* Set or unset a bit in the target_flags. aarch64_handle_option
9843 should know what mask to apply given the option number. */
9844 case aarch64_attr_mask:
9846 struct cl_decoded_option decoded;
9847 /* We only need to specify the option number.
9848 aarch64_handle_option will know which mask to apply. */
9849 decoded.opt_index = p_attr->opt_num;
9850 decoded.value = !invert;
9851 aarch64_handle_option (&global_options, &global_options_set,
9852 &decoded, input_location);
9853 break;
9855 /* Use the option setting machinery to set an option to an enum. */
9856 case aarch64_attr_enum:
9858 gcc_assert (arg);
9859 bool valid;
9860 int value;
9861 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9862 &value, CL_TARGET);
9863 if (valid)
9865 set_option (&global_options, NULL, p_attr->opt_num, value,
9866 NULL, DK_UNSPECIFIED, input_location,
9867 global_dc);
9869 else
9871 error ("target %s %s=%s is not valid",
9872 pragma_or_attr, str_to_check, arg);
9874 break;
9876 default:
9877 gcc_unreachable ();
9881 /* If we reached here we either have found an attribute and validated
9882 it or didn't match any. If we matched an attribute but its arguments
9883 were malformed we will have returned false already. */
9884 return found;
/* Count how many times the character C appears in
   NUL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int count = 0;
  for (; *str != '\0'; str++)
    if (*str == c)
      count++;
  return count;
}
9905 /* Parse the tree in ARGS that contains the target attribute information
9906 and update the global target options space. PRAGMA_OR_ATTR is a string
9907 to be used in error messages, specifying whether this is processing
9908 a target attribute or a target pragma. */
9910 bool
9911 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
/* A TREE_LIST form (from a pragma) is processed recursively, one
   string per list element.  */
9913 if (TREE_CODE (args) == TREE_LIST)
9917 tree head = TREE_VALUE (args);
9918 if (head)
9920 if (!aarch64_process_target_attr (head, pragma_or_attr))
9921 return false;
9923 args = TREE_CHAIN (args);
9924 } while (args);
9926 return true;
9929 if (TREE_CODE (args) != STRING_CST)
9931 error ("attribute %<target%> argument not a string");
9932 return false;
/* Copy to a mutable stack buffer since strtok modifies its input.  */
9935 size_t len = strlen (TREE_STRING_POINTER (args));
9936 char *str_to_check = (char *) alloca (len + 1);
9937 strcpy (str_to_check, TREE_STRING_POINTER (args));
9939 if (len == 0)
9941 error ("malformed target %s value", pragma_or_attr);
9942 return false;
9945 /* Used to catch empty spaces between commas i.e.
9946 attribute ((target ("attr1,,attr2"))). */
9947 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9949 /* Handle multiple target attributes separated by ','. */
/* NOTE(review): strtok keeps static state and is not reentrant; this
   presumably relies on option processing being single-threaded with no
   concurrent strtok user — confirm before reusing this pattern.  */
9950 char *token = strtok (str_to_check, ",");
9952 unsigned int num_attrs = 0;
9953 while (token)
9955 num_attrs++;
9956 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9958 error ("target %s %qs is invalid", pragma_or_attr, token);
9959 return false;
9962 token = strtok (NULL, ",");
/* strtok silently skips empty fields, so N tokens from N commas means
   some field was empty ("attr1,,attr2") — reject that.  */
9965 if (num_attrs != num_commas + 1)
9967 error ("malformed target %s list %qs",
9968 pragma_or_attr, TREE_STRING_POINTER (args));
9969 return false;
9972 return true;
9975 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9976 process attribute ((target ("..."))). */
9978 static bool
9979 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9981 struct cl_target_option cur_target;
9982 bool ret;
9983 tree old_optimize;
9984 tree new_target, new_optimize;
9985 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9987 /* If what we're processing is the current pragma string then the
9988 target option node is already stored in target_option_current_node
9989 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9990 having to re-parse the string. This is especially useful to keep
9991 arm_neon.h compile times down since that header contains a lot
9992 of intrinsics enclosed in pragmas. */
9993 if (!existing_target && args == current_target_pragma)
9995 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9996 return true;
9998 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10000 old_optimize = build_optimization_node (&global_options);
10001 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10003 /* If the function changed the optimization levels as well as setting
10004 target options, start with the optimizations specified. */
10005 if (func_optimize && func_optimize != old_optimize)
10006 cl_optimization_restore (&global_options,
10007 TREE_OPTIMIZATION (func_optimize));
10009 /* Save the current target options to restore at the end. */
10010 cl_target_option_save (&cur_target, &global_options);
10012 /* If fndecl already has some target attributes applied to it, unpack
10013 them so that we add this attribute on top of them, rather than
10014 overwriting them. */
10015 if (existing_target)
10017 struct cl_target_option *existing_options
10018 = TREE_TARGET_OPTION (existing_target);
10020 if (existing_options)
10021 cl_target_option_restore (&global_options, existing_options);
10023 else
10024 cl_target_option_restore (&global_options,
10025 TREE_TARGET_OPTION (target_option_current_node));
/* Parse the attribute string(s) into global_options.  */
10028 ret = aarch64_process_target_attr (args, "attribute");
10030 /* Set up any additional state. */
10031 if (ret)
10033 aarch64_override_options_internal (&global_options);
10034 /* Initialize SIMD builtins if we haven't already.
10035 Set current_target_pragma to NULL for the duration so that
10036 the builtin initialization code doesn't try to tag the functions
10037 being built with the attributes specified by any current pragma, thus
10038 going into an infinite recursion. */
10039 if (TARGET_SIMD)
10041 tree saved_current_target_pragma = current_target_pragma;
10042 current_target_pragma = NULL;
10043 aarch64_init_simd_builtins ();
10044 current_target_pragma = saved_current_target_pragma;
10046 new_target = build_target_option_node (&global_options);
10048 else
10049 new_target = NULL;
10051 new_optimize = build_optimization_node (&global_options);
10053 if (fndecl && ret)
10055 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10057 if (old_optimize != new_optimize)
10058 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
/* Restore the state we entered with so this query has no lasting
   effect on global_options.  */
10061 cl_target_option_restore (&global_options, &cur_target);
10063 if (old_optimize != new_optimize)
10064 cl_optimization_restore (&global_options,
10065 TREE_OPTIMIZATION (old_optimize));
10066 return ret;
/* Helper for aarch64_can_inline_p.  CALLER and CALLEE are tri-bool
   option values (yes, no, don't-care) with default value DEF.  Return
   true if the combination should not block inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* Inlining is always fine when either side doesn't care.  */
  if (callee == dont_care || caller == dont_care)
    return true;

  /* Otherwise the callee must either agree with the caller or be
     using the default value.  */
  return callee == caller || callee == def;
}
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  /* If callee has no option attributes, then it is ok to inline.  */
  if (!callee_tree)
    return true;

  /* A caller without its own target attribute is checked against the
     global default options.  */
  struct cl_target_option *caller_opts
	= TREE_TARGET_OPTION (caller_tree ? caller_tree
					   : target_option_default_node);

  struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);


  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
       != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					  DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  /* From here on, mismatches on softer attributes block inlining.  */
  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  The value 2 is the
     "don't care" marker for these tri-state options.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		   caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
10182 /* Return true if SYMBOL_REF X binds locally. */
10184 static bool
10185 aarch64_symbol_binds_local_p (const_rtx x)
10187 return (SYMBOL_REF_DECL (x)
10188 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10189 : SYMBOL_REF_LOCAL_P (x));
10192 /* Return true if SYMBOL_REF X is thread local */
10193 static bool
10194 aarch64_tls_symbol_p (rtx x)
10196 if (! TARGET_HAVE_TLS)
10197 return false;
10199 if (GET_CODE (x) != SYMBOL_REF)
10200 return false;
10202 return SYMBOL_REF_TLS_MODEL (x) != 0;
10205 /* Classify a TLS symbol into one of the TLS kinds. */
10206 enum aarch64_symbol_type
10207 aarch64_classify_tls_symbol (rtx x)
10209 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10211 switch (tls_kind)
10213 case TLS_MODEL_GLOBAL_DYNAMIC:
10214 case TLS_MODEL_LOCAL_DYNAMIC:
10215 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10217 case TLS_MODEL_INITIAL_EXEC:
10218 switch (aarch64_cmodel)
10220 case AARCH64_CMODEL_TINY:
10221 case AARCH64_CMODEL_TINY_PIC:
10222 return SYMBOL_TINY_TLSIE;
10223 default:
10224 return SYMBOL_SMALL_TLSIE;
10227 case TLS_MODEL_LOCAL_EXEC:
10228 if (aarch64_tls_size == 12)
10229 return SYMBOL_TLSLE12;
10230 else if (aarch64_tls_size == 24)
10231 return SYMBOL_TLSLE24;
10232 else if (aarch64_tls_size == 32)
10233 return SYMBOL_TLSLE32;
10234 else if (aarch64_tls_size == 48)
10235 return SYMBOL_TLSLE48;
10236 else
10237 gcc_unreachable ();
10239 case TLS_MODEL_EMULATED:
10240 case TLS_MODEL_NONE:
10241 return SYMBOL_FORCE_TO_MEM;
10243 default:
10244 gcc_unreachable ();
/* Return the method that should be used to access SYMBOL_REF or
   LABEL_REF X, given that the full address being computed is
   X + OFFSET.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, rtx offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M and anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here is
	     4G.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  /* Non-local symbols go through the (tiny) GOT.  */
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
10339 bool
10340 aarch64_constant_address_p (rtx x)
10342 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10345 bool
10346 aarch64_legitimate_pic_operand_p (rtx x)
10348 if (GET_CODE (x) == SYMBOL_REF
10349 || (GET_CODE (x) == CONST
10350 && GET_CODE (XEXP (x, 0)) == PLUS
10351 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10352 return false;
10354 return true;
10357 /* Return true if X holds either a quarter-precision or
10358 floating-point +0.0 constant. */
10359 static bool
10360 aarch64_valid_floating_const (rtx x)
10362 if (!CONST_DOUBLE_P (x))
10363 return false;
10365 /* This call determines which constants can be used in mov<mode>
10366 as integer moves instead of constant loads. */
10367 if (aarch64_float_const_rtx_p (x))
10368 return true;
10370 return aarch64_float_const_representable_p (x);
/* Implement TARGET_LEGITIMATE_CONSTANT_P.  Return true if X is a
   constant of mode MODE that can appear as an instruction operand.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Do not allow vector struct mode constants.  We could support
     0 and -1 easily, but they need support in aarch64-simd.md.  */
  if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
    return false;

  /* For these cases we never want to use a literal load.
     As such we have to prevent the compiler from forcing these
     to memory.  */
  if ((GET_CODE (x) == CONST_VECTOR
       && aarch64_simd_valid_immediate (x, mode, false, NULL))
      || CONST_INT_P (x)
      || aarch64_valid_floating_const (x)
      || aarch64_can_const_movi_rtx_p (x, mode)
      || aarch64_float_const_rtx_p (x))
	return !targetm.cannot_force_const_mem (mode, x);

  /* (high (symbol)) is produced by the movsym patterns; accept it when
     the inner symbol reference is valid.  */
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  return aarch64_constant_address_p (x);
}
10405 aarch64_load_tp (rtx target)
10407 if (!target
10408 || GET_MODE (target) != Pmode
10409 || !register_operand (target, Pmode))
10410 target = gen_reg_rtx (Pmode);
10412 /* Can return in any reg. */
10413 emit_insn (gen_aarch64_load_tp_hard (target));
10414 return target;
10417 /* On AAPCS systems, this is the "struct __va_list". */
10418 static GTY(()) tree va_list_type;
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int __gr_offs;
     int __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purpose to identify whether the code is updating va_list internal
     offset fields through irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  /* Chain the fields in declaration order, matching the AAPCS64 layout
     shown above.  */
  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
/* Implement TARGET_EXPAND_BUILTIN_VA_START.
   Emit RTL initializing all five fields of the AAPCS64 __va_list in
   VALIST.  NEXTARG is unused on this target.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  /* Clamp the save-area sizes to the registers actually left over after
     the named arguments, honouring tree-stdarg's liveness analysis.  */
  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  /* Walk the va_list fields in the order they were chained in
     aarch64_build_builtin_va_list.  */
  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);

  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
   Build the GIMPLE for reading a value of TYPE from the va_list VALIST:
   take it from the GR or VR register save area while the corresponding
   offset field is negative, otherwise from the stack overflow area.  */

static tree
aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
			      gimple_seq *post_p ATTRIBUTE_UNUSED)
{
  tree addr;
  bool indirect_p;
  bool is_ha;		/* is HFA or HVA.  */
  bool dw_align;	/* double-word align.  */
  machine_mode ag_mode = VOIDmode;
  int nregs;
  machine_mode mode;

  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, f_top, f_off, off, arg, roundup, on_stack;
  HOST_WIDE_INT size, rsize, adjust, align;
  tree t, u, cond1, cond2;

  /* Arguments passed by reference are fetched as a pointer and
     dereferenced at the end.  */
  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
  if (indirect_p)
    type = build_pointer_type (type);

  mode = TYPE_MODE (type);

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
		  f_stack, NULL_TREE);
  size = int_size_in_bytes (type);
  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;

  dw_align = false;
  adjust = 0;
  if (aarch64_vfp_is_call_or_return_candidate (mode,
					       type,
					       &ag_mode,
					       &nregs,
					       &is_ha))
    {
      /* TYPE passed in fp/simd registers.  */
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode, "varargs");

      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
		      unshare_expr (valist), f_vrtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
		      unshare_expr (valist), f_vroff, NULL_TREE);

      rsize = nregs * UNITS_PER_VREG;

      /* On big-endian, values smaller than a V-reg sit at the high end
	 of their save slot; ADJUST compensates when loading.  */
      if (is_ha)
	{
	  if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
	    adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
	}
      else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	       && size < UNITS_PER_VREG)
	{
	  adjust = UNITS_PER_VREG - size;
	}
    }
  else
    {
      /* TYPE passed in general registers.  */
      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
		      unshare_expr (valist), f_grtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
		      unshare_expr (valist), f_groff, NULL_TREE);
      rsize = ROUND_UP (size, UNITS_PER_WORD);
      nregs = rsize / UNITS_PER_WORD;

      if (align > 8)
	dw_align = true;

      if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	  && size < UNITS_PER_WORD)
	{
	  adjust = UNITS_PER_WORD - size;
	}
    }

  /* Get a local temporary for the field value.  */
  off = get_initialized_tmp_var (f_off, pre_p, NULL);

  /* Emit code to branch if off >= 0.  */
  t = build2 (GE_EXPR, boolean_type_node, off,
	      build_int_cst (TREE_TYPE (off), 0));
  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);

  if (dw_align)
    {
      /* Emit: offs = (offs + 15) & -16.  */
      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
		  build_int_cst (TREE_TYPE (off), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
		  build_int_cst (TREE_TYPE (off), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
    }
  else
    roundup = NULL;

  /* Update ap.__[g|v]r_offs  */
  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
	      build_int_cst (TREE_TYPE (off), rsize));
  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);

  /* String up.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);

  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
	      build_int_cst (TREE_TYPE (f_off), 0));
  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);

  /* String up: make sure the assignment happens before the use.  */
  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
  COND_EXPR_ELSE (cond1) = t;

  /* Prepare the trees handling the argument that is passed on the stack;
     the top level node will store in ON_STACK.  */
  arg = get_initialized_tmp_var (stack, pre_p, NULL);
  if (align > 8)
    {
      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
      t = fold_convert (intDI_type_node, arg);
      t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), -16));
      t = fold_convert (TREE_TYPE (arg), t);
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
    }
  else
    roundup = NULL;
  /* Advance ap.__stack  */
  t = fold_convert (intDI_type_node, arg);
  t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), size + 7));
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), -8));
  t = fold_convert (TREE_TYPE (arg), t);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
  /* String up roundup and advance.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg */
  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
  /* Big-endianness related address adjustment.  */
  if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
      && size < UNITS_PER_WORD)
    {
      t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
		  size_int (UNITS_PER_WORD - size));
      on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
    }

  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);

  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
  t = off;
  if (adjust)
    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
		build_int_cst (TREE_TYPE (off), adjust));

  t = fold_convert (sizetype, t);
  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);

  if (is_ha)
    {
      /* type ha; // treat as "struct {ftype field[n];}"
         ... [computing offs]
         for (i = 0; i <nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
      int i;
      tree tmp_ha, field_t, field_ptr_t;

      /* Declare a local variable.  */
      tmp_ha = create_tmp_var_raw (type, "ha");
      gimple_add_tmp_var (tmp_ha);

      /* Establish the base type.  */
      switch (ag_mode)
	{
	case E_SFmode:
	  field_t = float_type_node;
	  field_ptr_t = float_ptr_type_node;
	  break;
	case E_DFmode:
	  field_t = double_type_node;
	  field_ptr_t = double_ptr_type_node;
	  break;
	case E_TFmode:
	  field_t = long_double_type_node;
	  field_ptr_t = long_double_ptr_type_node;
	  break;
	case E_HFmode:
	  field_t = aarch64_fp16_type_node;
	  field_ptr_t = aarch64_fp16_ptr_type_node;
	  break;
	case E_V2SImode:
	case E_V4SImode:
	  {
	    tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
	    field_t = build_vector_type_for_mode (innertype, ag_mode);
	    field_ptr_t = build_pointer_type (field_t);
	  }
	  break;
	default:
	  gcc_assert (0);
	}

      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
      addr = t;
      t = fold_convert (field_ptr_t, addr);
      t = build2 (MODIFY_EXPR, field_t,
		  build1 (INDIRECT_REF, field_t, tmp_ha),
		  build1 (INDIRECT_REF, field_t, t));

      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
      for (i = 1; i < nregs; ++i)
	{
	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
	  u = fold_convert (field_ptr_t, addr);
	  u = build2 (MODIFY_EXPR, field_t,
		      build2 (MEM_REF, field_t, tmp_ha,
			      build_int_cst (field_ptr_t,
					     (i *
					      int_size_in_bytes (field_t)))),
		      build1 (INDIRECT_REF, field_t, u));
	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
	}

      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
    }

  COND_EXPR_ELSE (cond2) = t;
  addr = fold_convert (build_pointer_type (type), cond1);
  addr = build_va_arg_indirect_ref (addr);

  /* Arguments passed by reference need one more dereference.  */
  if (indirect_p)
    addr = build_va_arg_indirect_ref (addr);

  return addr;
}
/* Implement TARGET_SETUP_INCOMING_VARARGS.
   Spill the anonymous GP and FP/SIMD argument registers into the
   register save areas below virtual_incoming_args_rtx (unless NO_RTL),
   and record the total save-area size in the frame layout.  */

static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
				tree type, int *pretend_size ATTRIBUTE_UNUSED,
				int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);

  /* Found out how many registers we need to save.
     Honor tree-stdarg analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
		    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
		    cfun->va_list_fpr_size / UNITS_PER_VREG);

  if (!TARGET_FLOAT)
    {
      gcc_assert (local_cum.aapcs_nvrn == 0);
      vr_saved = 0;
    }

  if (!no_rtl)
    {
      if (gr_saved > 0)
	{
	  rtx ptr, mem;

	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
			       - gr_saved * UNITS_PER_WORD);
	  mem = gen_frame_mem (BLKmode, ptr);
	  set_mem_alias_set (mem, get_varargs_alias_set ());

	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
			       mem, gr_saved);
	}
      if (vr_saved > 0)
	{
	  /* We can't use move_block_from_reg, because it will use
	     the wrong mode, storing D regs only.  */
	  machine_mode mode = TImode;
	  int off, i, vr_start;

	  /* Set OFF to the offset from virtual_incoming_args_rtx of
	     the first vector register.  The VR save area lies below
	     the GR one, and is aligned to 16 bytes.  */
	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
			   STACK_BOUNDARY / BITS_PER_UNIT);
	  off -= vr_saved * UNITS_PER_VREG;

	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
	  for (i = 0; i < vr_saved; ++i)
	    {
	      rtx ptr, mem;

	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
	      mem = gen_frame_mem (mode, ptr);
	      set_mem_alias_set (mem, get_varargs_alias_set ());
	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
	      off += UNITS_PER_VREG;
	    }
	}
    }

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
		 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
}
10921 static void
10922 aarch64_conditional_register_usage (void)
10924 int i;
10925 if (!TARGET_FLOAT)
10927 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10929 fixed_regs[i] = 1;
10930 call_used_regs[i] = 1;
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  Used to recognize
   homogeneous floating-point/vector aggregates (HFA/HVA, AAPCS64).  */
static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
  machine_mode mode;
  HOST_WIDE_INT size;

  switch (TREE_CODE (type))
    {
    case REAL_TYPE:
      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 1;

      break;

    case COMPLEX_TYPE:
      /* A complex value counts as two consecutive elements of the
	 component's floating-point mode.  */
      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 2;

      break;

    case VECTOR_TYPE:
      /* Use V2SImode and V4SImode as representatives of all 64-bit
	 and 128-bit vector types.  */
      size = int_size_in_bytes (type);
      switch (size)
	{
	case 8:
	  mode = V2SImode;
	  break;
	case 16:
	  mode = V4SImode;
	  break;
	default:
	  return -1;
	}

      if (*modep == VOIDmode)
	*modep = mode;

      /* Vector modes are considered to be opaque: two vectors are
	 equivalent for the purposes of being homogeneous aggregates
	 if they are the same size.  */
      if (*modep == mode)
	return 1;

      break;

    case ARRAY_TYPE:
      {
	int count;
	tree index = TYPE_DOMAIN (type);

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
	if (count == -1
	    || !index
	    || !TYPE_MAX_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
	    || !TYPE_MIN_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
	    || count < 0)
	  return -1;

	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));

	/* There must be no padding.  */
	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case RECORD_TYPE:
      {
	/* A struct is homogeneous iff every field is, with the total
	   element count being the sum over the fields.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count += sub_count;
	  }

	/* There must be no padding.  */
	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case UNION_TYPE:
    case QUAL_UNION_TYPE:
      {
	/* These aren't very interesting except in a degenerate case.
	   The element count of a union is the maximum over its
	   members, since they overlap.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count = count > sub_count ? count : sub_count;
	  }

	/* There must be no padding.  */
	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    default:
      break;
    }

  return -1;
}
11103 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11104 type as described in AAPCS64 \S 4.1.2.
11106 See the comment above aarch64_composite_type_p for the notes on MODE. */
11108 static bool
11109 aarch64_short_vector_p (const_tree type,
11110 machine_mode mode)
11112 HOST_WIDE_INT size = -1;
11114 if (type && TREE_CODE (type) == VECTOR_TYPE)
11115 size = int_size_in_bytes (type);
11116 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11117 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11118 size = GET_MODE_SIZE (mode);
11120 return (size == 8 || size == 16);
11123 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11124 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11125 array types. The C99 floating-point complex types are also considered
11126 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11127 types, which are GCC extensions and out of the scope of AAPCS64, are
11128 treated as composite types here as well.
11130 Note that MODE itself is not sufficient in determining whether a type
11131 is such a composite type or not. This is because
11132 stor-layout.c:compute_record_mode may have already changed the MODE
11133 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11134 structure with only one field may have its MODE set to the mode of the
11135 field. Also an integer mode whose size matches the size of the
11136 RECORD_TYPE type may be used to substitute the original mode
11137 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11138 solely relied on. */
11140 static bool
11141 aarch64_composite_type_p (const_tree type,
11142 machine_mode mode)
11144 if (aarch64_short_vector_p (type, mode))
11145 return false;
11147 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11148 return true;
11150 if (mode == BLKmode
11151 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11152 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11153 return true;
11155 return false;
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
					 const_tree type,
					 machine_mode *base_mode,
					 int *count,
					 bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  /* A non-composite scalar float, or a short vector, occupies exactly one
     register of its own mode.  */
  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  /* A complex float is treated as a homogeneous aggregate of two elements
     of the component (inner) mode.  */
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  /* Otherwise, recursively check whether a composite type is a homogeneous
     aggregate of at most HA_MAX_NUM_FLDS elements of one FP/vector mode.  */
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
	{
	  if (is_ha != NULL) *is_ha = true;
	  *count = ag_count;
	}
      else
	return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
/* Implement TARGET_STRUCT_VALUE_RTX.  Return the register used to pass the
   address of a returned-in-memory aggregate (the "struct value" register,
   x8 on AArch64 via AARCH64_STRUCT_VALUE_REGNUM).  FNDECL and INCOMING are
   unused: the same register is used in both directions.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}
11219 /* Implements target hook vector_mode_supported_p. */
11220 static bool
11221 aarch64_vector_mode_supported_p (machine_mode mode)
11223 if (TARGET_SIMD
11224 && (mode == V4SImode || mode == V8HImode
11225 || mode == V16QImode || mode == V2DImode
11226 || mode == V2SImode || mode == V4HImode
11227 || mode == V8QImode || mode == V2SFmode
11228 || mode == V4SFmode || mode == V2DFmode
11229 || mode == V4HFmode || mode == V8HFmode
11230 || mode == V1DFmode))
11231 return true;
11233 return false;
/* Return appropriate SIMD container
   for MODE within a vector of WIDTH bits.  WIDTH must be 64 or 128.
   Falls back to word_mode when SIMD is disabled or when MODE has no
   vector container of the requested width (e.g. DFmode/DImode at 64).  */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, unsigned width)
{
  gcc_assert (width == 64 || width == 128);
  if (TARGET_SIMD)
    {
      /* 128-bit (quad) containers.  */
      if (width == 128)
	switch (mode)
	  {
	  case E_DFmode:
	    return V2DFmode;
	  case E_SFmode:
	    return V4SFmode;
	  case E_HFmode:
	    return V8HFmode;
	  case E_SImode:
	    return V4SImode;
	  case E_HImode:
	    return V8HImode;
	  case E_QImode:
	    return V16QImode;
	  case E_DImode:
	    return V2DImode;
	  default:
	    break;
	  }
      /* 64-bit (double) containers.  */
      else
	switch (mode)
	  {
	  case E_SFmode:
	    return V2SFmode;
	  case E_HFmode:
	    return V4HFmode;
	  case E_SImode:
	    return V2SImode;
	  case E_HImode:
	    return V4HImode;
	  case E_QImode:
	    return V8QImode;
	  default:
	    break;
	  }
    }
  return word_mode;
}
/* Return 128-bit container as the preferred SIMD mode for MODE.
   Implements TARGET_VECTORIZE_PREFERRED_SIMD_MODE: the vectorizer is
   steered towards quad-register (128-bit) vectors first.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  return aarch64_simd_container_mode (mode, 128);
}
/* Return the bitmask of possible vector sizes for the vectorizer
   to iterate over: 16 bytes (128-bit Q registers) and 8 bytes
   (64-bit D registers).  */
static unsigned int
aarch64_autovectorize_vector_sizes (void)
{
  return (16 | 8);
}
/* Implement TARGET_MANGLE_TYPE.  Return the AArch64-specific C++ mangled
   name for TYPE, or NULL to use the default mangling.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float (__fp16/_Float16) mangles as "Dh".  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
11322 /* Find the first rtx_insn before insn that will generate an assembly
11323 instruction. */
11325 static rtx_insn *
11326 aarch64_prev_real_insn (rtx_insn *insn)
11328 if (!insn)
11329 return NULL;
11333 insn = prev_real_insn (insn);
11335 while (insn && recog_memoized (insn) < 0);
11337 return insn;
11340 static bool
11341 is_madd_op (enum attr_type t1)
11343 unsigned int i;
11344 /* A number of these may be AArch32 only. */
11345 enum attr_type mlatypes[] = {
11346 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11347 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11348 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11351 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11353 if (t1 == mlatypes[i])
11354 return true;
11357 return false;
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data (i.e. the insn most recently extracted
   into the global recog_data).  MEMOP is the SET of the load.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  /* Only a plain register destination can feed the current insn.  */
  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  /* Operand 0 is the destination of the current insn, so start the scan
     at operand 1: we only care about its inputs.  */
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn* insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  /* recog must succeed before get_attr_type can be queried.  */
  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}
11432 /* Implement FINAL_PRESCAN_INSN. */
11434 void
11435 aarch64_final_prescan_insn (rtx_insn *insn)
11437 if (aarch64_madd_needs_nop (insn))
11438 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11442 /* Return the equivalent letter for size. */
11443 static char
11444 sizetochar (int size)
11446 switch (size)
11448 case 64: return 'd';
11449 case 32: return 's';
11450 case 16: return 'h';
11451 case 8 : return 'b';
11452 default: gcc_unreachable ();
11456 /* Return true iff x is a uniform vector of floating-point
11457 constants, and the constant can be represented in
11458 quarter-precision form. Note, as aarch64_float_const_representable
11459 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11460 static bool
11461 aarch64_vect_float_const_representable_p (rtx x)
11463 rtx elt;
11464 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11465 && const_vec_duplicate_p (x, &elt)
11466 && aarch64_float_const_representable_p (elt));
/* Return true if OP is a CONST_VECTOR in MODE that can be materialised as an
   Advanced SIMD immediate (MOVI/MVNI/FMOV), false otherwise.  When INVERSE
   is set the bytes are complemented before matching.  On success, when INFO
   is non-NULL, fill it in with the element width, shift, and the immediate
   value the assembler needs.  */
bool
aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
			      struct simd_immediate_info *info)
{
/* Try one encoding: pattern CLASS with element size ELSIZE, checking TEST
   for every byte position stepped by STRIDE.  On a match, record the
   encoding parameters and break out of the enclosing do-while.  */
#define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)	\
  matches = 1;						\
  for (i = 0; i < idx; i += (STRIDE))			\
    if (!(TEST))					\
      matches = 0;					\
  if (matches)						\
    {							\
      immtype = (CLASS);				\
      elsize = (ELSIZE);				\
      eshift = (SHIFT);					\
      emvn = (NEG);					\
      break;						\
    }

  unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
  unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
  unsigned char bytes[16];
  int immtype = -1, matches;
  unsigned int invmask = inverse ? 0xff : 0;
  int eshift, emvn;

  /* Float vectors are handled separately: only all-zero vectors and
     uniform FMOV-representable constants are valid.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    {
      if (! (aarch64_simd_imm_zero_p (op, mode)
	     || aarch64_vect_float_const_representable_p (op)))
	return false;

      if (info)
	{
	  rtx elt = CONST_VECTOR_ELT (op, 0);
	  scalar_float_mode elt_mode
	    = as_a <scalar_float_mode> (GET_MODE (elt));

	  info->value = elt;
	  info->element_width = GET_MODE_BITSIZE (elt_mode);
	  info->mvn = false;
	  info->shift = 0;
	}

      return true;
    }

  /* Splat vector constant out into a byte vector.  */
  for (i = 0; i < n_elts; i++)
    {
      /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
	 it must be laid out in the vector register in reverse order.  */
      rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
      unsigned HOST_WIDE_INT elpart;

      gcc_assert (CONST_INT_P (el));
      elpart = INTVAL (el);

      for (unsigned int byte = 0; byte < innersize; byte++)
	{
	  bytes[idx++] = (elpart & 0xff) ^ invmask;
	  elpart >>= BITS_PER_UNIT;
	}
    }

  /* Sanity check.  */
  gcc_assert (idx == GET_MODE_SIZE (mode));

  /* Try each encoding in turn; the first matching CHECK breaks out.
     Classes 0-3: 32-bit MOVI, byte at shift 0/8/16/24.
     Classes 4-5: 16-bit MOVI.  Classes 6-11: the MVNI (inverted)
     counterparts.  Classes 12-15: MSL ("shift ones") forms.
     Class 16: 8-bit replicated MOVI.  Class 17: 64-bit byte-mask.  */
  do
    {
      CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
	     && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);

      CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
	     && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);

      CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);

      CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
	     && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);

      CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);

      CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);

      CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
	     && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);

      CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
	     && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);

      CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);

      CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
	     && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);

      CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);

      CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);

      CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
	     && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);

      CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
	     && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);

      CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);

      CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
	     && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);

      CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);

      CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
	     && bytes[i] == bytes[(i + 8) % idx], 0, 0);
    }
  while (0);

  if (immtype == -1)
    return false;

  if (info)
    {
      info->element_width = elsize;
      info->mvn = emvn != 0;
      info->shift = eshift;

      unsigned HOST_WIDE_INT imm = 0;

      /* Classes 12-15 use the MSL (shift-ones) modifier.  */
      if (immtype >= 12 && immtype <= 15)
	info->msl = true;

      /* Un-invert bytes of recognized vector, if necessary.  */
      if (invmask != 0)
	for (i = 0; i < idx; i++)
	  bytes[i] ^= invmask;

      if (immtype == 17)
	{
	  /* FIXME: Broken on 32-bit H_W_I hosts.  */
	  gcc_assert (sizeof (HOST_WIDE_INT) == 8);

	  /* Rebuild the 64-bit byte-mask immediate from the byte pattern.  */
	  for (i = 0; i < 8; i++)
	    imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
	      << (i * BITS_PER_UNIT);

	  info->value = GEN_INT (imm);
	}
      else
	{
	  for (i = 0; i < elsize / BITS_PER_UNIT; i++)
	    imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);

	  /* Construct 'abcdefgh' because the assembler cannot handle
	     generic constants.  */
	  if (info->mvn)
	    imm = ~imm;
	  imm = (imm >> info->shift) & 0xff;
	  info->value = GEN_INT (imm);
	}
    }

  return true;
#undef CHECK
}
11640 /* Check of immediate shift constants are within range. */
11641 bool
11642 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11644 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11645 if (left)
11646 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11647 else
11648 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
/* Return true if X is a uniform vector where all elements
   are either the floating-point constant 0.0 or the
   integer constant 0 (i.e. X is the canonical zero vector of MODE).  */
bool
aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
{
  return x == CONST0_RTX (mode);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  Both WIDTH and POS must be
   CONST_INTs.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  /* WIDTH consecutive one bits, shifted up to start at bit POS.  */
  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
/* Return true if X is a legitimate operand for a move in MODE: a HIGH of a
   valid symbol reference, any CONST_INT, a constant-address SYMBOL_REF in
   DImode, or a tiny-model absolute symbolic expression.  */
bool
aarch64_mov_operand_p (rtx x, machine_mode mode)
{
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  if (CONST_INT_P (x))
    return true;

  if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
    return true;

  return aarch64_classify_symbolic_expression (x)
    == SYMBOL_TINY_ABSOLUTE;
}
11692 /* Return a const_int vector of VAL. */
11694 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11696 int nunits = GET_MODE_NUNITS (mode);
11697 rtvec v = rtvec_alloc (nunits);
11698 int i;
11700 rtx cache = GEN_INT (val);
11702 for (i=0; i < nunits; i++)
11703 RTVEC_ELT (v, i) = cache;
11705 return gen_rtx_CONST_VECTOR (mode, v);
11708 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11710 bool
11711 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11713 machine_mode vmode;
11715 vmode = aarch64_preferred_simd_mode (mode);
11716 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11717 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11720 /* Construct and return a PARALLEL RTX vector with elements numbering the
11721 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11722 the vector - from the perspective of the architecture. This does not
11723 line up with GCC's perspective on lane numbers, so we end up with
11724 different masks depending on our target endian-ness. The diagram
11725 below may help. We must draw the distinction when building masks
11726 which select one half of the vector. An instruction selecting
11727 architectural low-lanes for a big-endian target, must be described using
11728 a mask selecting GCC high-lanes.
11730 Big-Endian Little-Endian
11732 GCC 0 1 2 3 3 2 1 0
11733 | x | x | x | x | | x | x | x | x |
11734 Architecture 3 2 1 0 3 2 1 0
11736 Low Mask: { 2, 3 } { 0, 1 }
11737 High Mask: { 0, 1 } { 2, 3 }
11741 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11743 int nunits = GET_MODE_NUNITS (mode);
11744 rtvec v = rtvec_alloc (nunits / 2);
11745 int high_base = nunits / 2;
11746 int low_base = 0;
11747 int base;
11748 rtx t1;
11749 int i;
11751 if (BYTES_BIG_ENDIAN)
11752 base = high ? low_base : high_base;
11753 else
11754 base = high ? high_base : low_base;
11756 for (i = 0; i < nunits / 2; i++)
11757 RTVEC_ELT (v, i) = GEN_INT (base + i);
11759 t1 = gen_rtx_PARALLEL (mode, v);
11760 return t1;
11763 /* Check OP for validity as a PARALLEL RTX vector with elements
11764 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11765 from the perspective of the architecture. See the diagram above
11766 aarch64_simd_vect_par_cnst_half for more details. */
11768 bool
11769 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11770 bool high)
11772 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11773 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11774 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11775 int i = 0;
11777 if (!VECTOR_MODE_P (mode))
11778 return false;
11780 if (count_op != count_ideal)
11781 return false;
11783 for (i = 0; i < count_ideal; i++)
11785 rtx elt_op = XVECEXP (op, 0, i);
11786 rtx elt_ideal = XVECEXP (ideal, 0, i);
11788 if (!CONST_INT_P (elt_op)
11789 || INTVAL (elt_ideal) != INTVAL (elt_op))
11790 return false;
11792 return true;
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  Emit an error (tied to EXP's location when EXP is
   non-NULL) if it does not.  */
void
aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
			  const_tree exp)
{
  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)
  {
    if (exp)
      error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
    else
      error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
  }
}
11814 /* Return TRUE if OP is a valid vector addressing mode. */
11815 bool
11816 aarch64_simd_mem_operand_p (rtx op)
11818 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11819 || REG_P (XEXP (op, 0)));
/* Emit a register copy from operand to operand, taking care not to
   early-clobber source registers in the process.

   COUNT is the number of components into which the copy needs to be
   decomposed.  */
void
aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
				unsigned int count)
{
  unsigned int i;
  int rdest = REGNO (operands[0]);
  int rsrc = REGNO (operands[1]);

  /* If the ranges do not overlap, or the destination starts below the
     source, copying low-to-high never clobbers a not-yet-read source
     register; otherwise copy high-to-low.  */
  if (!reg_overlap_mentioned_p (operands[0], operands[1])
      || rdest < rsrc)
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + i),
		      gen_rtx_REG (mode, rsrc + i));
  else
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
		      gen_rtx_REG (mode, rsrc + count - i - 1));
}
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  The length in bytes is four
   (one instruction) per vector register in the list.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
}
11854 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11855 alignment of a vector to 128 bits. */
11856 static HOST_WIDE_INT
11857 aarch64_simd_vector_alignment (const_tree type)
11859 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11860 return MIN (align, 128);
11863 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11864 static bool
11865 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11867 if (is_packed)
11868 return false;
11870 /* We guarantee alignment for vectors up to 128-bits. */
11871 if (tree_int_cst_compare (TYPE_SIZE (type),
11872 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11873 return false;
11875 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11876 return true;
/* Return true if the vector misalignment factor is supported by the
   target.  MODE/TYPE describe the access, MISALIGNMENT is the known
   misalignment (-1 if unknown at compile time), IS_PACKED whether the
   data is packed.  */
static bool
aarch64_builtin_support_vector_misalignment (machine_mode mode,
					     const_tree type, int misalignment,
					     bool is_packed)
{
  /* With strict alignment, misaligned vector accesses need the
     movmisalign expander.  */
  if (TARGET_SIMD && STRICT_ALIGNMENT)
    {
      /* Return if movmisalign pattern is not supported for this mode.  */
      if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
	return false;

      if (misalignment == -1)
	{
	  /* Misalignment factor is unknown at compile time but we know
	     it's word aligned.  */
	  if (aarch64_simd_vector_alignment_reachable (type, is_packed))
	    {
	      int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));

	      /* 64-bit elements cannot rely on word alignment alone.  */
	      if (element_size != 64)
		return true;
	    }
	  return false;
	}
    }
  return default_builtin_support_vector_misalignment (mode, type, misalignment,
						      is_packed);
}
11910 /* If VALS is a vector constant that can be loaded into a register
11911 using DUP, generate instructions to do so and return an RTX to
11912 assign to the register. Otherwise return NULL_RTX. */
11913 static rtx
11914 aarch64_simd_dup_constant (rtx vals)
11916 machine_mode mode = GET_MODE (vals);
11917 machine_mode inner_mode = GET_MODE_INNER (mode);
11918 rtx x;
11920 if (!const_vec_duplicate_p (vals, &x))
11921 return NULL_RTX;
11923 /* We can load this constant by using DUP and a constant in a
11924 single ARM register. This will be cheaper than a vector
11925 load. */
11926 x = copy_to_mode_reg (inner_mode, x);
11927 return gen_rtx_VEC_DUPLICATE (mode, x);
/* Generate code to load VALS, which is a PARALLEL containing only
   constants (for vec_init) or CONST_VECTOR, efficiently into a
   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that can not be converted into a CONST_VECTOR.  */
static rtx
aarch64_simd_make_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  rtx const_dup;
  rtx const_vec = NULL_RTX;
  int n_elts = GET_MODE_NUNITS (mode);
  int n_const = 0;
  int i;

  if (GET_CODE (vals) == CONST_VECTOR)
    const_vec = vals;
  else if (GET_CODE (vals) == PARALLEL)
    {
      /* A CONST_VECTOR must contain only CONST_INTs and
	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
	 Only store valid constants in a CONST_VECTOR.  */
      for (i = 0; i < n_elts; ++i)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	    n_const++;
	}
      if (n_const == n_elts)
	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
    }
  else
    gcc_unreachable ();

  /* Prefer, in order: an immediate (MOVI/MVNI), a DUP from a scalar
     register, a constant-pool load, then give up.  */
  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;
  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We can not take advantage of single-cycle
       LD1 because we need a PC-relative addressing mode.  */
    return const_vec;
  else
    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We can not construct an initializer.  */
    return NULL_RTX;
}
/* Expand a vector initialisation sequence, such that TARGET is
   initialised to contain VALS.  Strategies tried, in order: a single
   constant load, a DUP of one value, a DUP of the most common value
   plus lane inserts, or a constant load patched by lane inserts.  */

void
aarch64_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode inner_mode = GET_MODE_INNER (mode);
  /* The number of vector elements.  */
  int n_elts = GET_MODE_NUNITS (mode);
  /* The number of vector elements which are not constant.  */
  int n_var = 0;
  rtx any_const = NULL_RTX;
  /* The first element of vals.  */
  rtx v0 = XVECEXP (vals, 0, 0);
  bool all_same = true;

  /* Count the number of variable elements to initialise.  */
  for (int i = 0; i < n_elts; ++i)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
	++n_var;
      else
	any_const = x;

      all_same &= rtx_equal_p (x, v0);
    }

  /* No variable elements, hand off to aarch64_simd_make_constant which knows
     how best to handle this.  */
  if (n_var == 0)
    {
      rtx constant = aarch64_simd_make_constant (vals);
      if (constant != NULL_RTX)
	{
	  emit_move_insn (target, constant);
	  return;
	}
    }

  /* Splat a single non-constant element if we can.  */
  if (all_same)
    {
      rtx x = copy_to_mode_reg (inner_mode, v0);
      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
      return;
    }

  enum insn_code icode = optab_handler (vec_set_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);

  /* If there are only variable elements, try to optimize
     the insertion using dup for the most common element
     followed by insertions.  */

  /* The algorithm will fill matches[*][0] with the earliest matching element,
     and matches[X][1] with the count of duplicate elements (if X is the
     earliest element which has duplicates).  */

  if (n_var == n_elts && n_elts <= 16)
    {
      /* matches[i][0]: first element equal to element i;
	 matches[i][1]: number of later duplicates of element i.  */
      int matches[16][2] = {0};
      for (int i = 0; i < n_elts; i++)
	{
	  for (int j = 0; j <= i; j++)
	    {
	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
		{
		  matches[i][0] = j;
		  matches[j][1]++;
		  break;
		}
	    }
	}
      /* Pick the element with the most duplicates.  */
      int maxelement = 0;
      int maxv = 0;
      for (int i = 0; i < n_elts; i++)
	if (matches[i][1] > maxv)
	  {
	    maxelement = i;
	    maxv = matches[i][1];
	  }

      /* Create a duplicate of the most common element.  */
      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));

      /* Insert the rest.  */
      for (int i = 0; i < n_elts; i++)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (matches[i][0] == maxelement)
	    continue;
	  x = copy_to_mode_reg (inner_mode, x);
	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
	}
      return;
    }

  /* Initialise a vector which is part-variable.  We want to first try
     to build those lanes which are constant in the most efficient way we
     can.  */
  if (n_var != n_elts)
    {
      rtx copy = copy_rtx (vals);

      /* Load constant part of vector.  We really don't care what goes into the
	 parts we will overwrite, but we're more likely to be able to load the
	 constant efficiently if it has fewer, larger, repeating parts
	 (see aarch64_simd_valid_immediate).  */
      for (int i = 0; i < n_elts; i++)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	    continue;
	  /* Replace each variable lane with a nearby constant lane, to
	     maximise the chance of an immediate encoding.  */
	  rtx subst = any_const;
	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
	    {
	      /* Look in the copied vector, as more elements are const.  */
	      rtx test = XVECEXP (copy, 0, i ^ bit);
	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
		{
		  subst = test;
		  break;
		}
	    }
	  XVECEXP (copy, 0, i) = subst;
	}
      /* Recurse: COPY is now all-constant, so takes the n_var == 0 path.  */
      aarch64_expand_vector_init (target, copy);
    }

  /* Insert the variable lanes directly.  */
  for (int i = 0; i < n_elts; i++)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	continue;
      x = copy_to_mode_reg (inner_mode, x);
      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
    }
}
12124 static unsigned HOST_WIDE_INT
12125 aarch64_shift_truncation_mask (machine_mode mode)
12127 return
12128 (!SHIFT_COUNT_TRUNCATED
12129 || aarch64_vector_mode_supported_p (mode)
12130 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
/* Select a format to encode pointers in exception handling data.  CODE
   is unused; GLOBAL selects the indirect encoding for global symbols.  */
int
aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
{
  int type;
  switch (aarch64_cmodel)
    {
    case AARCH64_CMODEL_TINY:
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
	 for everything.  */
      type = DW_EH_PE_sdata4;
      break;
    default:
      /* No assumptions here.  8-byte relocs required.  */
      type = DW_EH_PE_sdata8;
      break;
    }
  return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
}
/* The last .arch and .tune assembly strings that we printed.  Used to
   avoid emitting redundant directives between functions.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;

/* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
   by the function fndecl.  */

void
aarch64_declare_function_name (FILE *stream, const char* name,
			       tree fndecl)
{
  tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* Use the function's own target options when it has them, otherwise
     the current global options.  */
  struct cl_target_option *targ_options;
  if (target_parts)
    targ_options = TREE_TARGET_OPTION (target_parts);
  else
    targ_options = TREE_TARGET_OPTION (target_option_current_node);
  gcc_assert (targ_options);

  const struct processor *this_arch
    = aarch64_get_arch (targ_options->x_explicit_arch);

  unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags,
						  this_arch->flags);
  /* Only update the assembler .arch string if it is distinct from the last
     such string we printed.  */
  std::string to_print = this_arch->name + extension;
  if (to_print != aarch64_last_printed_arch_string)
    {
      asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
      aarch64_last_printed_arch_string = to_print;
    }

  /* Print the cpu name we're tuning for in the comments, might be
     useful to readers of the generated asm.  Do it only when it changes
     from function to function and verbose assembly is requested.  */
  const struct processor *this_tune
    = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);

  if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
    {
      asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
		   this_tune->name);
      aarch64_last_printed_tune_string = this_tune->name;
    }

  /* Don't forget the type directive for ELF.  */
  ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
  ASM_OUTPUT_LABEL (stream, name);
}
12211 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12213 static void
12214 aarch64_start_file (void)
12216 struct cl_target_option *default_options
12217 = TREE_TARGET_OPTION (target_option_default_node);
12219 const struct processor *default_arch
12220 = aarch64_get_arch (default_options->x_explicit_arch);
12221 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12222 std::string extension
12223 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12224 default_arch->flags);
12226 aarch64_last_printed_arch_string = default_arch->name + extension;
12227 aarch64_last_printed_tune_string = "";
12228 asm_fprintf (asm_out_file, "\t.arch %s\n",
12229 aarch64_last_printed_arch_string.c_str ());
12231 default_file_start ();
12234 /* Emit load exclusive. */
12236 static void
12237 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12238 rtx mem, rtx model_rtx)
12240 rtx (*gen) (rtx, rtx, rtx);
12242 switch (mode)
12244 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12245 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12246 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12247 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12248 default:
12249 gcc_unreachable ();
12252 emit_insn (gen (rval, mem, model_rtx));
/* Emit store exclusive.  BVAL receives the success/failure status.
   NOTE(review): at the call sites in this file the memory operand is
   passed in the RVAL slot and the value to store in the MEM slot, so the
   generated insn receives (bval, mem, value, model), matching the
   store-exclusive pattern's operand order; the parameter names here look
   swapped but the behaviour is correct — confirm against atomics.md
   before renaming anything.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx rval, rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  /* Select the generator matching the access width.  */
  switch (mode)
    {
    case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
    case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
    case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
    case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (bval, rval, mem, model_rtx));
}
12276 /* Mark the previous jump instruction as unlikely. */
12278 static void
12279 aarch64_emit_unlikely_jump (rtx insn)
12281 rtx_insn *jump = emit_jump_insn (insn);
12282 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
/* Expand a compare and swap pattern.  OPERANDS are:
   0 = bool success output, 1 = old-value output, 2 = memory,
   3 = expected value, 4 = desired value, 5 = weak flag,
   6 = success memory model, 7 = failure memory model.  */
void
aarch64_expand_compare_and_swap (rtx operands[])
{
  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
  machine_mode mode, cmp_mode;
  typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
  int idx;
  gen_cas_fn gen;
  /* Generators for the LL/SC-loop expansion, indexed QI/HI/SI/DI.  */
  const gen_cas_fn split_cas[] =
  {
    gen_aarch64_compare_and_swapqi,
    gen_aarch64_compare_and_swaphi,
    gen_aarch64_compare_and_swapsi,
    gen_aarch64_compare_and_swapdi
  };
  /* Generators for the single-instruction LSE CAS, same indexing.  */
  const gen_cas_fn atomic_cas[] =
  {
    gen_aarch64_compare_and_swapqi_lse,
    gen_aarch64_compare_and_swaphi_lse,
    gen_aarch64_compare_and_swapsi_lse,
    gen_aarch64_compare_and_swapdi_lse
  };

  bval = operands[0];
  rval = operands[1];
  mem = operands[2];
  oldval = operands[3];
  newval = operands[4];
  is_weak = operands[5];
  mod_s = operands[6];
  mod_f = operands[7];
  mode = GET_MODE (mem);
  cmp_mode = mode;

  /* Normally the succ memory model must be stronger than fail, but in the
     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */

  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
    mod_s = GEN_INT (MEMMODEL_ACQ_REL);

  switch (mode)
    {
    case E_QImode:
    case E_HImode:
      /* For short modes, we're going to perform the comparison in SImode,
	 so do the zero-extension now.  */
      cmp_mode = SImode;
      rval = gen_reg_rtx (SImode);
      oldval = convert_modes (SImode, mode, oldval, true);
      /* Fall through.  */

    case E_SImode:
    case E_DImode:
      /* Force the value into a register if needed.  */
      if (!aarch64_plus_operand (oldval, mode))
	oldval = force_reg (cmp_mode, oldval);
      break;

    default:
      gcc_unreachable ();
    }

  /* Map the mode onto an index into the generator tables above.  */
  switch (mode)
    {
    case E_QImode: idx = 0; break;
    case E_HImode: idx = 1; break;
    case E_SImode: idx = 2; break;
    case E_DImode: idx = 3; break;
    default:
      gcc_unreachable ();
    }
  if (TARGET_LSE)
    gen = atomic_cas[idx];
  else
    gen = split_cas[idx];

  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));

  /* For short modes the comparison value lives in an SImode register;
     copy its low part back into the caller's output mode.  */
  if (mode == QImode || mode == HImode)
    emit_move_insn (operands[1], gen_lowpart (mode, rval));

  /* The boolean result is whether the comparison left the EQ flag set.  */
  x = gen_rtx_REG (CCmode, CC_REGNUM);
  x = gen_rtx_EQ (SImode, x, const0_rtx);
  emit_insn (gen_rtx_SET (bval, x));
}
12375 /* Test whether the target supports using a atomic load-operate instruction.
12376 CODE is the operation and AFTER is TRUE if the data in memory after the
12377 operation should be returned and FALSE if the data before the operation
12378 should be returned. Returns FALSE if the operation isn't supported by the
12379 architecture. */
12381 bool
12382 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12384 if (!TARGET_LSE)
12385 return false;
12387 switch (code)
12389 case SET:
12390 case AND:
12391 case IOR:
12392 case XOR:
12393 case MINUS:
12394 case PLUS:
12395 return true;
12396 default:
12397 return false;
12401 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12402 sequence implementing an atomic operation. */
12404 static void
12405 aarch64_emit_post_barrier (enum memmodel model)
12407 const enum memmodel base_model = memmodel_base (model);
12409 if (is_mm_sync (model)
12410 && (base_model == MEMMODEL_ACQUIRE
12411 || base_model == MEMMODEL_ACQ_REL
12412 || base_model == MEMMODEL_SEQ_CST))
12414 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12418 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12419 for the data in memory. EXPECTED is the value expected to be in memory.
12420 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12421 is the memory ordering to use. */
12423 void
12424 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12425 rtx expected, rtx desired,
12426 rtx model)
12428 rtx (*gen) (rtx, rtx, rtx, rtx);
12429 machine_mode mode;
12431 mode = GET_MODE (mem);
12433 switch (mode)
12435 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12436 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12437 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12438 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12439 default:
12440 gcc_unreachable ();
12443 /* Move the expected value into the CAS destination register. */
12444 emit_insn (gen_rtx_SET (rval, expected));
12446 /* Emit the CAS. */
12447 emit_insn (gen (rval, mem, desired, model));
12449 /* Compare the expected value with the value loaded by the CAS, to establish
12450 whether the swap was made. */
12451 aarch64_gen_compare_reg (EQ, rval, expected);
/* Split a compare and swap pattern into an explicit LL/SC loop.
   OPERANDS are: 0 = old-value output, 1 = memory, 2 = expected value,
   3 = desired value, 4 = weak flag, 5 = memory model, 7 = scratch
   register for the store-exclusive status.  */

void
aarch64_split_compare_and_swap (rtx operands[])
{
  rtx rval, mem, oldval, newval, scratch;
  machine_mode mode;
  bool is_weak;
  rtx_code_label *label1, *label2;
  rtx x, cond;
  enum memmodel model;
  rtx model_rtx;

  rval = operands[0];
  mem = operands[1];
  oldval = operands[2];
  newval = operands[3];
  is_weak = (operands[4] != const0_rtx);
  model_rtx = operands[5];
  scratch = operands[7];
  mode = GET_MODE (mem);
  model = memmodel_from_int (INTVAL (model_rtx));

  /* When OLDVAL is zero and we want the strong version we can emit a tighter
    loop:
    .label1:
	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
    .label2:
	CMP	rval, 0.  */
  bool strong_zero_p = !is_weak && oldval == const0_rtx;

  label1 = NULL;
  if (!is_weak)
    {
      /* Strong CAS retries the whole sequence on store failure.  */
      label1 = gen_label_rtx ();
      emit_label (label1);
    }
  label2 = gen_label_rtx ();

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_mm_sync (model))
    aarch64_emit_load_exclusive (mode, rval, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);

  if (strong_zero_p)
    {
      /* CBNZ rval, label2 — skip the store if the loaded value is
	 already non-zero (i.e. differs from the expected zero).  */
      x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    {
      /* General case: compare against OLDVAL and branch out on mismatch.  */
      cond = aarch64_gen_compare_reg (NE, rval, oldval);
      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }

  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);

  if (!is_weak)
    {
      /* Retry the exchange if the store-exclusive failed.  */
      x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    {
      /* Weak CAS does not loop; expose the store status in the flags.  */
      cond = gen_rtx_REG (CCmode, CC_REGNUM);
      x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
      emit_insn (gen_rtx_SET (cond, x));
    }

  emit_label (label2);
  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
     to set the condition flags.  If this is not used it will be removed by
     later passes.  */
  if (strong_zero_p)
    {
      cond = gen_rtx_REG (CCmode, CC_REGNUM);
      x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
      emit_insn (gen_rtx_SET (cond, x));
    }
  /* Emit any final barrier needed for a __sync operation.  */
  if (is_mm_sync (model))
    aarch64_emit_post_barrier (model);
}
/* Emit a BIC instruction: DST = S1 & ~(S2 >> SHIFT).  NOTE(review): the
   and_one_cmpl_lshr pattern takes its operands in the order
   (dst, s2, shift, s1), hence the argument order in the emit below —
   confirm against aarch64.md before reordering.  */

static void
aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
{
  rtx shift_rtx = GEN_INT (shift);
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
    case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, s2, shift_rtx, s1));
}
12570 /* Emit an atomic swap. */
12572 static void
12573 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12574 rtx mem, rtx model)
12576 rtx (*gen) (rtx, rtx, rtx, rtx);
12578 switch (mode)
12580 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12581 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12582 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12583 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12584 default:
12585 gcc_unreachable ();
12588 emit_insn (gen (dst, mem, value, model));
/* Operations supported by aarch64_emit_atomic_load_op.  Each selects one
   family of the atomic load-operate generator tables.  */

enum aarch64_atomic_load_op_code
{
  AARCH64_LDOP_PLUS,  /* A + B.  */
  AARCH64_LDOP_XOR,   /* A ^ B.  */
  AARCH64_LDOP_OR,    /* A | B.  */
  AARCH64_LDOP_BIC    /* A & ~B.  */
};
/* Emit an atomic load-operate.  CODE selects the operation family, MODE
   the access width.  DST receives the value read from MEM before the
   update, SRC is the second operand and MODEL the memory ordering.  */

static void
aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
			     machine_mode mode, rtx dst, rtx src,
			     rtx mem, rtx model)
{
  typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
  /* Generator tables, indexed by mode: QI, HI, SI, DI.  */
  const aarch64_atomic_load_op_fn plus[] =
  {
    gen_aarch64_atomic_loadaddqi,
    gen_aarch64_atomic_loadaddhi,
    gen_aarch64_atomic_loadaddsi,
    gen_aarch64_atomic_loadadddi
  };
  const aarch64_atomic_load_op_fn eor[] =
  {
    gen_aarch64_atomic_loadeorqi,
    gen_aarch64_atomic_loadeorhi,
    gen_aarch64_atomic_loadeorsi,
    gen_aarch64_atomic_loadeordi
  };
  const aarch64_atomic_load_op_fn ior[] =
  {
    gen_aarch64_atomic_loadsetqi,
    gen_aarch64_atomic_loadsethi,
    gen_aarch64_atomic_loadsetsi,
    gen_aarch64_atomic_loadsetdi
  };
  const aarch64_atomic_load_op_fn bic[] =
  {
    gen_aarch64_atomic_loadclrqi,
    gen_aarch64_atomic_loadclrhi,
    gen_aarch64_atomic_loadclrsi,
    gen_aarch64_atomic_loadclrdi
  };
  aarch64_atomic_load_op_fn gen;
  int idx = 0;

  /* Map the mode onto a table index.  */
  switch (mode)
    {
    case E_QImode: idx = 0; break;
    case E_HImode: idx = 1; break;
    case E_SImode: idx = 2; break;
    case E_DImode: idx = 3; break;
    default:
      gcc_unreachable ();
    }

  /* Pick the table matching the requested operation.  */
  switch (code)
    {
    case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
    case AARCH64_LDOP_XOR: gen = eor[idx]; break;
    case AARCH64_LDOP_OR: gen = ior[idx]; break;
    case AARCH64_LDOP_BIC: gen = bic[idx]; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, mem, src, model));
}
/* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
   location to store the data read from memory.  OUT_RESULT is the location to
   store the result of the operation.  MEM is the memory location to read and
   modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
   operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
   be NULL.  */

void
aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
			 rtx mem, rtx value, rtx model_rtx)
{
  machine_mode mode = GET_MODE (mem);
  /* Sub-word values are manipulated in a full SImode register.  */
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const bool short_mode = (mode < SImode);
  aarch64_atomic_load_op_code ldop_code;
  rtx src;
  rtx x;

  if (out_data)
    out_data = gen_lowpart (mode, out_data);

  if (out_result)
    out_result = gen_lowpart (mode, out_result);

  /* Make sure the value is in a register, putting it into a destination
     register if it needs to be manipulated.  */
  if (!register_operand (value, mode)
      || code == AND || code == MINUS)
    {
      src = out_result ? out_result : out_data;
      emit_move_insn (src, gen_lowpart (mode, value));
    }
  else
    src = value;
  gcc_assert (register_operand (src, mode));

  /* Preprocess the data for the operation as necessary.  If the operation is
     a SET then emit a swap instruction and finish.  */
  switch (code)
    {
    case SET:
      aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
      return;

    case MINUS:
      /* Negate the value and treat it as a PLUS.  */
      {
	rtx neg_src;

	/* Resize the value if necessary.  */
	if (short_mode)
	  src = gen_lowpart (wmode, src);

	neg_src = gen_rtx_NEG (wmode, src);
	emit_insn (gen_rtx_SET (src, neg_src));

	if (short_mode)
	  src = gen_lowpart (mode, src);
      }
      /* Fall-through.  */
    case PLUS:
      ldop_code = AARCH64_LDOP_PLUS;
      break;

    case IOR:
      ldop_code = AARCH64_LDOP_OR;
      break;

    case XOR:
      ldop_code = AARCH64_LDOP_XOR;
      break;

    case AND:
      /* AND is implemented by complementing SRC and using a BIC-style
	 load-clear.  */
      {
	rtx not_src;

	/* Resize the value if necessary.  */
	if (short_mode)
	  src = gen_lowpart (wmode, src);

	not_src = gen_rtx_NOT (wmode, src);
	emit_insn (gen_rtx_SET (src, not_src));

	if (short_mode)
	  src = gen_lowpart (mode, src);
      }
      ldop_code = AARCH64_LDOP_BIC;
      break;

    default:
      /* The operation can't be done with atomic instructions.  */
      gcc_unreachable ();
    }

  aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);

  /* If necessary, calculate the data in memory after the update by redoing the
     operation from values in registers.  */
  if (!out_result)
    return;

  if (short_mode)
    {
      src = gen_lowpart (wmode, src);
      out_data = gen_lowpart (wmode, out_data);
      out_result = gen_lowpart (wmode, out_result);
    }

  x = NULL_RTX;

  switch (code)
    {
    case MINUS:
    case PLUS:
      /* SRC was already negated for MINUS above, so PLUS is correct.  */
      x = gen_rtx_PLUS (wmode, out_data, src);
      break;
    case IOR:
      x = gen_rtx_IOR (wmode, out_data, src);
      break;
    case XOR:
      x = gen_rtx_XOR (wmode, out_data, src);
      break;
    case AND:
      /* SRC was complemented above, so BIC recovers the AND result.  */
      aarch64_emit_bic (wmode, out_result, out_data, src, 0);
      return;
    default:
      gcc_unreachable ();
    }

  emit_set_insn (out_result, x);

  return;
}
/* Split an atomic operation into an LL/SC loop.  CODE is the operation;
   OLD_OUT (may be NULL) receives the pre-operation value of MEM and
   NEW_OUT (may be NULL) the post-operation value.  VALUE is the second
   operand, MODEL_RTX the memory model and COND a scratch register that
   receives the store-exclusive status.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  machine_mode mode = GET_MODE (mem);
  /* Sub-word arithmetic is performed in a full SImode register.  */
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      /* Compute ~(old_out & value), i.e. a NAND-style update.  */
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      /* Subtracting a constant is cheaper as an addition of its
	 negation.  */
      if (CONST_INT_P (value))
	{
	  value = GEN_INT (-INTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  /* Loop back if the store-exclusive failed.  */
  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}
12870 static void
12871 aarch64_init_libfuncs (void)
12873 /* Half-precision float operations. The compiler handles all operations
12874 with NULL libfuncs by converting to SFmode. */
12876 /* Conversions. */
12877 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12878 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12880 /* Arithmetic. */
12881 set_optab_libfunc (add_optab, HFmode, NULL);
12882 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12883 set_optab_libfunc (smul_optab, HFmode, NULL);
12884 set_optab_libfunc (neg_optab, HFmode, NULL);
12885 set_optab_libfunc (sub_optab, HFmode, NULL);
12887 /* Comparisons. */
12888 set_optab_libfunc (eq_optab, HFmode, NULL);
12889 set_optab_libfunc (ne_optab, HFmode, NULL);
12890 set_optab_libfunc (lt_optab, HFmode, NULL);
12891 set_optab_libfunc (le_optab, HFmode, NULL);
12892 set_optab_libfunc (ge_optab, HFmode, NULL);
12893 set_optab_libfunc (gt_optab, HFmode, NULL);
12894 set_optab_libfunc (unord_optab, HFmode, NULL);
12897 /* Target hook for c_mode_for_suffix. */
12898 static machine_mode
12899 aarch64_c_mode_for_suffix (char suffix)
12901 if (suffix == 'q')
12902 return TFmode;
12904 return VOIDmode;
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

     (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */

/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */
bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  if (!CONST_DOUBLE_P (x))
    return false;

  /* We don't support HFmode constants yet.  */
  if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  /* The quarter-precision exponent field 'r' spans -3..4, which after
     the adjustment above maps onto 0..7.  */
  return (exponent >= 0 && exponent <= 7);
}
/* Output the assembly template for a SIMD immediate move of CONST_VECTOR
   in MODE with vector width WIDTH bits.  Returns a pointer to a static
   buffer, so the result is only valid until the next call.  */
char*
aarch64_output_simd_mov_immediate (rtx const_vector,
				   machine_mode mode,
				   unsigned width)
{
  bool is_valid;
  static char templ[40];
  const char *mnemonic;
  const char *shift_op;
  unsigned int lane_count = 0;
  char element_char;

  struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };

  /* This will return true to show const_vector is legal for use as either
     a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
     also update INFO to show how the immediate should be generated.  */
  is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
  gcc_assert (is_valid);

  element_char = sizetochar (info.element_width);
  lane_count = width / info.element_width;

  mode = GET_MODE_INNER (mode);
  if (GET_MODE_CLASS (mode) == MODE_FLOAT)
    {
      gcc_assert (info.shift == 0 && ! info.mvn);
      /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
	 move immediate path.  */
      if (aarch64_float_const_zero_rtx_p (info.value))
	info.value = GEN_INT (0);
      else
	{
	  /* Non-zero FP constant: emit an FMOV with the decimal
	     representation of the value.  */
	  const unsigned int buf_size = 20;
	  char float_buf[buf_size] = {'\0'};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.value),
				    buf_size, buf_size, 1, mode);

	  if (lane_count == 1)
	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
	  else
	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
		      lane_count, element_char, float_buf);
	  return templ;
	}
    }

  mnemonic = info.mvn ? "mvni" : "movi";
  shift_op = info.msl ? "msl" : "lsl";

  gcc_assert (CONST_INT_P (info.value));
  if (lane_count == 1)
    snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
	      mnemonic, UINTVAL (info.value));
  else if (info.shift)
    snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
	      ", %s %d", mnemonic, lane_count, element_char,
	      UINTVAL (info.value), shift_op, info.shift);
  else
    snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
	      mnemonic, lane_count, element_char, UINTVAL (info.value));
  return templ;
}
/* Output the assembly template for a scalar SIMD immediate move of
   IMMEDIATE in MODE, by duplicating the value into a vector and reusing
   the vector immediate-move path.  */
char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
{

  /* If a floating point number was passed and we desire to use it in an
     integer mode do the conversion to integer.  */
  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
    {
      unsigned HOST_WIDE_INT ival;
      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
	gcc_unreachable ();
      immediate = gen_int_mode (ival, mode);
    }

  machine_mode vmode;
  /* use a 64 bit mode for everything except for DI/DF mode, where we use
     a 128 bit vector mode.  */
  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (mode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
}
/* Split operands into moves from op[1] + op[2] into op[0]: the two
   V16QImode source registers become the low and high halves of the
   destination register pair.  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  unsigned int dest = REGNO (operands[0]);
  unsigned int src1 = REGNO (operands[1]);
  unsigned int src2 = REGNO (operands[2]);
  machine_mode halfmode = GET_MODE (operands[1]);
  unsigned int halfregs = REG_NREGS (operands[1]);
  rtx destlo, desthi;

  gcc_assert (halfmode == V16QImode);

  if (src1 == dest && src2 == dest + halfregs)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Preserve register attributes for variable tracking.  */
  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
			       GET_MODE_SIZE (halfmode));

  /* Special case of reversed high/low parts: swap the two sources with
     the three-XOR trick, avoiding a temporary register.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      /* The low destination overlaps the second source, so move the
	 high half first.  */
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
    }
}
/* vec_perm support.  */

#define MAX_VECT_LEN 16

/* Description of one vector-permute expansion request, shared by the
   aarch64_evpc_* recognizers.  */
struct expand_vec_perm_d
{
  rtx target, op0, op1;			/* Destination and input vectors.  */
  unsigned char perm[MAX_VECT_LEN];	/* Requested element selection.  */
  machine_mode vmode;			/* Vector mode of the operands.  */
  unsigned char nelt;			/* Number of elements per vector.  */
  bool one_vector_p;			/* True if op0 and op1 are the same.  */
  bool testing_p;			/* True: only test, emit nothing.  */
};
/* Generate a variable permutation: emit a TBL of OP0/OP1 under the byte
   selector SEL into TARGET.  All operands must be V8QImode or
   V16QImode.  */

static void
aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);

  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
  gcc_checking_assert (GET_MODE (op0) == vmode);
  gcc_checking_assert (GET_MODE (op1) == vmode);
  gcc_checking_assert (GET_MODE (sel) == vmode);
  gcc_checking_assert (TARGET_SIMD);

  if (one_vector_p)
    {
      if (vmode == V8QImode)
	{
	  /* Expand the argument to a V16QI mode by duplicating it.  */
	  rtx pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
	}
      else
	{
	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
	}
    }
  else
    {
      rtx pair;

      if (vmode == V8QImode)
	{
	  /* Two V8QI inputs fit in a single V16QI TBL table.  */
	  pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
	}
      else
	{
	  /* Two V16QI inputs need a two-register (OImode) TBL table.  */
	  pair = gen_reg_rtx (OImode);
	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
	}
    }
}
/* Expand a variable vector permute of OP0/OP1 by selector SEL into
   TARGET, wrapping out-of-range selector values modulo the element
   count before delegating to the TBL expander.  */
void
aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode vmode = GET_MODE (target);
  unsigned int nelt = GET_MODE_NUNITS (vmode);
  bool one_vector_p = rtx_equal_p (op0, op1);
  rtx mask;

  /* The TBL instruction does not use a modulo index, so we must take care
     of that ourselves.  */
  mask = aarch64_simd_gen_const_vector_dup (vmode,
					    one_vector_p ? nelt - 1 : 2 * nelt - 1);
  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);

  /* For big-endian, we also need to reverse the index within the vector
     (but not which vector).  */
  if (BYTES_BIG_ENDIAN)
    {
      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
      if (!one_vector_p)
	mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
      sel = expand_simple_binop (vmode, XOR, sel, mask,
				 NULL, 0, OPTAB_LIB_WIDEN);
    }
  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
}
/* Recognize patterns suitable for the TRN instructions.  Returns true
   (and, unless D->testing_p, emits a TRN1/TRN2) if D describes a
   transpose-style interleave of even or odd elements.  */
static bool
aarch64_evpc_trn (struct expand_vec_perm_d *d)
{
  unsigned int i, odd, mask, nelt = d->nelt;
  rtx out, in0, in1, x;
  rtx (*gen) (rtx, rtx, rtx);
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (d->perm[0] == 0)
    odd = 0;
  else if (d->perm[0] == 1)
    odd = 1;
  else
    return false;
  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);

  /* Check each pair of the permutation matches the TRN pattern:
     { odd, nelt + odd, 2 + odd, nelt + 2 + odd, ... } (modulo MASK for
     the single-vector case).  */
  for (i = 0; i < nelt; i += 2)
    {
      if (d->perm[i] != i + odd)
	return false;
      if (d->perm[i + 1] != ((i + nelt + odd) & mask))
	return false;
    }

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  if (BYTES_BIG_ENDIAN)
    {
      /* Swap the inputs and the even/odd selector to compensate for the
	 reversed element numbering on big-endian.  */
      x = in0, in0 = in1, in1 = x;
      odd = !odd;
    }
  out = d->target;

  if (odd)
    {
      switch (vmode)
	{
	case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
	case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
	case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
	case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
	case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
	case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
	case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
	case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
	case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
	case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
	case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
	case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
	default:
	  return false;
	}
    }
  else
    {
      switch (vmode)
	{
	case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
	case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
	case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
	case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
	case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
	case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
	case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
	case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
	case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
	case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
	case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
	case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
	default:
	  return false;
	}
    }

  emit_insn (gen (out, in0, in1));
  return true;
}
13316 /* Recognize patterns suitable for the UZP instructions. */
13317 static bool
13318 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13320 unsigned int i, odd, mask, nelt = d->nelt;
13321 rtx out, in0, in1, x;
13322 rtx (*gen) (rtx, rtx, rtx);
13323 machine_mode vmode = d->vmode;
13325 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13326 return false;
13328 /* Note that these are little-endian tests.
13329 We correct for big-endian later. */
13330 if (d->perm[0] == 0)
13331 odd = 0;
13332 else if (d->perm[0] == 1)
13333 odd = 1;
13334 else
13335 return false;
13336 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13338 for (i = 0; i < nelt; i++)
13340 unsigned elt = (i * 2 + odd) & mask;
13341 if (d->perm[i] != elt)
13342 return false;
13345 /* Success! */
13346 if (d->testing_p)
13347 return true;
13349 in0 = d->op0;
13350 in1 = d->op1;
13351 if (BYTES_BIG_ENDIAN)
13353 x = in0, in0 = in1, in1 = x;
13354 odd = !odd;
13356 out = d->target;
13358 if (odd)
13360 switch (vmode)
13362 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13363 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13364 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13365 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13366 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13367 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13368 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13369 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13370 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13371 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13372 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13373 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13374 default:
13375 return false;
13378 else
13380 switch (vmode)
13382 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13383 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13384 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13385 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13386 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13387 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13388 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13389 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13390 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13391 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13392 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13393 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13394 default:
13395 return false;
13399 emit_insn (gen (out, in0, in1));
13400 return true;
13403 /* Recognize patterns suitable for the ZIP instructions. */
13404 static bool
13405 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13407 unsigned int i, high, mask, nelt = d->nelt;
13408 rtx out, in0, in1, x;
13409 rtx (*gen) (rtx, rtx, rtx);
13410 machine_mode vmode = d->vmode;
13412 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13413 return false;
13415 /* Note that these are little-endian tests.
13416 We correct for big-endian later. */
13417 high = nelt / 2;
13418 if (d->perm[0] == high)
13419 /* Do Nothing. */
13421 else if (d->perm[0] == 0)
13422 high = 0;
13423 else
13424 return false;
13425 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13427 for (i = 0; i < nelt / 2; i++)
13429 unsigned elt = (i + high) & mask;
13430 if (d->perm[i * 2] != elt)
13431 return false;
13432 elt = (elt + nelt) & mask;
13433 if (d->perm[i * 2 + 1] != elt)
13434 return false;
13437 /* Success! */
13438 if (d->testing_p)
13439 return true;
13441 in0 = d->op0;
13442 in1 = d->op1;
13443 if (BYTES_BIG_ENDIAN)
13445 x = in0, in0 = in1, in1 = x;
13446 high = !high;
13448 out = d->target;
13450 if (high)
13452 switch (vmode)
13454 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13455 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13456 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13457 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13458 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13459 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13460 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13461 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13462 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13463 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13464 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13465 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13466 default:
13467 return false;
13470 else
13472 switch (vmode)
13474 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13475 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13476 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13477 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13478 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13479 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13480 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13481 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13482 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13483 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13484 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13485 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13486 default:
13487 return false;
13491 emit_insn (gen (out, in0, in1));
13492 return true;
13495 /* Recognize patterns for the EXT insn. */
13497 static bool
13498 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13500 unsigned int i, nelt = d->nelt;
13501 rtx (*gen) (rtx, rtx, rtx, rtx);
13502 rtx offset;
13504 unsigned int location = d->perm[0]; /* Always < nelt. */
13506 /* Check if the extracted indices are increasing by one. */
13507 for (i = 1; i < nelt; i++)
13509 unsigned int required = location + i;
13510 if (d->one_vector_p)
13512 /* We'll pass the same vector in twice, so allow indices to wrap. */
13513 required &= (nelt - 1);
13515 if (d->perm[i] != required)
13516 return false;
13519 switch (d->vmode)
13521 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13522 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13523 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13524 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13525 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13526 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13527 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13528 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13529 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13530 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13531 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13532 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13533 default:
13534 return false;
13537 /* Success! */
13538 if (d->testing_p)
13539 return true;
13541 /* The case where (location == 0) is a no-op for both big- and little-endian,
13542 and is removed by the mid-end at optimization levels -O1 and higher. */
13544 if (BYTES_BIG_ENDIAN && (location != 0))
13546 /* After setup, we want the high elements of the first vector (stored
13547 at the LSB end of the register), and the low elements of the second
13548 vector (stored at the MSB end of the register). So swap. */
13549 std::swap (d->op0, d->op1);
13550 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13551 location = nelt - location;
13554 offset = GEN_INT (location);
13555 emit_insn (gen (d->target, d->op0, d->op1, offset));
13556 return true;
13559 /* Recognize patterns for the REV insns. */
13561 static bool
13562 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13564 unsigned int i, j, diff, nelt = d->nelt;
13565 rtx (*gen) (rtx, rtx);
13567 if (!d->one_vector_p)
13568 return false;
13570 diff = d->perm[0];
13571 switch (diff)
13573 case 7:
13574 switch (d->vmode)
13576 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13577 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13578 default:
13579 return false;
13581 break;
13582 case 3:
13583 switch (d->vmode)
13585 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13586 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13587 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13588 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13589 default:
13590 return false;
13592 break;
13593 case 1:
13594 switch (d->vmode)
13596 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13597 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13598 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13599 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13600 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13601 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13602 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13603 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13604 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13605 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13606 default:
13607 return false;
13609 break;
13610 default:
13611 return false;
13614 for (i = 0; i < nelt ; i += diff + 1)
13615 for (j = 0; j <= diff; j += 1)
13617 /* This is guaranteed to be true as the value of diff
13618 is 7, 3, 1 and we should have enough elements in the
13619 queue to generate this. Getting a vector mask with a
13620 value of diff other than these values implies that
13621 something is wrong by the time we get here. */
13622 gcc_assert (i + j < nelt);
13623 if (d->perm[i + j] != i + diff - j)
13624 return false;
13627 /* Success! */
13628 if (d->testing_p)
13629 return true;
13631 emit_insn (gen (d->target, d->op0));
13632 return true;
13635 static bool
13636 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13638 rtx (*gen) (rtx, rtx, rtx);
13639 rtx out = d->target;
13640 rtx in0;
13641 machine_mode vmode = d->vmode;
13642 unsigned int i, elt, nelt = d->nelt;
13643 rtx lane;
13645 elt = d->perm[0];
13646 for (i = 1; i < nelt; i++)
13648 if (elt != d->perm[i])
13649 return false;
13652 /* The generic preparation in aarch64_expand_vec_perm_const_1
13653 swaps the operand order and the permute indices if it finds
13654 d->perm[0] to be in the second operand. Thus, we can always
13655 use d->op0 and need not do any extra arithmetic to get the
13656 correct lane number. */
13657 in0 = d->op0;
13658 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13660 switch (vmode)
13662 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13663 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13664 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13665 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13666 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13667 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13668 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13669 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13670 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13671 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13672 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13673 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13674 default:
13675 return false;
13678 emit_insn (gen (out, in0, lane));
13679 return true;
13682 static bool
13683 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13685 rtx rperm[MAX_VECT_LEN], sel;
13686 machine_mode vmode = d->vmode;
13687 unsigned int i, nelt = d->nelt;
13689 if (d->testing_p)
13690 return true;
13692 /* Generic code will try constant permutation twice. Once with the
13693 original mode and again with the elements lowered to QImode.
13694 So wait and don't do the selector expansion ourselves. */
13695 if (vmode != V8QImode && vmode != V16QImode)
13696 return false;
13698 for (i = 0; i < nelt; ++i)
13700 int nunits = GET_MODE_NUNITS (vmode);
13702 /* If big-endian and two vectors we end up with a weird mixed-endian
13703 mode on NEON. Reverse the index within each word but not the word
13704 itself. */
13705 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13706 : d->perm[i]);
13708 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13709 sel = force_reg (vmode, sel);
13711 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13712 return true;
13715 static bool
13716 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13718 /* The pattern matching functions above are written to look for a small
13719 number to begin the sequence (0, 1, N/2). If we begin with an index
13720 from the second operand, we can swap the operands. */
13721 if (d->perm[0] >= d->nelt)
13723 unsigned i, nelt = d->nelt;
13725 gcc_assert (nelt == (nelt & -nelt));
13726 for (i = 0; i < nelt; ++i)
13727 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13729 std::swap (d->op0, d->op1);
13732 if (TARGET_SIMD)
13734 if (aarch64_evpc_rev (d))
13735 return true;
13736 else if (aarch64_evpc_ext (d))
13737 return true;
13738 else if (aarch64_evpc_dup (d))
13739 return true;
13740 else if (aarch64_evpc_zip (d))
13741 return true;
13742 else if (aarch64_evpc_uzp (d))
13743 return true;
13744 else if (aarch64_evpc_trn (d))
13745 return true;
13746 return aarch64_evpc_tbl (d);
13748 return false;
13751 /* Expand a vec_perm_const pattern. */
13753 bool
13754 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13756 struct expand_vec_perm_d d;
13757 int i, nelt, which;
13759 d.target = target;
13760 d.op0 = op0;
13761 d.op1 = op1;
13763 d.vmode = GET_MODE (target);
13764 gcc_assert (VECTOR_MODE_P (d.vmode));
13765 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13766 d.testing_p = false;
13768 for (i = which = 0; i < nelt; ++i)
13770 rtx e = XVECEXP (sel, 0, i);
13771 int ei = INTVAL (e) & (2 * nelt - 1);
13772 which |= (ei < nelt ? 1 : 2);
13773 d.perm[i] = ei;
13776 switch (which)
13778 default:
13779 gcc_unreachable ();
13781 case 3:
13782 d.one_vector_p = false;
13783 if (!rtx_equal_p (op0, op1))
13784 break;
13786 /* The elements of PERM do not suggest that only the first operand
13787 is used, but both operands are identical. Allow easier matching
13788 of the permutation by folding the permutation into the single
13789 input vector. */
13790 /* Fall Through. */
13791 case 2:
13792 for (i = 0; i < nelt; ++i)
13793 d.perm[i] &= nelt - 1;
13794 d.op0 = op1;
13795 d.one_vector_p = true;
13796 break;
13798 case 1:
13799 d.op1 = op0;
13800 d.one_vector_p = true;
13801 break;
13804 return aarch64_expand_vec_perm_const_1 (&d);
13807 static bool
13808 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13809 const unsigned char *sel)
13811 struct expand_vec_perm_d d;
13812 unsigned int i, nelt, which;
13813 bool ret;
13815 d.vmode = vmode;
13816 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13817 d.testing_p = true;
13818 memcpy (d.perm, sel, nelt);
13820 /* Calculate whether all elements are in one vector. */
13821 for (i = which = 0; i < nelt; ++i)
13823 unsigned char e = d.perm[i];
13824 gcc_assert (e < 2 * nelt);
13825 which |= (e < nelt ? 1 : 2);
13828 /* If all elements are from the second vector, reindex as if from the
13829 first vector. */
13830 if (which == 2)
13831 for (i = 0; i < nelt; ++i)
13832 d.perm[i] -= nelt;
13834 /* Check whether the mask can be applied to a single vector. */
13835 d.one_vector_p = (which != 3);
13837 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13838 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13839 if (!d.one_vector_p)
13840 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13842 start_sequence ();
13843 ret = aarch64_expand_vec_perm_const_1 (&d);
13844 end_sequence ();
13846 return ret;
13850 aarch64_reverse_mask (machine_mode mode)
13852 /* We have to reverse each vector because we dont have
13853 a permuted load that can reverse-load according to ABI rules. */
13854 rtx mask;
13855 rtvec v = rtvec_alloc (16);
13856 int i, j;
13857 int nunits = GET_MODE_NUNITS (mode);
13858 int usize = GET_MODE_UNIT_SIZE (mode);
13860 gcc_assert (BYTES_BIG_ENDIAN);
13861 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13863 for (i = 0; i < nunits; i++)
13864 for (j = 0; j < usize; j++)
13865 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13866 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13867 return force_reg (V16QImode, mask);
13870 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13871 true. However due to issues with register allocation it is preferable
13872 to avoid tieing integer scalar and FP scalar modes. Executing integer
13873 operations in general registers is better than treating them as scalar
13874 vector operations. This reduces latency and avoids redundant int<->FP
13875 moves. So tie modes if they are either the same class, or vector modes
13876 with other vector modes, vector structs or any scalar mode. */
13878 static bool
13879 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13881 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13882 return true;
13884 /* We specifically want to allow elements of "structure" modes to
13885 be tieable to the structure. This more general condition allows
13886 other rarer situations too. */
13887 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13888 return true;
13890 /* Also allow any scalar modes with vectors. */
13891 if (aarch64_vector_mode_supported_p (mode1)
13892 || aarch64_vector_mode_supported_p (mode2))
13893 return true;
13895 return false;
13898 /* Return a new RTX holding the result of moving POINTER forward by
13899 AMOUNT bytes. */
13901 static rtx
13902 aarch64_move_pointer (rtx pointer, int amount)
13904 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13906 return adjust_automodify_address (pointer, GET_MODE (pointer),
13907 next, amount);
13910 /* Return a new RTX holding the result of moving POINTER forward by the
13911 size of the mode it points to. */
13913 static rtx
13914 aarch64_progress_pointer (rtx pointer)
13916 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13918 return aarch64_move_pointer (pointer, amount);
13921 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13922 MODE bytes. */
13924 static void
13925 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13926 machine_mode mode)
13928 rtx reg = gen_reg_rtx (mode);
13930 /* "Cast" the pointers to the correct mode. */
13931 *src = adjust_address (*src, mode, 0);
13932 *dst = adjust_address (*dst, mode, 0);
13933 /* Emit the memcpy. */
13934 emit_move_insn (reg, *src);
13935 emit_move_insn (*dst, reg);
13936 /* Move the pointers forward. */
13937 *src = aarch64_progress_pointer (*src);
13938 *dst = aarch64_progress_pointer (*dst);
13941 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13942 we succeed, otherwise return false. */
13944 bool
13945 aarch64_expand_movmem (rtx *operands)
13947 unsigned int n;
13948 rtx dst = operands[0];
13949 rtx src = operands[1];
13950 rtx base;
13951 bool speed_p = !optimize_function_for_size_p (cfun);
13953 /* When optimizing for size, give a better estimate of the length of a
13954 memcpy call, but use the default otherwise. */
13955 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13957 /* We can't do anything smart if the amount to copy is not constant. */
13958 if (!CONST_INT_P (operands[2]))
13959 return false;
13961 n = UINTVAL (operands[2]);
13963 /* Try to keep the number of instructions low. For cases below 16 bytes we
13964 need to make at most two moves. For cases above 16 bytes it will be one
13965 move for each 16 byte chunk, then at most two additional moves. */
13966 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13967 return false;
13969 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13970 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13972 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13973 src = adjust_automodify_address (src, VOIDmode, base, 0);
13975 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13976 1-byte chunk. */
13977 if (n < 4)
13979 if (n >= 2)
13981 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13982 n -= 2;
13985 if (n == 1)
13986 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13988 return true;
13991 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13992 4-byte chunk, partially overlapping with the previously copied chunk. */
13993 if (n < 8)
13995 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13996 n -= 4;
13997 if (n > 0)
13999 int move = n - 4;
14001 src = aarch64_move_pointer (src, move);
14002 dst = aarch64_move_pointer (dst, move);
14003 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14005 return true;
14008 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14009 them, then (if applicable) an 8-byte chunk. */
14010 while (n >= 8)
14012 if (n / 16)
14014 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14015 n -= 16;
14017 else
14019 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14020 n -= 8;
14024 /* Finish the final bytes of the copy. We can always do this in one
14025 instruction. We either copy the exact amount we need, or partially
14026 overlap with the previous chunk we copied and copy 8-bytes. */
14027 if (n == 0)
14028 return true;
14029 else if (n == 1)
14030 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14031 else if (n == 2)
14032 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14033 else if (n == 4)
14034 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14035 else
14037 if (n == 3)
14039 src = aarch64_move_pointer (src, -1);
14040 dst = aarch64_move_pointer (dst, -1);
14041 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14043 else
14045 int move = n - 8;
14047 src = aarch64_move_pointer (src, move);
14048 dst = aarch64_move_pointer (dst, move);
14049 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14053 return true;
14056 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14057 SImode stores. Handle the case when the constant has identical
14058 bottom and top halves. This is beneficial when the two stores can be
14059 merged into an STP and we avoid synthesising potentially expensive
14060 immediates twice. Return true if such a split is possible. */
14062 bool
14063 aarch64_split_dimode_const_store (rtx dst, rtx src)
14065 rtx lo = gen_lowpart (SImode, src);
14066 rtx hi = gen_highpart_mode (SImode, DImode, src);
14068 bool size_p = optimize_function_for_size_p (cfun);
14070 if (!rtx_equal_p (lo, hi))
14071 return false;
14073 unsigned int orig_cost
14074 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14075 unsigned int lo_cost
14076 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14078 /* We want to transform:
14079 MOV x1, 49370
14080 MOVK x1, 0x140, lsl 16
14081 MOVK x1, 0xc0da, lsl 32
14082 MOVK x1, 0x140, lsl 48
14083 STR x1, [x0]
14084 into:
14085 MOV w1, 49370
14086 MOVK w1, 0x140, lsl 16
14087 STP w1, w1, [x0]
14088 So we want to perform this only when we save two instructions
14089 or more. When optimizing for size, however, accept any code size
14090 savings we can. */
14091 if (size_p && orig_cost <= lo_cost)
14092 return false;
14094 if (!size_p
14095 && (orig_cost <= lo_cost + 1))
14096 return false;
14098 rtx mem_lo = adjust_address (dst, SImode, 0);
14099 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14100 return false;
14102 rtx tmp_reg = gen_reg_rtx (SImode);
14103 aarch64_expand_mov_immediate (tmp_reg, lo);
14104 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14105 /* Don't emit an explicit store pair as this may not be always profitable.
14106 Let the sched-fusion logic decide whether to merge them. */
14107 emit_move_insn (mem_lo, tmp_reg);
14108 emit_move_insn (mem_hi, tmp_reg);
14110 return true;
14113 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14115 static unsigned HOST_WIDE_INT
14116 aarch64_asan_shadow_offset (void)
14118 return (HOST_WIDE_INT_1 << 36);
14121 static bool
14122 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14123 unsigned int align,
14124 enum by_pieces_operation op,
14125 bool speed_p)
14127 /* STORE_BY_PIECES can be used when copying a constant string, but
14128 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14129 For now we always fail this and let the move_by_pieces code copy
14130 the string from read-only memory. */
14131 if (op == STORE_BY_PIECES)
14132 return false;
14134 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14137 static rtx
14138 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14139 int code, tree treeop0, tree treeop1)
14141 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14142 rtx op0, op1;
14143 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14144 insn_code icode;
14145 struct expand_operand ops[4];
14147 start_sequence ();
14148 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14150 op_mode = GET_MODE (op0);
14151 if (op_mode == VOIDmode)
14152 op_mode = GET_MODE (op1);
14154 switch (op_mode)
14156 case E_QImode:
14157 case E_HImode:
14158 case E_SImode:
14159 cmp_mode = SImode;
14160 icode = CODE_FOR_cmpsi;
14161 break;
14163 case E_DImode:
14164 cmp_mode = DImode;
14165 icode = CODE_FOR_cmpdi;
14166 break;
14168 case E_SFmode:
14169 cmp_mode = SFmode;
14170 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14171 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14172 break;
14174 case E_DFmode:
14175 cmp_mode = DFmode;
14176 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14177 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14178 break;
14180 default:
14181 end_sequence ();
14182 return NULL_RTX;
14185 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14186 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14187 if (!op0 || !op1)
14189 end_sequence ();
14190 return NULL_RTX;
14192 *prep_seq = get_insns ();
14193 end_sequence ();
14195 create_fixed_operand (&ops[0], op0);
14196 create_fixed_operand (&ops[1], op1);
14198 start_sequence ();
14199 if (!maybe_expand_insn (icode, 2, ops))
14201 end_sequence ();
14202 return NULL_RTX;
14204 *gen_seq = get_insns ();
14205 end_sequence ();
14207 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14208 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14211 static rtx
14212 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14213 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14215 rtx op0, op1, target;
14216 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14217 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14218 insn_code icode;
14219 struct expand_operand ops[6];
14220 int aarch64_cond;
14222 push_to_sequence (*prep_seq);
14223 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14225 op_mode = GET_MODE (op0);
14226 if (op_mode == VOIDmode)
14227 op_mode = GET_MODE (op1);
14229 switch (op_mode)
14231 case E_QImode:
14232 case E_HImode:
14233 case E_SImode:
14234 cmp_mode = SImode;
14235 icode = CODE_FOR_ccmpsi;
14236 break;
14238 case E_DImode:
14239 cmp_mode = DImode;
14240 icode = CODE_FOR_ccmpdi;
14241 break;
14243 case E_SFmode:
14244 cmp_mode = SFmode;
14245 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14246 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14247 break;
14249 case E_DFmode:
14250 cmp_mode = DFmode;
14251 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14252 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14253 break;
14255 default:
14256 end_sequence ();
14257 return NULL_RTX;
14260 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14261 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14262 if (!op0 || !op1)
14264 end_sequence ();
14265 return NULL_RTX;
14267 *prep_seq = get_insns ();
14268 end_sequence ();
14270 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14271 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14273 if (bit_code != AND)
14275 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14276 GET_MODE (XEXP (prev, 0))),
14277 VOIDmode, XEXP (prev, 0), const0_rtx);
14278 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14281 create_fixed_operand (&ops[0], XEXP (prev, 0));
14282 create_fixed_operand (&ops[1], target);
14283 create_fixed_operand (&ops[2], op0);
14284 create_fixed_operand (&ops[3], op1);
14285 create_fixed_operand (&ops[4], prev);
14286 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14288 push_to_sequence (*gen_seq);
14289 if (!maybe_expand_insn (icode, 6, ops))
14291 end_sequence ();
14292 return NULL_RTX;
14295 *gen_seq = get_insns ();
14296 end_sequence ();
14298 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14301 #undef TARGET_GEN_CCMP_FIRST
14302 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14304 #undef TARGET_GEN_CCMP_NEXT
14305 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14307 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14308 instruction fusion of some sort. */
14310 static bool
14311 aarch64_macro_fusion_p (void)
14313 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14317 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14318 should be kept together during scheduling. */
14320 static bool
14321 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14323 rtx set_dest;
14324 rtx prev_set = single_set (prev);
14325 rtx curr_set = single_set (curr);
14326 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14327 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14329 if (!aarch64_macro_fusion_p ())
14330 return false;
14332 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14334 /* We are trying to match:
14335 prev (mov) == (set (reg r0) (const_int imm16))
14336 curr (movk) == (set (zero_extract (reg r0)
14337 (const_int 16)
14338 (const_int 16))
14339 (const_int imm16_1)) */
14341 set_dest = SET_DEST (curr_set);
14343 if (GET_CODE (set_dest) == ZERO_EXTRACT
14344 && CONST_INT_P (SET_SRC (curr_set))
14345 && CONST_INT_P (SET_SRC (prev_set))
14346 && CONST_INT_P (XEXP (set_dest, 2))
14347 && INTVAL (XEXP (set_dest, 2)) == 16
14348 && REG_P (XEXP (set_dest, 0))
14349 && REG_P (SET_DEST (prev_set))
14350 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14352 return true;
14356 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14359 /* We're trying to match:
14360 prev (adrp) == (set (reg r1)
14361 (high (symbol_ref ("SYM"))))
14362 curr (add) == (set (reg r0)
14363 (lo_sum (reg r1)
14364 (symbol_ref ("SYM"))))
14365 Note that r0 need not necessarily be the same as r1, especially
14366 during pre-regalloc scheduling. */
14368 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14369 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14371 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14372 && REG_P (XEXP (SET_SRC (curr_set), 0))
14373 && REGNO (XEXP (SET_SRC (curr_set), 0))
14374 == REGNO (SET_DEST (prev_set))
14375 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14376 XEXP (SET_SRC (curr_set), 1)))
14377 return true;
14381 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14384 /* We're trying to match:
14385 prev (movk) == (set (zero_extract (reg r0)
14386 (const_int 16)
14387 (const_int 32))
14388 (const_int imm16_1))
14389 curr (movk) == (set (zero_extract (reg r0)
14390 (const_int 16)
14391 (const_int 48))
14392 (const_int imm16_2)) */
14394 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14395 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14396 && REG_P (XEXP (SET_DEST (prev_set), 0))
14397 && REG_P (XEXP (SET_DEST (curr_set), 0))
14398 && REGNO (XEXP (SET_DEST (prev_set), 0))
14399 == REGNO (XEXP (SET_DEST (curr_set), 0))
14400 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14401 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14402 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14403 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14404 && CONST_INT_P (SET_SRC (prev_set))
14405 && CONST_INT_P (SET_SRC (curr_set)))
14406 return true;
14409 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14411 /* We're trying to match:
14412 prev (adrp) == (set (reg r0)
14413 (high (symbol_ref ("SYM"))))
14414 curr (ldr) == (set (reg r1)
14415 (mem (lo_sum (reg r0)
14416 (symbol_ref ("SYM")))))
14418 curr (ldr) == (set (reg r1)
14419 (zero_extend (mem
14420 (lo_sum (reg r0)
14421 (symbol_ref ("SYM")))))) */
14422 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14423 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14425 rtx curr_src = SET_SRC (curr_set);
14427 if (GET_CODE (curr_src) == ZERO_EXTEND)
14428 curr_src = XEXP (curr_src, 0);
14430 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14431 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14432 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14433 == REGNO (SET_DEST (prev_set))
14434 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14435 XEXP (SET_SRC (prev_set), 0)))
14436 return true;
14440 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14441 && aarch_crypto_can_dual_issue (prev, curr))
14442 return true;
14444 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14445 && any_condjump_p (curr))
14447 enum attr_type prev_type = get_attr_type (prev);
14449 unsigned int condreg1, condreg2;
14450 rtx cc_reg_1;
14451 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14452 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14454 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14455 && prev
14456 && modified_in_p (cc_reg_1, prev))
14458 /* FIXME: this misses some which is considered simple arthematic
14459 instructions for ThunderX. Simple shifts are missed here. */
14460 if (prev_type == TYPE_ALUS_SREG
14461 || prev_type == TYPE_ALUS_IMM
14462 || prev_type == TYPE_LOGICS_REG
14463 || prev_type == TYPE_LOGICS_IMM)
14464 return true;
14468 if (prev_set
14469 && curr_set
14470 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14471 && any_condjump_p (curr))
14473 /* We're trying to match:
14474 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14475 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14476 (const_int 0))
14477 (label_ref ("SYM"))
14478 (pc)) */
14479 if (SET_DEST (curr_set) == (pc_rtx)
14480 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14481 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14482 && REG_P (SET_DEST (prev_set))
14483 && REGNO (SET_DEST (prev_set))
14484 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14486 /* Fuse ALU operations followed by conditional branch instruction. */
14487 switch (get_attr_type (prev))
14489 case TYPE_ALU_IMM:
14490 case TYPE_ALU_SREG:
14491 case TYPE_ADC_REG:
14492 case TYPE_ADC_IMM:
14493 case TYPE_ADCS_REG:
14494 case TYPE_ADCS_IMM:
14495 case TYPE_LOGIC_REG:
14496 case TYPE_LOGIC_IMM:
14497 case TYPE_CSEL:
14498 case TYPE_ADR:
14499 case TYPE_MOV_IMM:
14500 case TYPE_SHIFT_REG:
14501 case TYPE_SHIFT_IMM:
14502 case TYPE_BFM:
14503 case TYPE_RBIT:
14504 case TYPE_REV:
14505 case TYPE_EXTEND:
14506 return true;
14508 default:;
14513 return false;
14516 /* Return true iff the instruction fusion described by OP is enabled. */
14518 bool
14519 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14521 return (aarch64_tune_params.fusible_ops & op) != 0;
14524 /* If MEM is in the form of [base+offset], extract the two parts
14525 of address and set to BASE and OFFSET, otherwise return false
14526 after clearing BASE and OFFSET. */
14528 bool
14529 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14531 rtx addr;
14533 gcc_assert (MEM_P (mem));
14535 addr = XEXP (mem, 0);
14537 if (REG_P (addr))
14539 *base = addr;
14540 *offset = const0_rtx;
14541 return true;
14544 if (GET_CODE (addr) == PLUS
14545 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14547 *base = XEXP (addr, 0);
14548 *offset = XEXP (addr, 1);
14549 return true;
14552 *base = NULL_RTX;
14553 *offset = NULL_RTX;
14555 return false;
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,	/* Not a fusible load/store.  */
  SCHED_FUSION_LD_SIGN_EXTEND,	/* Sign-extending load.  */
  SCHED_FUSION_LD_ZERO_EXTEND,	/* Zero-extending load.  */
  SCHED_FUSION_LD,		/* Plain load.  */
  SCHED_FUSION_ST,		/* Store.  */
  SCHED_FUSION_NUM		/* Number of fusion types.  */
};
14569 /* If INSN is a load or store of address in the form of [base+offset],
14570 extract the two parts and set to BASE and OFFSET. Return scheduling
14571 fusion type this INSN is. */
14573 static enum sched_fusion_type
14574 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14576 rtx x, dest, src;
14577 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14579 gcc_assert (INSN_P (insn));
14580 x = PATTERN (insn);
14581 if (GET_CODE (x) != SET)
14582 return SCHED_FUSION_NONE;
14584 src = SET_SRC (x);
14585 dest = SET_DEST (x);
14587 machine_mode dest_mode = GET_MODE (dest);
14589 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14590 return SCHED_FUSION_NONE;
14592 if (GET_CODE (src) == SIGN_EXTEND)
14594 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14595 src = XEXP (src, 0);
14596 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14597 return SCHED_FUSION_NONE;
14599 else if (GET_CODE (src) == ZERO_EXTEND)
14601 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14602 src = XEXP (src, 0);
14603 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14604 return SCHED_FUSION_NONE;
14607 if (GET_CODE (src) == MEM && REG_P (dest))
14608 extract_base_offset_in_addr (src, base, offset);
14609 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14611 fusion = SCHED_FUSION_ST;
14612 extract_base_offset_in_addr (dest, base, offset);
14614 else
14615 return SCHED_FUSION_NONE;
14617 if (*base == NULL_RTX || *offset == NULL_RTX)
14618 fusion = SCHED_FUSION_NONE;
14620 return fusion;
14623 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14625 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14626 and PRI are only calculated for these instructions. For other instruction,
14627 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14628 type instruction fusion can be added by returning different priorities.
14630 It's important that irrelevant instructions get the largest FUSION_PRI. */
14632 static void
14633 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14634 int *fusion_pri, int *pri)
14636 int tmp, off_val;
14637 rtx base, offset;
14638 enum sched_fusion_type fusion;
14640 gcc_assert (INSN_P (insn));
14642 tmp = max_pri - 1;
14643 fusion = fusion_load_store (insn, &base, &offset);
14644 if (fusion == SCHED_FUSION_NONE)
14646 *pri = tmp;
14647 *fusion_pri = tmp;
14648 return;
14651 /* Set FUSION_PRI according to fusion type and base register. */
14652 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14654 /* Calculate PRI. */
14655 tmp /= 2;
14657 /* INSN with smaller offset goes first. */
14658 off_val = (int)(INTVAL (offset));
14659 if (off_val >= 0)
14660 tmp -= (off_val & 0xfffff);
14661 else
14662 tmp += ((- off_val) & 0xfffff);
14664 *pri = tmp;
14665 return;
14668 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14669 Adjust priority of sha1h instructions so they are scheduled before
14670 other SHA1 instructions. */
14672 static int
14673 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14675 rtx x = PATTERN (insn);
14677 if (GET_CODE (x) == SET)
14679 x = SET_SRC (x);
14681 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14682 return priority + 10;
14685 return priority;
14688 /* Given OPERANDS of consecutive load/store, check if we can merge
14689 them into ldp/stp. LOAD is true if they are load instructions.
14690 MODE is the mode of memory operands. */
14692 bool
14693 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14694 machine_mode mode)
14696 HOST_WIDE_INT offval_1, offval_2, msize;
14697 enum reg_class rclass_1, rclass_2;
14698 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14700 if (load)
14702 mem_1 = operands[1];
14703 mem_2 = operands[3];
14704 reg_1 = operands[0];
14705 reg_2 = operands[2];
14706 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14707 if (REGNO (reg_1) == REGNO (reg_2))
14708 return false;
14710 else
14712 mem_1 = operands[0];
14713 mem_2 = operands[2];
14714 reg_1 = operands[1];
14715 reg_2 = operands[3];
14718 /* The mems cannot be volatile. */
14719 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14720 return false;
14722 /* If we have SImode and slow unaligned ldp,
14723 check the alignment to be at least 8 byte. */
14724 if (mode == SImode
14725 && (aarch64_tune_params.extra_tuning_flags
14726 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14727 && !optimize_size
14728 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14729 return false;
14731 /* Check if the addresses are in the form of [base+offset]. */
14732 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14733 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14734 return false;
14735 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14736 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14737 return false;
14739 /* Check if the bases are same. */
14740 if (!rtx_equal_p (base_1, base_2))
14741 return false;
14743 offval_1 = INTVAL (offset_1);
14744 offval_2 = INTVAL (offset_2);
14745 msize = GET_MODE_SIZE (mode);
14746 /* Check if the offsets are consecutive. */
14747 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14748 return false;
14750 /* Check if the addresses are clobbered by load. */
14751 if (load)
14753 if (reg_mentioned_p (reg_1, mem_1))
14754 return false;
14756 /* In increasing order, the last load can clobber the address. */
14757 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14758 return false;
14761 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14762 rclass_1 = FP_REGS;
14763 else
14764 rclass_1 = GENERAL_REGS;
14766 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14767 rclass_2 = FP_REGS;
14768 else
14769 rclass_2 = GENERAL_REGS;
14771 /* Check if the registers are of same class. */
14772 if (rclass_1 != rclass_2)
14773 return false;
14775 return true;
14778 /* Given OPERANDS of consecutive load/store, check if we can merge
14779 them into ldp/stp by adjusting the offset. LOAD is true if they
14780 are load instructions. MODE is the mode of memory operands.
14782 Given below consecutive stores:
14784 str w1, [xb, 0x100]
14785 str w1, [xb, 0x104]
14786 str w1, [xb, 0x108]
14787 str w1, [xb, 0x10c]
14789 Though the offsets are out of the range supported by stp, we can
14790 still pair them after adjusting the offset, like:
14792 add scratch, xb, 0x100
14793 stp w1, w1, [scratch]
14794 stp w1, w1, [scratch, 0x8]
14796 The peephole patterns detecting this opportunity should guarantee
14797 the scratch register is avaliable. */
14799 bool
14800 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14801 scalar_mode mode)
14803 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14804 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14805 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14806 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14808 if (load)
14810 reg_1 = operands[0];
14811 mem_1 = operands[1];
14812 reg_2 = operands[2];
14813 mem_2 = operands[3];
14814 reg_3 = operands[4];
14815 mem_3 = operands[5];
14816 reg_4 = operands[6];
14817 mem_4 = operands[7];
14818 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14819 && REG_P (reg_3) && REG_P (reg_4));
14820 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14821 return false;
14823 else
14825 mem_1 = operands[0];
14826 reg_1 = operands[1];
14827 mem_2 = operands[2];
14828 reg_2 = operands[3];
14829 mem_3 = operands[4];
14830 reg_3 = operands[5];
14831 mem_4 = operands[6];
14832 reg_4 = operands[7];
14834 /* Skip if memory operand is by itslef valid for ldp/stp. */
14835 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14836 return false;
14838 /* The mems cannot be volatile. */
14839 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14840 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14841 return false;
14843 /* Check if the addresses are in the form of [base+offset]. */
14844 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14845 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14846 return false;
14847 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14848 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14849 return false;
14850 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14851 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14852 return false;
14853 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14854 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14855 return false;
14857 /* Check if the bases are same. */
14858 if (!rtx_equal_p (base_1, base_2)
14859 || !rtx_equal_p (base_2, base_3)
14860 || !rtx_equal_p (base_3, base_4))
14861 return false;
14863 offval_1 = INTVAL (offset_1);
14864 offval_2 = INTVAL (offset_2);
14865 offval_3 = INTVAL (offset_3);
14866 offval_4 = INTVAL (offset_4);
14867 msize = GET_MODE_SIZE (mode);
14868 /* Check if the offsets are consecutive. */
14869 if ((offval_1 != (offval_2 + msize)
14870 || offval_1 != (offval_3 + msize * 2)
14871 || offval_1 != (offval_4 + msize * 3))
14872 && (offval_4 != (offval_3 + msize)
14873 || offval_4 != (offval_2 + msize * 2)
14874 || offval_4 != (offval_1 + msize * 3)))
14875 return false;
14877 /* Check if the addresses are clobbered by load. */
14878 if (load)
14880 if (reg_mentioned_p (reg_1, mem_1)
14881 || reg_mentioned_p (reg_2, mem_2)
14882 || reg_mentioned_p (reg_3, mem_3))
14883 return false;
14885 /* In increasing order, the last load can clobber the address. */
14886 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14887 return false;
14890 /* If we have SImode and slow unaligned ldp,
14891 check the alignment to be at least 8 byte. */
14892 if (mode == SImode
14893 && (aarch64_tune_params.extra_tuning_flags
14894 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14895 && !optimize_size
14896 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14897 return false;
14899 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14900 rclass_1 = FP_REGS;
14901 else
14902 rclass_1 = GENERAL_REGS;
14904 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14905 rclass_2 = FP_REGS;
14906 else
14907 rclass_2 = GENERAL_REGS;
14909 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14910 rclass_3 = FP_REGS;
14911 else
14912 rclass_3 = GENERAL_REGS;
14914 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14915 rclass_4 = FP_REGS;
14916 else
14917 rclass_4 = GENERAL_REGS;
14919 /* Check if the registers are of same class. */
14920 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14921 return false;
14923 return true;
14926 /* Given OPERANDS of consecutive load/store, this function pairs them
14927 into ldp/stp after adjusting the offset. It depends on the fact
14928 that addresses of load/store instructions are in increasing order.
14929 MODE is the mode of memory operands. CODE is the rtl operator
14930 which should be applied to all memory operands, it's SIGN_EXTEND,
14931 ZERO_EXTEND or UNKNOWN. */
14933 bool
14934 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14935 scalar_mode mode, RTX_CODE code)
14937 rtx base, offset, t1, t2;
14938 rtx mem_1, mem_2, mem_3, mem_4;
14939 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14941 if (load)
14943 mem_1 = operands[1];
14944 mem_2 = operands[3];
14945 mem_3 = operands[5];
14946 mem_4 = operands[7];
14948 else
14950 mem_1 = operands[0];
14951 mem_2 = operands[2];
14952 mem_3 = operands[4];
14953 mem_4 = operands[6];
14954 gcc_assert (code == UNKNOWN);
14957 extract_base_offset_in_addr (mem_1, &base, &offset);
14958 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14960 /* Adjust offset thus it can fit in ldp/stp instruction. */
14961 msize = GET_MODE_SIZE (mode);
14962 stp_off_limit = msize * 0x40;
14963 off_val = INTVAL (offset);
14964 abs_off = (off_val < 0) ? -off_val : off_val;
14965 new_off = abs_off % stp_off_limit;
14966 adj_off = abs_off - new_off;
14968 /* Further adjust to make sure all offsets are OK. */
14969 if ((new_off + msize * 2) >= stp_off_limit)
14971 adj_off += stp_off_limit;
14972 new_off -= stp_off_limit;
14975 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14976 if (adj_off >= 0x1000)
14977 return false;
14979 if (off_val < 0)
14981 adj_off = -adj_off;
14982 new_off = -new_off;
14985 /* Create new memory references. */
14986 mem_1 = change_address (mem_1, VOIDmode,
14987 plus_constant (DImode, operands[8], new_off));
14989 /* Check if the adjusted address is OK for ldp/stp. */
14990 if (!aarch64_mem_pair_operand (mem_1, mode))
14991 return false;
14993 msize = GET_MODE_SIZE (mode);
14994 mem_2 = change_address (mem_2, VOIDmode,
14995 plus_constant (DImode,
14996 operands[8],
14997 new_off + msize));
14998 mem_3 = change_address (mem_3, VOIDmode,
14999 plus_constant (DImode,
15000 operands[8],
15001 new_off + msize * 2));
15002 mem_4 = change_address (mem_4, VOIDmode,
15003 plus_constant (DImode,
15004 operands[8],
15005 new_off + msize * 3));
15007 if (code == ZERO_EXTEND)
15009 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15010 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15011 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15012 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15014 else if (code == SIGN_EXTEND)
15016 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15017 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15018 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15019 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15022 if (load)
15024 operands[1] = mem_1;
15025 operands[3] = mem_2;
15026 operands[5] = mem_3;
15027 operands[7] = mem_4;
15029 else
15031 operands[0] = mem_1;
15032 operands[2] = mem_2;
15033 operands[4] = mem_3;
15034 operands[6] = mem_4;
15037 /* Emit adjusting instruction. */
15038 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15039 /* Emit ldp/stp instructions. */
15040 t1 = gen_rtx_SET (operands[0], operands[1]);
15041 t2 = gen_rtx_SET (operands[2], operands[3]);
15042 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15043 t1 = gen_rtx_SET (operands[4], operands[5]);
15044 t2 = gen_rtx_SET (operands[6], operands[7]);
15045 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15046 return true;
15049 /* Return 1 if pseudo register should be created and used to hold
15050 GOT address for PIC code. */
15052 bool
15053 aarch64_use_pseudo_pic_reg (void)
15055 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15058 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15060 static int
15061 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15063 switch (XINT (x, 1))
15065 case UNSPEC_GOTSMALLPIC:
15066 case UNSPEC_GOTSMALLPIC28K:
15067 case UNSPEC_GOTTINYPIC:
15068 return 0;
15069 default:
15070 break;
15073 return default_unspec_may_trap_p (x, flags);
15077 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15078 return the log2 of that value. Otherwise return -1. */
15081 aarch64_fpconst_pow_of_2 (rtx x)
15083 const REAL_VALUE_TYPE *r;
15085 if (!CONST_DOUBLE_P (x))
15086 return -1;
15088 r = CONST_DOUBLE_REAL_VALUE (x);
15090 if (REAL_VALUE_NEGATIVE (*r)
15091 || REAL_VALUE_ISNAN (*r)
15092 || REAL_VALUE_ISINF (*r)
15093 || !real_isinteger (r, DFmode))
15094 return -1;
15096 return exact_log2 (real_to_integer (r));
15099 /* If X is a vector of equal CONST_DOUBLE values and that value is
15100 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15103 aarch64_vec_fpconst_pow_of_2 (rtx x)
15105 if (GET_CODE (x) != CONST_VECTOR)
15106 return -1;
15108 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15109 return -1;
15111 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15112 if (firstval <= 0)
15113 return -1;
15115 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15116 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15117 return -1;
15119 return firstval;
15122 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15123 to float.
15125 __fp16 always promotes through this hook.
15126 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15127 through the generic excess precision logic rather than here. */
15129 static tree
15130 aarch64_promoted_type (const_tree t)
15132 if (SCALAR_FLOAT_TYPE_P (t)
15133 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15134 return float_type_node;
15136 return NULL_TREE;
15139 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15141 static bool
15142 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15143 optimization_type opt_type)
15145 switch (op)
15147 case rsqrt_optab:
15148 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15150 default:
15151 return true;
15155 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15156 if MODE is HFmode, and punt to the generic implementation otherwise. */
15158 static bool
15159 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15161 return (mode == HFmode
15162 ? true
15163 : default_libgcc_floating_mode_supported_p (mode));
15166 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15167 if MODE is HFmode, and punt to the generic implementation otherwise. */
15169 static bool
15170 aarch64_scalar_mode_supported_p (scalar_mode mode)
15172 return (mode == HFmode
15173 ? true
15174 : default_scalar_mode_supported_p (mode));
15177 /* Set the value of FLT_EVAL_METHOD.
15178 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15180 0: evaluate all operations and constants, whose semantic type has at
15181 most the range and precision of type float, to the range and
15182 precision of float; evaluate all other operations and constants to
15183 the range and precision of the semantic type;
15185 N, where _FloatN is a supported interchange floating type
15186 evaluate all operations and constants, whose semantic type has at
15187 most the range and precision of _FloatN type, to the range and
15188 precision of the _FloatN type; evaluate all other operations and
15189 constants to the range and precision of the semantic type;
15191 If we have the ARMv8.2-A extensions then we support _Float16 in native
15192 precision, so we should set this to 16. Otherwise, we support the type,
15193 but want to evaluate expressions in float precision, so set this to
15194 0. */
15196 static enum flt_eval_method
15197 aarch64_excess_precision (enum excess_precision_type type)
15199 switch (type)
15201 case EXCESS_PRECISION_TYPE_FAST:
15202 case EXCESS_PRECISION_TYPE_STANDARD:
15203 /* We can calculate either in 16-bit range and precision or
15204 32-bit range and precision. Make that decision based on whether
15205 we have native support for the ARMv8.2-A 16-bit floating-point
15206 instructions or not. */
15207 return (TARGET_FP_F16INST
15208 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15209 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15210 case EXCESS_PRECISION_TYPE_IMPLICIT:
15211 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15212 default:
15213 gcc_unreachable ();
15215 return FLT_EVAL_METHOD_UNPREDICTABLE;
15218 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15219 scheduled for speculative execution. Reject the long-running division
15220 and square-root instructions. */
15222 static bool
15223 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15225 switch (get_attr_type (insn))
15227 case TYPE_SDIV:
15228 case TYPE_UDIV:
15229 case TYPE_FDIVS:
15230 case TYPE_FDIVD:
15231 case TYPE_FSQRTS:
15232 case TYPE_FSQRTD:
15233 case TYPE_NEON_FP_SQRT_S:
15234 case TYPE_NEON_FP_SQRT_D:
15235 case TYPE_NEON_FP_SQRT_S_Q:
15236 case TYPE_NEON_FP_SQRT_D_Q:
15237 case TYPE_NEON_FP_DIV_S:
15238 case TYPE_NEON_FP_DIV_D:
15239 case TYPE_NEON_FP_DIV_S_Q:
15240 case TYPE_NEON_FP_DIV_D_Q:
15241 return false;
15242 default:
15243 return true;
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
15293 #undef TARGET_ADDRESS_COST
15294 #define TARGET_ADDRESS_COST aarch64_address_cost
15296 /* This hook will determines whether unnamed bitfields affect the alignment
15297 of the containing structure. The hook returns true if the structure
15298 should inherit the alignment requirements of an unnamed bitfield's
15299 type. */
15300 #undef TARGET_ALIGN_ANON_BITFIELD
15301 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15303 #undef TARGET_ASM_ALIGNED_DI_OP
15304 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15306 #undef TARGET_ASM_ALIGNED_HI_OP
15307 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15309 #undef TARGET_ASM_ALIGNED_SI_OP
15310 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15312 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15313 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15314 hook_bool_const_tree_hwi_hwi_const_tree_true
15316 #undef TARGET_ASM_FILE_START
15317 #define TARGET_ASM_FILE_START aarch64_start_file
15319 #undef TARGET_ASM_OUTPUT_MI_THUNK
15320 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15322 #undef TARGET_ASM_SELECT_RTX_SECTION
15323 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15325 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15326 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15328 #undef TARGET_BUILD_BUILTIN_VA_LIST
15329 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15331 #undef TARGET_CALLEE_COPIES
15332 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15334 #undef TARGET_CAN_ELIMINATE
15335 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15337 #undef TARGET_CAN_INLINE_P
15338 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15340 #undef TARGET_CANNOT_FORCE_CONST_MEM
15341 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15343 #undef TARGET_CASE_VALUES_THRESHOLD
15344 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15346 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15347 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15349 /* Only the least significant bit is used for initialization guard
15350 variables. */
15351 #undef TARGET_CXX_GUARD_MASK_BIT
15352 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15354 #undef TARGET_C_MODE_FOR_SUFFIX
15355 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15357 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15358 #undef TARGET_DEFAULT_TARGET_FLAGS
15359 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15360 #endif
15362 #undef TARGET_CLASS_MAX_NREGS
15363 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15365 #undef TARGET_BUILTIN_DECL
15366 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15368 #undef TARGET_BUILTIN_RECIPROCAL
15369 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15371 #undef TARGET_C_EXCESS_PRECISION
15372 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15374 #undef TARGET_EXPAND_BUILTIN
15375 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15377 #undef TARGET_EXPAND_BUILTIN_VA_START
15378 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15380 #undef TARGET_FOLD_BUILTIN
15381 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
/* Target hook table for AArch64.  Each hook macro is first #undef'd to
   clear any default supplied by target-def.h, then redefined to the
   AArch64 implementation; TARGET_INITIALIZER (used for targetm below)
   picks up these definitions.  */

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

/* Shrink-wrapping: save/restore of callee-saved registers is split into
   per-register components so the prologue/epilogue passes can place them
   only on the paths that need them.  */
#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

/* Deliberately defined to nothing: the vectorizer builtins are provided
   via the vectorized-function hook below.  */
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

/* Target selftests are only built into checking-enabled compilers.  */
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
15695 struct gcc_target targetm = TARGET_INITIALIZER;
15697 #include "gt-aarch64.h"