Introduce emit_frame_chain
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob1cc10432b769e4f04810bd664c918927a46f7df9
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
145 vec_perm_indices);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespectively of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
1070 /* Implement TARGET_HARD_REGNO_NREGS. */
1072 static unsigned int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
1086 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return true;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1110 else
1111 return true;
1114 return false;
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1118 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1119 clobbers the top 64 bits when restoring the bottom 64 bits. */
1121 static bool
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1124 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1128 machine_mode
1129 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1130 machine_mode mode)
1132 /* Handle modes that fit within single registers. */
1133 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1135 if (GET_MODE_SIZE (mode) >= 4)
1136 return mode;
1137 else
1138 return SImode;
1140 /* Fall back to generic for multi-reg and very large modes. */
1141 else
1142 return choose_hard_reg_mode (regno, nregs, false);
1145 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1146 that strcpy from constants will be faster. */
1148 static HOST_WIDE_INT
1149 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1151 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1152 return MAX (align, BITS_PER_WORD);
1153 return align;
1156 /* Return true if calls to DECL should be treated as
1157 long-calls (ie called via a register). */
1158 static bool
1159 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1161 return false;
1164 /* Return true if calls to symbol-ref SYM should be treated as
1165 long-calls (ie called via a register). */
1166 bool
1167 aarch64_is_long_call_p (rtx sym)
1169 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1172 /* Return true if calls to symbol-ref SYM should not go through
1173 plt stubs. */
1175 bool
1176 aarch64_is_noplt_call_p (rtx sym)
1178 const_tree decl = SYMBOL_REF_DECL (sym);
1180 if (flag_pic
1181 && decl
1182 && (!flag_plt
1183 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1184 && !targetm.binds_local_p (decl))
1185 return true;
1187 return false;
1190 /* Return true if the offsets to a zero/sign-extract operation
1191 represent an expression that matches an extend operation. The
1192 operands represent the paramters from
1194 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1195 bool
1196 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1197 rtx extract_imm)
1199 HOST_WIDE_INT mult_val, extract_val;
1201 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1202 return false;
1204 mult_val = INTVAL (mult_imm);
1205 extract_val = INTVAL (extract_imm);
1207 if (extract_val > 8
1208 && extract_val < GET_MODE_BITSIZE (mode)
1209 && exact_log2 (extract_val & ~7) > 0
1210 && (extract_val & 7) <= 4
1211 && mult_val == (1 << (extract_val & 7)))
1212 return true;
1214 return false;
1217 /* Emit an insn that's a simple single-set. Both the operands must be
1218 known to be valid. */
1219 inline static rtx_insn *
1220 emit_set_insn (rtx x, rtx y)
1222 return emit_insn (gen_rtx_SET (x, y));
1225 /* X and Y are two things to compare using CODE. Emit the compare insn and
1226 return the rtx for register 0 in the proper mode. */
1228 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1230 machine_mode mode = SELECT_CC_MODE (code, x, y);
1231 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1233 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1234 return cc_reg;
1237 /* Build the SYMBOL_REF for __tls_get_addr. */
1239 static GTY(()) rtx tls_get_addr_libfunc;
1242 aarch64_tls_get_addr (void)
1244 if (!tls_get_addr_libfunc)
1245 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1246 return tls_get_addr_libfunc;
1249 /* Return the TLS model to use for ADDR. */
1251 static enum tls_model
1252 tls_symbolic_operand_type (rtx addr)
1254 enum tls_model tls_kind = TLS_MODEL_NONE;
1255 rtx sym, addend;
1257 if (GET_CODE (addr) == CONST)
1259 split_const (addr, &sym, &addend);
1260 if (GET_CODE (sym) == SYMBOL_REF)
1261 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1263 else if (GET_CODE (addr) == SYMBOL_REF)
1264 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1266 return tls_kind;
1269 /* We'll allow lo_sum's in addresses in our legitimate addresses
1270 so that combine would take care of combining addresses where
1271 necessary, but for generation purposes, we'll generate the address
1272 as :
1273 RTL Absolute
1274 tmp = hi (symbol_ref); adrp x1, foo
1275 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1278 PIC TLS
1279 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1280 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1281 bl __tls_get_addr
1284 Load TLS symbol, depending on TLS mechanism and TLS access model.
1286 Global Dynamic - Traditional TLS:
1287 adrp tmp, :tlsgd:imm
1288 add dest, tmp, #:tlsgd_lo12:imm
1289 bl __tls_get_addr
1291 Global Dynamic - TLS Descriptors:
1292 adrp dest, :tlsdesc:imm
1293 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1294 add dest, dest, #:tlsdesc_lo12:imm
1295 blr tmp
1296 mrs tp, tpidr_el0
1297 add dest, dest, tp
1299 Initial Exec:
1300 mrs tp, tpidr_el0
1301 adrp tmp, :gottprel:imm
1302 ldr dest, [tmp, #:gottprel_lo12:imm]
1303 add dest, dest, tp
1305 Local Exec:
1306 mrs tp, tpidr_el0
1307 add t0, tp, #:tprel_hi12:imm, lsl #12
1308 add t0, t0, #:tprel_lo12_nc:imm
1311 static void
1312 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1313 enum aarch64_symbol_type type)
1315 switch (type)
1317 case SYMBOL_SMALL_ABSOLUTE:
1319 /* In ILP32, the mode of dest can be either SImode or DImode. */
1320 rtx tmp_reg = dest;
1321 machine_mode mode = GET_MODE (dest);
1323 gcc_assert (mode == Pmode || mode == ptr_mode);
1325 if (can_create_pseudo_p ())
1326 tmp_reg = gen_reg_rtx (mode);
1328 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1329 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1330 return;
1333 case SYMBOL_TINY_ABSOLUTE:
1334 emit_insn (gen_rtx_SET (dest, imm));
1335 return;
1337 case SYMBOL_SMALL_GOT_28K:
1339 machine_mode mode = GET_MODE (dest);
1340 rtx gp_rtx = pic_offset_table_rtx;
1341 rtx insn;
1342 rtx mem;
1344 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1345 here before rtl expand. Tree IVOPT will generate rtl pattern to
1346 decide rtx costs, in which case pic_offset_table_rtx is not
1347 initialized. For that case no need to generate the first adrp
1348 instruction as the final cost for global variable access is
1349 one instruction. */
1350 if (gp_rtx != NULL)
1352 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1353 using the page base as GOT base, the first page may be wasted,
1354 in the worst scenario, there is only 28K space for GOT).
1356 The generate instruction sequence for accessing global variable
1359 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1361 Only one instruction needed. But we must initialize
1362 pic_offset_table_rtx properly. We generate initialize insn for
1363 every global access, and allow CSE to remove all redundant.
1365 The final instruction sequences will look like the following
1366 for multiply global variables access.
1368 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1370 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1371 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1372 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1373 ... */
1375 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1376 crtl->uses_pic_offset_table = 1;
1377 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1379 if (mode != GET_MODE (gp_rtx))
1380 gp_rtx = gen_lowpart (mode, gp_rtx);
1384 if (mode == ptr_mode)
1386 if (mode == DImode)
1387 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1388 else
1389 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1391 mem = XVECEXP (SET_SRC (insn), 0, 0);
1393 else
1395 gcc_assert (mode == Pmode);
1397 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1398 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1401 /* The operand is expected to be MEM. Whenever the related insn
1402 pattern changed, above code which calculate mem should be
1403 updated. */
1404 gcc_assert (GET_CODE (mem) == MEM);
1405 MEM_READONLY_P (mem) = 1;
1406 MEM_NOTRAP_P (mem) = 1;
1407 emit_insn (insn);
1408 return;
1411 case SYMBOL_SMALL_GOT_4G:
1413 /* In ILP32, the mode of dest can be either SImode or DImode,
1414 while the got entry is always of SImode size. The mode of
1415 dest depends on how dest is used: if dest is assigned to a
1416 pointer (e.g. in the memory), it has SImode; it may have
1417 DImode if dest is dereferenced to access the memeory.
1418 This is why we have to handle three different ldr_got_small
1419 patterns here (two patterns for ILP32). */
1421 rtx insn;
1422 rtx mem;
1423 rtx tmp_reg = dest;
1424 machine_mode mode = GET_MODE (dest);
1426 if (can_create_pseudo_p ())
1427 tmp_reg = gen_reg_rtx (mode);
1429 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1430 if (mode == ptr_mode)
1432 if (mode == DImode)
1433 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1434 else
1435 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1437 mem = XVECEXP (SET_SRC (insn), 0, 0);
1439 else
1441 gcc_assert (mode == Pmode);
1443 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1444 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1447 gcc_assert (GET_CODE (mem) == MEM);
1448 MEM_READONLY_P (mem) = 1;
1449 MEM_NOTRAP_P (mem) = 1;
1450 emit_insn (insn);
1451 return;
1454 case SYMBOL_SMALL_TLSGD:
1456 rtx_insn *insns;
1457 machine_mode mode = GET_MODE (dest);
1458 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1460 start_sequence ();
1461 if (TARGET_ILP32)
1462 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1463 else
1464 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1465 insns = get_insns ();
1466 end_sequence ();
1468 RTL_CONST_CALL_P (insns) = 1;
1469 emit_libcall_block (insns, dest, result, imm);
1470 return;
1473 case SYMBOL_SMALL_TLSDESC:
1475 machine_mode mode = GET_MODE (dest);
1476 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1477 rtx tp;
1479 gcc_assert (mode == Pmode || mode == ptr_mode);
1481 /* In ILP32, the got entry is always of SImode size. Unlike
1482 small GOT, the dest is fixed at reg 0. */
1483 if (TARGET_ILP32)
1484 emit_insn (gen_tlsdesc_small_si (imm));
1485 else
1486 emit_insn (gen_tlsdesc_small_di (imm));
1487 tp = aarch64_load_tp (NULL);
1489 if (mode != Pmode)
1490 tp = gen_lowpart (mode, tp);
1492 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1493 if (REG_P (dest))
1494 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1495 return;
1498 case SYMBOL_SMALL_TLSIE:
1500 /* In ILP32, the mode of dest can be either SImode or DImode,
1501 while the got entry is always of SImode size. The mode of
1502 dest depends on how dest is used: if dest is assigned to a
1503 pointer (e.g. in the memory), it has SImode; it may have
1504 DImode if dest is dereferenced to access the memeory.
1505 This is why we have to handle three different tlsie_small
1506 patterns here (two patterns for ILP32). */
1507 machine_mode mode = GET_MODE (dest);
1508 rtx tmp_reg = gen_reg_rtx (mode);
1509 rtx tp = aarch64_load_tp (NULL);
1511 if (mode == ptr_mode)
1513 if (mode == DImode)
1514 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1515 else
1517 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1518 tp = gen_lowpart (mode, tp);
1521 else
1523 gcc_assert (mode == Pmode);
1524 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1527 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1528 if (REG_P (dest))
1529 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1530 return;
1533 case SYMBOL_TLSLE12:
1534 case SYMBOL_TLSLE24:
1535 case SYMBOL_TLSLE32:
1536 case SYMBOL_TLSLE48:
1538 machine_mode mode = GET_MODE (dest);
1539 rtx tp = aarch64_load_tp (NULL);
1541 if (mode != Pmode)
1542 tp = gen_lowpart (mode, tp);
1544 switch (type)
1546 case SYMBOL_TLSLE12:
1547 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1548 (dest, tp, imm));
1549 break;
1550 case SYMBOL_TLSLE24:
1551 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1552 (dest, tp, imm));
1553 break;
1554 case SYMBOL_TLSLE32:
1555 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1556 (dest, imm));
1557 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1558 (dest, dest, tp));
1559 break;
1560 case SYMBOL_TLSLE48:
1561 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1562 (dest, imm));
1563 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1564 (dest, dest, tp));
1565 break;
1566 default:
1567 gcc_unreachable ();
1570 if (REG_P (dest))
1571 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1572 return;
1575 case SYMBOL_TINY_GOT:
1576 emit_insn (gen_ldr_got_tiny (dest, imm));
1577 return;
1579 case SYMBOL_TINY_TLSIE:
1581 machine_mode mode = GET_MODE (dest);
1582 rtx tp = aarch64_load_tp (NULL);
1584 if (mode == ptr_mode)
1586 if (mode == DImode)
1587 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1588 else
1590 tp = gen_lowpart (mode, tp);
1591 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1594 else
1596 gcc_assert (mode == Pmode);
1597 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1600 if (REG_P (dest))
1601 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1602 return;
1605 default:
1606 gcc_unreachable ();
1610 /* Emit a move from SRC to DEST. Assume that the move expanders can
1611 handle all moves if !can_create_pseudo_p (). The distinction is
1612 important because, unlike emit_move_insn, the move expanders know
1613 how to force Pmode objects into the constant pool even when the
1614 constant pool address is not itself legitimate. */
1615 static rtx
1616 aarch64_emit_move (rtx dest, rtx src)
1618 return (can_create_pseudo_p ()
1619 ? emit_move_insn (dest, src)
1620 : emit_move_insn_1 (dest, src));
1623 /* Split a 128-bit move operation into two 64-bit move operations,
1624 taking care to handle partial overlap of register to register
1625 copies. Special cases are needed when moving between GP regs and
1626 FP regs. SRC can be a register, constant or memory; DST a register
1627 or memory. If either operand is memory it must not have any side
1628 effects. */
1629 void
1630 aarch64_split_128bit_move (rtx dst, rtx src)
1632 rtx dst_lo, dst_hi;
1633 rtx src_lo, src_hi;
1635 machine_mode mode = GET_MODE (dst);
1637 gcc_assert (mode == TImode || mode == TFmode);
1638 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1639 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1641 if (REG_P (dst) && REG_P (src))
1643 int src_regno = REGNO (src);
1644 int dst_regno = REGNO (dst);
1646 /* Handle FP <-> GP regs. */
1647 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1649 src_lo = gen_lowpart (word_mode, src);
1650 src_hi = gen_highpart (word_mode, src);
1652 if (mode == TImode)
1654 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1655 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1657 else
1659 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1660 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1662 return;
1664 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1666 dst_lo = gen_lowpart (word_mode, dst);
1667 dst_hi = gen_highpart (word_mode, dst);
1669 if (mode == TImode)
1671 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1672 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1674 else
1676 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1677 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1679 return;
1683 dst_lo = gen_lowpart (word_mode, dst);
1684 dst_hi = gen_highpart (word_mode, dst);
1685 src_lo = gen_lowpart (word_mode, src);
1686 src_hi = gen_highpart_mode (word_mode, mode, src);
1688 /* At most one pairing may overlap. */
1689 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1691 aarch64_emit_move (dst_hi, src_hi);
1692 aarch64_emit_move (dst_lo, src_lo);
1694 else
1696 aarch64_emit_move (dst_lo, src_lo);
1697 aarch64_emit_move (dst_hi, src_hi);
1701 bool
1702 aarch64_split_128bit_move_p (rtx dst, rtx src)
1704 return (! REG_P (src)
1705 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1708 /* Split a complex SIMD combine. */
1710 void
1711 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1713 machine_mode src_mode = GET_MODE (src1);
1714 machine_mode dst_mode = GET_MODE (dst);
1716 gcc_assert (VECTOR_MODE_P (dst_mode));
1717 gcc_assert (register_operand (dst, dst_mode)
1718 && register_operand (src1, src_mode)
1719 && register_operand (src2, src_mode));
1721 rtx (*gen) (rtx, rtx, rtx);
1723 switch (src_mode)
1725 case E_V8QImode:
1726 gen = gen_aarch64_simd_combinev8qi;
1727 break;
1728 case E_V4HImode:
1729 gen = gen_aarch64_simd_combinev4hi;
1730 break;
1731 case E_V2SImode:
1732 gen = gen_aarch64_simd_combinev2si;
1733 break;
1734 case E_V4HFmode:
1735 gen = gen_aarch64_simd_combinev4hf;
1736 break;
1737 case E_V2SFmode:
1738 gen = gen_aarch64_simd_combinev2sf;
1739 break;
1740 case E_DImode:
1741 gen = gen_aarch64_simd_combinedi;
1742 break;
1743 case E_DFmode:
1744 gen = gen_aarch64_simd_combinedf;
1745 break;
1746 default:
1747 gcc_unreachable ();
1750 emit_insn (gen (dst, src1, src2));
1751 return;
1754 /* Split a complex SIMD move. */
1756 void
1757 aarch64_split_simd_move (rtx dst, rtx src)
1759 machine_mode src_mode = GET_MODE (src);
1760 machine_mode dst_mode = GET_MODE (dst);
1762 gcc_assert (VECTOR_MODE_P (dst_mode));
1764 if (REG_P (dst) && REG_P (src))
1766 rtx (*gen) (rtx, rtx);
1768 gcc_assert (VECTOR_MODE_P (src_mode));
1770 switch (src_mode)
1772 case E_V16QImode:
1773 gen = gen_aarch64_split_simd_movv16qi;
1774 break;
1775 case E_V8HImode:
1776 gen = gen_aarch64_split_simd_movv8hi;
1777 break;
1778 case E_V4SImode:
1779 gen = gen_aarch64_split_simd_movv4si;
1780 break;
1781 case E_V2DImode:
1782 gen = gen_aarch64_split_simd_movv2di;
1783 break;
1784 case E_V8HFmode:
1785 gen = gen_aarch64_split_simd_movv8hf;
1786 break;
1787 case E_V4SFmode:
1788 gen = gen_aarch64_split_simd_movv4sf;
1789 break;
1790 case E_V2DFmode:
1791 gen = gen_aarch64_split_simd_movv2df;
1792 break;
1793 default:
1794 gcc_unreachable ();
1797 emit_insn (gen (dst, src));
1798 return;
1802 bool
1803 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1804 machine_mode ymode, rtx y)
1806 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1807 gcc_assert (r != NULL);
1808 return rtx_equal_p (x, r);
1812 static rtx
1813 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1815 if (can_create_pseudo_p ())
1816 return force_reg (mode, value);
1817 else
1819 x = aarch64_emit_move (x, value);
1820 return x;
1825 static rtx
1826 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1827 HOST_WIDE_INT offset)
1829 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1831 rtx high;
1832 /* Load the full offset into a register. This
1833 might be improvable in the future. */
1834 high = GEN_INT (offset);
1835 offset = 0;
1836 high = aarch64_force_temporary (mode, temp, high);
1837 reg = aarch64_force_temporary (mode, temp,
1838 gen_rtx_PLUS (mode, high, reg));
1840 return plus_constant (mode, reg, offset);
1843 static int
1844 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1845 scalar_int_mode mode)
1847 int i;
1848 unsigned HOST_WIDE_INT val, val2, mask;
1849 int one_match, zero_match;
1850 int num_insns;
1852 val = INTVAL (imm);
1854 if (aarch64_move_imm (val, mode))
1856 if (generate)
1857 emit_insn (gen_rtx_SET (dest, imm));
1858 return 1;
1861 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1862 (with XXXX non-zero). In that case check to see if the move can be done in
1863 a smaller mode. */
1864 val2 = val & 0xffffffff;
1865 if (mode == DImode
1866 && aarch64_move_imm (val2, SImode)
1867 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1869 if (generate)
1870 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1872 /* Check if we have to emit a second instruction by checking to see
1873 if any of the upper 32 bits of the original DI mode value is set. */
1874 if (val == val2)
1875 return 1;
1877 i = (val >> 48) ? 48 : 32;
1879 if (generate)
1880 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1881 GEN_INT ((val >> i) & 0xffff)));
1883 return 2;
1886 if ((val >> 32) == 0 || mode == SImode)
1888 if (generate)
1890 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1891 if (mode == SImode)
1892 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1893 GEN_INT ((val >> 16) & 0xffff)));
1894 else
1895 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1896 GEN_INT ((val >> 16) & 0xffff)));
1898 return 2;
1901 /* Remaining cases are all for DImode. */
1903 mask = 0xffff;
1904 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1905 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1906 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1907 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1909 if (zero_match != 2 && one_match != 2)
1911 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1912 For a 64-bit bitmask try whether changing 16 bits to all ones or
1913 zeroes creates a valid bitmask. To check any repeated bitmask,
1914 try using 16 bits from the other 32-bit half of val. */
1916 for (i = 0; i < 64; i += 16, mask <<= 16)
1918 val2 = val & ~mask;
1919 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1920 break;
1921 val2 = val | mask;
1922 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1923 break;
1924 val2 = val2 & ~mask;
1925 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1926 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1927 break;
1929 if (i != 64)
1931 if (generate)
1933 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1934 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1935 GEN_INT ((val >> i) & 0xffff)));
1937 return 2;
1941 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1942 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1943 otherwise skip zero bits. */
1945 num_insns = 1;
1946 mask = 0xffff;
1947 val2 = one_match > zero_match ? ~val : val;
1948 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1950 if (generate)
1951 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1952 ? (val | ~(mask << i))
1953 : (val & (mask << i)))));
1954 for (i += 16; i < 64; i += 16)
1956 if ((val2 & (mask << i)) == 0)
1957 continue;
1958 if (generate)
1959 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1960 GEN_INT ((val >> i) & 0xffff)));
1961 num_insns ++;
1964 return num_insns;
1968 void
1969 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1971 machine_mode mode = GET_MODE (dest);
1973 gcc_assert (mode == SImode || mode == DImode);
1975 /* Check on what type of symbol it is. */
1976 scalar_int_mode int_mode;
1977 if ((GET_CODE (imm) == SYMBOL_REF
1978 || GET_CODE (imm) == LABEL_REF
1979 || GET_CODE (imm) == CONST)
1980 && is_a <scalar_int_mode> (mode, &int_mode))
1982 rtx mem, base, offset;
1983 enum aarch64_symbol_type sty;
1985 /* If we have (const (plus symbol offset)), separate out the offset
1986 before we start classifying the symbol. */
1987 split_const (imm, &base, &offset);
1989 sty = aarch64_classify_symbol (base, offset);
1990 switch (sty)
1992 case SYMBOL_FORCE_TO_MEM:
1993 if (offset != const0_rtx
1994 && targetm.cannot_force_const_mem (int_mode, imm))
1996 gcc_assert (can_create_pseudo_p ());
1997 base = aarch64_force_temporary (int_mode, dest, base);
1998 base = aarch64_add_offset (int_mode, NULL, base,
1999 INTVAL (offset));
2000 aarch64_emit_move (dest, base);
2001 return;
2004 mem = force_const_mem (ptr_mode, imm);
2005 gcc_assert (mem);
2007 /* If we aren't generating PC relative literals, then
2008 we need to expand the literal pool access carefully.
2009 This is something that needs to be done in a number
2010 of places, so could well live as a separate function. */
2011 if (!aarch64_pcrelative_literal_loads)
2013 gcc_assert (can_create_pseudo_p ());
2014 base = gen_reg_rtx (ptr_mode);
2015 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2016 if (ptr_mode != Pmode)
2017 base = convert_memory_address (Pmode, base);
2018 mem = gen_rtx_MEM (ptr_mode, base);
2021 if (int_mode != ptr_mode)
2022 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2024 emit_insn (gen_rtx_SET (dest, mem));
2026 return;
2028 case SYMBOL_SMALL_TLSGD:
2029 case SYMBOL_SMALL_TLSDESC:
2030 case SYMBOL_SMALL_TLSIE:
2031 case SYMBOL_SMALL_GOT_28K:
2032 case SYMBOL_SMALL_GOT_4G:
2033 case SYMBOL_TINY_GOT:
2034 case SYMBOL_TINY_TLSIE:
2035 if (offset != const0_rtx)
2037 gcc_assert(can_create_pseudo_p ());
2038 base = aarch64_force_temporary (int_mode, dest, base);
2039 base = aarch64_add_offset (int_mode, NULL, base,
2040 INTVAL (offset));
2041 aarch64_emit_move (dest, base);
2042 return;
2044 /* FALLTHRU */
2046 case SYMBOL_SMALL_ABSOLUTE:
2047 case SYMBOL_TINY_ABSOLUTE:
2048 case SYMBOL_TLSLE12:
2049 case SYMBOL_TLSLE24:
2050 case SYMBOL_TLSLE32:
2051 case SYMBOL_TLSLE48:
2052 aarch64_load_symref_appropriately (dest, imm, sty);
2053 return;
2055 default:
2056 gcc_unreachable ();
2060 if (!CONST_INT_P (imm))
2062 if (GET_CODE (imm) == HIGH)
2063 emit_insn (gen_rtx_SET (dest, imm));
2064 else
2066 rtx mem = force_const_mem (mode, imm);
2067 gcc_assert (mem);
2068 emit_insn (gen_rtx_SET (dest, mem));
2071 return;
2074 aarch64_internal_mov_immediate (dest, imm, true,
2075 as_a <scalar_int_mode> (mode));
2078 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2079 temporary value if necessary. FRAME_RELATED_P should be true if
2080 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2081 to the generated instructions. If SCRATCHREG is known to hold
2082 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2083 immediate again.
2085 Since this function may be used to adjust the stack pointer, we must
2086 ensure that it cannot cause transient stack deallocation (for example
2087 by first incrementing SP and then decrementing when adjusting by a
2088 large immediate). */
2090 static void
2091 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2092 int scratchreg, HOST_WIDE_INT delta,
2093 bool frame_related_p, bool emit_move_imm)
2095 HOST_WIDE_INT mdelta = abs_hwi (delta);
2096 rtx this_rtx = gen_rtx_REG (mode, regnum);
2097 rtx_insn *insn;
2099 if (!mdelta)
2100 return;
2102 /* Single instruction adjustment. */
2103 if (aarch64_uimm12_shift (mdelta))
2105 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2106 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2107 return;
2110 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2111 Only do this if mdelta is not a 16-bit move as adjusting using a move
2112 is better. */
2113 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2115 HOST_WIDE_INT low_off = mdelta & 0xfff;
2117 low_off = delta < 0 ? -low_off : low_off;
2118 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2119 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2120 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2121 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2122 return;
2125 /* Emit a move immediate if required and an addition/subtraction. */
2126 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2127 if (emit_move_imm)
2128 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2129 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2130 : gen_add2_insn (this_rtx, scratch_rtx));
2131 if (frame_related_p)
2133 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2134 rtx adj = plus_constant (mode, this_rtx, delta);
2135 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2139 static inline void
2140 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2141 HOST_WIDE_INT delta)
2143 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2146 static inline void
2147 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2149 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2150 true, emit_move_imm);
2153 static inline void
2154 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2156 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2157 frame_related_p, true);
2160 static bool
2161 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2162 tree exp ATTRIBUTE_UNUSED)
2164 /* Currently, always true. */
2165 return true;
2168 /* Implement TARGET_PASS_BY_REFERENCE. */
2170 static bool
2171 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2172 machine_mode mode,
2173 const_tree type,
2174 bool named ATTRIBUTE_UNUSED)
2176 HOST_WIDE_INT size;
2177 machine_mode dummymode;
2178 int nregs;
2180 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2181 size = (mode == BLKmode && type)
2182 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2184 /* Aggregates are passed by reference based on their size. */
2185 if (type && AGGREGATE_TYPE_P (type))
2187 size = int_size_in_bytes (type);
2190 /* Variable sized arguments are always returned by reference. */
2191 if (size < 0)
2192 return true;
2194 /* Can this be a candidate to be passed in fp/simd register(s)? */
2195 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2196 &dummymode, &nregs,
2197 NULL))
2198 return false;
2200 /* Arguments which are variable sized or larger than 2 registers are
2201 passed by reference unless they are a homogenous floating point
2202 aggregate. */
2203 return size > 2 * UNITS_PER_WORD;
2206 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2207 static bool
2208 aarch64_return_in_msb (const_tree valtype)
2210 machine_mode dummy_mode;
2211 int dummy_int;
2213 /* Never happens in little-endian mode. */
2214 if (!BYTES_BIG_ENDIAN)
2215 return false;
2217 /* Only composite types smaller than or equal to 16 bytes can
2218 be potentially returned in registers. */
2219 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2220 || int_size_in_bytes (valtype) <= 0
2221 || int_size_in_bytes (valtype) > 16)
2222 return false;
2224 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2225 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2226 is always passed/returned in the least significant bits of fp/simd
2227 register(s). */
2228 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2229 &dummy_mode, &dummy_int, NULL))
2230 return false;
2232 return true;
2235 /* Implement TARGET_FUNCTION_VALUE.
2236 Define how to find the value returned by a function. */
2238 static rtx
2239 aarch64_function_value (const_tree type, const_tree func,
2240 bool outgoing ATTRIBUTE_UNUSED)
2242 machine_mode mode;
2243 int unsignedp;
2244 int count;
2245 machine_mode ag_mode;
2247 mode = TYPE_MODE (type);
2248 if (INTEGRAL_TYPE_P (type))
2249 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2251 if (aarch64_return_in_msb (type))
2253 HOST_WIDE_INT size = int_size_in_bytes (type);
2255 if (size % UNITS_PER_WORD != 0)
2257 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2258 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2262 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2263 &ag_mode, &count, NULL))
2265 if (!aarch64_composite_type_p (type, mode))
2267 gcc_assert (count == 1 && mode == ag_mode);
2268 return gen_rtx_REG (mode, V0_REGNUM);
2270 else
2272 int i;
2273 rtx par;
2275 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2276 for (i = 0; i < count; i++)
2278 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2279 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2280 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2281 XVECEXP (par, 0, i) = tmp;
2283 return par;
2286 else
2287 return gen_rtx_REG (mode, R0_REGNUM);
2290 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2291 Return true if REGNO is the number of a hard register in which the values
2292 of called function may come back. */
2294 static bool
2295 aarch64_function_value_regno_p (const unsigned int regno)
2297 /* Maximum of 16 bytes can be returned in the general registers. Examples
2298 of 16-byte return values are: 128-bit integers and 16-byte small
2299 structures (excluding homogeneous floating-point aggregates). */
2300 if (regno == R0_REGNUM || regno == R1_REGNUM)
2301 return true;
2303 /* Up to four fp/simd registers can return a function value, e.g. a
2304 homogeneous floating-point aggregate having four members. */
2305 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2306 return TARGET_FLOAT;
2308 return false;
2311 /* Implement TARGET_RETURN_IN_MEMORY.
2313 If the type T of the result of a function is such that
2314 void func (T arg)
2315 would require that arg be passed as a value in a register (or set of
2316 registers) according to the parameter passing rules, then the result
2317 is returned in the same registers as would be used for such an
2318 argument. */
2320 static bool
2321 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2323 HOST_WIDE_INT size;
2324 machine_mode ag_mode;
2325 int count;
2327 if (!AGGREGATE_TYPE_P (type)
2328 && TREE_CODE (type) != COMPLEX_TYPE
2329 && TREE_CODE (type) != VECTOR_TYPE)
2330 /* Simple scalar types always returned in registers. */
2331 return false;
2333 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2334 type,
2335 &ag_mode,
2336 &count,
2337 NULL))
2338 return false;
2340 /* Types larger than 2 registers returned in memory. */
2341 size = int_size_in_bytes (type);
2342 return (size < 0 || size > 2 * UNITS_PER_WORD);
2345 static bool
2346 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2347 const_tree type, int *nregs)
2349 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2350 return aarch64_vfp_is_call_or_return_candidate (mode,
2351 type,
2352 &pcum->aapcs_vfp_rmode,
2353 nregs,
2354 NULL);
2357 /* Given MODE and TYPE of a function argument, return the alignment in
2358 bits. The idea is to suppress any stronger alignment requested by
2359 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2360 This is a helper function for local use only. */
2362 static unsigned int
2363 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2365 if (!type)
2366 return GET_MODE_ALIGNMENT (mode);
2368 if (integer_zerop (TYPE_SIZE (type)))
2369 return 0;
2371 gcc_assert (TYPE_MODE (type) == mode);
2373 if (!AGGREGATE_TYPE_P (type))
2374 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2376 if (TREE_CODE (type) == ARRAY_TYPE)
2377 return TYPE_ALIGN (TREE_TYPE (type));
2379 unsigned int alignment = 0;
2380 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2381 if (TREE_CODE (field) == FIELD_DECL)
2382 alignment = std::max (alignment, DECL_ALIGN (field));
2384 return alignment;
2387 /* Layout a function argument according to the AAPCS64 rules. The rule
2388 numbers refer to the rule numbers in the AAPCS64. */
2390 static void
2391 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2392 const_tree type,
2393 bool named ATTRIBUTE_UNUSED)
2395 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2396 int ncrn, nvrn, nregs;
2397 bool allocate_ncrn, allocate_nvrn;
2398 HOST_WIDE_INT size;
2400 /* We need to do this once per argument. */
2401 if (pcum->aapcs_arg_processed)
2402 return;
2404 pcum->aapcs_arg_processed = true;
2406 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2407 size
2408 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2409 UNITS_PER_WORD);
2411 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2412 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2413 mode,
2414 type,
2415 &nregs);
2417 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2418 The following code thus handles passing by SIMD/FP registers first. */
2420 nvrn = pcum->aapcs_nvrn;
2422 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2423 and homogenous short-vector aggregates (HVA). */
2424 if (allocate_nvrn)
2426 if (!TARGET_FLOAT)
2427 aarch64_err_no_fpadvsimd (mode, "argument");
2429 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2431 pcum->aapcs_nextnvrn = nvrn + nregs;
2432 if (!aarch64_composite_type_p (type, mode))
2434 gcc_assert (nregs == 1);
2435 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2437 else
2439 rtx par;
2440 int i;
2441 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2442 for (i = 0; i < nregs; i++)
2444 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2445 V0_REGNUM + nvrn + i);
2446 tmp = gen_rtx_EXPR_LIST
2447 (VOIDmode, tmp,
2448 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2449 XVECEXP (par, 0, i) = tmp;
2451 pcum->aapcs_reg = par;
2453 return;
2455 else
2457 /* C.3 NSRN is set to 8. */
2458 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2459 goto on_stack;
2463 ncrn = pcum->aapcs_ncrn;
2464 nregs = size / UNITS_PER_WORD;
2466 /* C6 - C9. though the sign and zero extension semantics are
2467 handled elsewhere. This is the case where the argument fits
2468 entirely general registers. */
2469 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2472 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2474 /* C.8 if the argument has an alignment of 16 then the NGRN is
2475 rounded up to the next even number. */
2476 if (nregs == 2
2477 && ncrn % 2
2478 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2479 comparison is there because for > 16 * BITS_PER_UNIT
2480 alignment nregs should be > 2 and therefore it should be
2481 passed by reference rather than value. */
2482 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2484 ++ncrn;
2485 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2488 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2489 A reg is still generated for it, but the caller should be smart
2490 enough not to use it. */
2491 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2492 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2493 else
2495 rtx par;
2496 int i;
2498 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2499 for (i = 0; i < nregs; i++)
2501 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2502 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2503 GEN_INT (i * UNITS_PER_WORD));
2504 XVECEXP (par, 0, i) = tmp;
2506 pcum->aapcs_reg = par;
2509 pcum->aapcs_nextncrn = ncrn + nregs;
2510 return;
2513 /* C.11 */
2514 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2516 /* The argument is passed on stack; record the needed number of words for
2517 this argument and align the total size if necessary. */
2518 on_stack:
2519 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2521 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2522 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2523 16 / UNITS_PER_WORD);
2524 return;
2527 /* Implement TARGET_FUNCTION_ARG. */
2529 static rtx
2530 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2531 const_tree type, bool named)
2533 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2534 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2536 if (mode == VOIDmode)
2537 return NULL_RTX;
2539 aarch64_layout_arg (pcum_v, mode, type, named);
2540 return pcum->aapcs_reg;
2543 void
2544 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2545 const_tree fntype ATTRIBUTE_UNUSED,
2546 rtx libname ATTRIBUTE_UNUSED,
2547 const_tree fndecl ATTRIBUTE_UNUSED,
2548 unsigned n_named ATTRIBUTE_UNUSED)
2550 pcum->aapcs_ncrn = 0;
2551 pcum->aapcs_nvrn = 0;
2552 pcum->aapcs_nextncrn = 0;
2553 pcum->aapcs_nextnvrn = 0;
2554 pcum->pcs_variant = ARM_PCS_AAPCS64;
2555 pcum->aapcs_reg = NULL_RTX;
2556 pcum->aapcs_arg_processed = false;
2557 pcum->aapcs_stack_words = 0;
2558 pcum->aapcs_stack_size = 0;
2560 if (!TARGET_FLOAT
2561 && fndecl && TREE_PUBLIC (fndecl)
2562 && fntype && fntype != error_mark_node)
2564 const_tree type = TREE_TYPE (fntype);
2565 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2566 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2567 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2568 &mode, &nregs, NULL))
2569 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2571 return;
2574 static void
2575 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2576 machine_mode mode,
2577 const_tree type,
2578 bool named)
2580 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2581 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2583 aarch64_layout_arg (pcum_v, mode, type, named);
2584 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2585 != (pcum->aapcs_stack_words != 0));
2586 pcum->aapcs_arg_processed = false;
2587 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2588 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2589 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2590 pcum->aapcs_stack_words = 0;
2591 pcum->aapcs_reg = NULL_RTX;
2595 bool
2596 aarch64_function_arg_regno_p (unsigned regno)
2598 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2599 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2602 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2603 PARM_BOUNDARY bits of alignment, but will be given anything up
2604 to STACK_BOUNDARY bits if the type requires it. This makes sure
2605 that both before and after the layout of each argument, the Next
2606 Stacked Argument Address (NSAA) will have a minimum alignment of
2607 8 bytes. */
2609 static unsigned int
2610 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2612 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2613 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2616 /* Implement TARGET_FUNCTION_ARG_PADDING.
2618 Small aggregate types are placed in the lowest memory address.
2620 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2622 static pad_direction
2623 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2625 /* On little-endian targets, the least significant byte of every stack
2626 argument is passed at the lowest byte address of the stack slot. */
2627 if (!BYTES_BIG_ENDIAN)
2628 return PAD_UPWARD;
2630 /* Otherwise, integral, floating-point and pointer types are padded downward:
2631 the least significant byte of a stack argument is passed at the highest
2632 byte address of the stack slot. */
2633 if (type
2634 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2635 || POINTER_TYPE_P (type))
2636 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2637 return PAD_DOWNWARD;
2639 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2640 return PAD_UPWARD;
2643 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2645 It specifies padding for the last (may also be the only)
2646 element of a block move between registers and memory. If
2647 assuming the block is in the memory, padding upward means that
2648 the last element is padded after its highest significant byte,
2649 while in downward padding, the last element is padded at the
2650 its least significant byte side.
2652 Small aggregates and small complex types are always padded
2653 upwards.
2655 We don't need to worry about homogeneous floating-point or
2656 short-vector aggregates; their move is not affected by the
2657 padding direction determined here. Regardless of endianness,
2658 each element of such an aggregate is put in the least
2659 significant bits of a fp/simd register.
2661 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2662 register has useful data, and return the opposite if the most
2663 significant byte does. */
2665 bool
2666 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2667 bool first ATTRIBUTE_UNUSED)
2670 /* Small composite types are always padded upward. */
2671 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2673 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2674 : GET_MODE_SIZE (mode));
2675 if (size < 2 * UNITS_PER_WORD)
2676 return true;
2679 /* Otherwise, use the default padding. */
2680 return !BYTES_BIG_ENDIAN;
2683 static scalar_int_mode
2684 aarch64_libgcc_cmp_return_mode (void)
2686 return SImode;
2689 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2691 /* We use the 12-bit shifted immediate arithmetic instructions so values
2692 must be multiple of (1 << 12), i.e. 4096. */
2693 #define ARITH_FACTOR 4096
2695 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2696 #error Cannot use simple address calculation for stack probing
2697 #endif
2699 /* The pair of scratch registers used for stack probing. */
2700 #define PROBE_STACK_FIRST_REG 9
2701 #define PROBE_STACK_SECOND_REG 10
2703 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2704 inclusive. These are offsets from the current stack pointer. */
2706 static void
2707 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2709 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2711 /* See the same assertion on PROBE_INTERVAL above. */
2712 gcc_assert ((first % ARITH_FACTOR) == 0);
2714 /* See if we have a constant small number of probes to generate. If so,
2715 that's the easy case. */
2716 if (size <= PROBE_INTERVAL)
2718 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2720 emit_set_insn (reg1,
2721 plus_constant (Pmode,
2722 stack_pointer_rtx, -(first + base)));
2723 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2726 /* The run-time loop is made up of 8 insns in the generic case while the
2727 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2728 else if (size <= 4 * PROBE_INTERVAL)
2730 HOST_WIDE_INT i, rem;
2732 emit_set_insn (reg1,
2733 plus_constant (Pmode,
2734 stack_pointer_rtx,
2735 -(first + PROBE_INTERVAL)));
2736 emit_stack_probe (reg1);
2738 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2739 it exceeds SIZE. If only two probes are needed, this will not
2740 generate any code. Then probe at FIRST + SIZE. */
2741 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2743 emit_set_insn (reg1,
2744 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2745 emit_stack_probe (reg1);
2748 rem = size - (i - PROBE_INTERVAL);
2749 if (rem > 256)
2751 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2753 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2754 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2756 else
2757 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2760 /* Otherwise, do the same as above, but in a loop. Note that we must be
2761 extra careful with variables wrapping around because we might be at
2762 the very top (or the very bottom) of the address space and we have
2763 to be able to handle this case properly; in particular, we use an
2764 equality test for the loop condition. */
2765 else
2767 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2769 /* Step 1: round SIZE to the previous multiple of the interval. */
2771 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2774 /* Step 2: compute initial and final value of the loop counter. */
2776 /* TEST_ADDR = SP + FIRST. */
2777 emit_set_insn (reg1,
2778 plus_constant (Pmode, stack_pointer_rtx, -first));
2780 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2781 HOST_WIDE_INT adjustment = - (first + rounded_size);
2782 if (! aarch64_uimm12_shift (adjustment))
2784 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2785 true, Pmode);
2786 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2788 else
2790 emit_set_insn (reg2,
2791 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2794 /* Step 3: the loop
2798 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2799 probe at TEST_ADDR
2801 while (TEST_ADDR != LAST_ADDR)
2803 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2804 until it is equal to ROUNDED_SIZE. */
2806 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2809 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2810 that SIZE is equal to ROUNDED_SIZE. */
2812 if (size != rounded_size)
2814 HOST_WIDE_INT rem = size - rounded_size;
2816 if (rem > 256)
2818 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2820 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2821 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2823 else
2824 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2828 /* Make sure nothing is scheduled before we are done. */
2829 emit_insn (gen_blockage ());
2832 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2833 absolute addresses. */
2835 const char *
2836 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2838 static int labelno = 0;
2839 char loop_lab[32];
2840 rtx xops[2];
2842 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2844 /* Loop. */
2845 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2847 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2848 xops[0] = reg1;
2849 xops[1] = GEN_INT (PROBE_INTERVAL);
2850 output_asm_insn ("sub\t%0, %0, %1", xops);
2852 /* Probe at TEST_ADDR. */
2853 output_asm_insn ("str\txzr, [%0]", xops);
2855 /* Test if TEST_ADDR == LAST_ADDR. */
2856 xops[1] = reg2;
2857 output_asm_insn ("cmp\t%0, %1", xops);
2859 /* Branch. */
2860 fputs ("\tb.ne\t", asm_out_file);
2861 assemble_name_raw (asm_out_file, loop_lab);
2862 fputc ('\n', asm_out_file);
2864 return "";
2867 static bool
2868 aarch64_frame_pointer_required (void)
2870 /* Use the frame pointer if enabled and it is not a leaf function, unless
2871 leaf frame pointer omission is disabled. If the frame pointer is enabled,
2872 force the frame pointer in leaf functions which use LR. */
2873 if (flag_omit_frame_pointer == 2
2874 && !(flag_omit_leaf_frame_pointer
2875 && crtl->is_leaf
2876 && !df_regs_ever_live_p (LR_REGNUM)))
2877 return true;
2879 return false;
2882 /* Mark the registers that need to be saved by the callee and calculate
2883 the size of the callee-saved registers area and frame record (both FP
2884 and LR may be omitted). If the function is not a leaf, ensure LR is
2885 saved at the bottom of the callee-save area. */
2886 static void
2887 aarch64_layout_frame (void)
2889 HOST_WIDE_INT offset = 0;
2890 int regno, last_fp_reg = INVALID_REGNUM;
2892 if (reload_completed && cfun->machine->frame.laid_out)
2893 return;
2895 /* Force a frame chain for EH returns so the return address is at FP+8. */
2896 cfun->machine->frame.emit_frame_chain
2897 = frame_pointer_needed || crtl->calls_eh_return;
2899 #define SLOT_NOT_REQUIRED (-2)
2900 #define SLOT_REQUIRED (-1)
2902 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2903 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2905 /* First mark all the registers that really need to be saved... */
2906 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2907 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2909 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2910 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2912 /* ... that includes the eh data registers (if needed)... */
2913 if (crtl->calls_eh_return)
2914 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2915 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2916 = SLOT_REQUIRED;
2918 /* ... and any callee saved register that dataflow says is live. */
2919 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2920 if (df_regs_ever_live_p (regno)
2921 && (regno == R30_REGNUM
2922 || !call_used_regs[regno]))
2923 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2925 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2926 if (df_regs_ever_live_p (regno)
2927 && !call_used_regs[regno])
2929 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2930 last_fp_reg = regno;
2933 if (cfun->machine->frame.emit_frame_chain)
2935 /* FP and LR are placed in the linkage record. */
2936 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2937 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2938 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2939 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2940 offset = 2 * UNITS_PER_WORD;
2942 else if (!crtl->is_leaf)
2944 /* Ensure LR is saved at the bottom of the callee-saves. */
2945 cfun->machine->frame.reg_offset[R30_REGNUM] = 0;
2946 cfun->machine->frame.wb_candidate1 = R30_REGNUM;
2947 offset = UNITS_PER_WORD;
2950 /* Now assign stack slots for them. */
2951 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2952 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2954 cfun->machine->frame.reg_offset[regno] = offset;
2955 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2956 cfun->machine->frame.wb_candidate1 = regno;
2957 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2958 cfun->machine->frame.wb_candidate2 = regno;
2959 offset += UNITS_PER_WORD;
2962 HOST_WIDE_INT max_int_offset = offset;
2963 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2964 bool has_align_gap = offset != max_int_offset;
2966 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2967 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2969 /* If there is an alignment gap between integer and fp callee-saves,
2970 allocate the last fp register to it if possible. */
2971 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2973 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2974 break;
2977 cfun->machine->frame.reg_offset[regno] = offset;
2978 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2979 cfun->machine->frame.wb_candidate1 = regno;
2980 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2981 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2982 cfun->machine->frame.wb_candidate2 = regno;
2983 offset += UNITS_PER_WORD;
2986 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2988 cfun->machine->frame.saved_regs_size = offset;
2990 HOST_WIDE_INT varargs_and_saved_regs_size
2991 = offset + cfun->machine->frame.saved_varargs_size;
2993 cfun->machine->frame.hard_fp_offset
2994 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2995 STACK_BOUNDARY / BITS_PER_UNIT);
2997 cfun->machine->frame.frame_size
2998 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2999 + crtl->outgoing_args_size,
3000 STACK_BOUNDARY / BITS_PER_UNIT);
3002 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3004 cfun->machine->frame.initial_adjust = 0;
3005 cfun->machine->frame.final_adjust = 0;
3006 cfun->machine->frame.callee_adjust = 0;
3007 cfun->machine->frame.callee_offset = 0;
3009 HOST_WIDE_INT max_push_offset = 0;
3010 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3011 max_push_offset = 512;
3012 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3013 max_push_offset = 256;
3015 if (cfun->machine->frame.frame_size < max_push_offset
3016 && crtl->outgoing_args_size == 0)
3018 /* Simple, small frame with no outgoing arguments:
3019 stp reg1, reg2, [sp, -frame_size]!
3020 stp reg3, reg4, [sp, 16] */
3021 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3023 else if ((crtl->outgoing_args_size
3024 + cfun->machine->frame.saved_regs_size < 512)
3025 && !(cfun->calls_alloca
3026 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3028 /* Frame with small outgoing arguments:
3029 sub sp, sp, frame_size
3030 stp reg1, reg2, [sp, outgoing_args_size]
3031 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3032 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3033 cfun->machine->frame.callee_offset
3034 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3036 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3038 /* Frame with large outgoing arguments but a small local area:
3039 stp reg1, reg2, [sp, -hard_fp_offset]!
3040 stp reg3, reg4, [sp, 16]
3041 sub sp, sp, outgoing_args_size */
3042 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3043 cfun->machine->frame.final_adjust
3044 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3046 else
3048 /* Frame with large local area and outgoing arguments using frame pointer:
3049 sub sp, sp, hard_fp_offset
3050 stp x29, x30, [sp, 0]
3051 add x29, sp, 0
3052 stp reg3, reg4, [sp, 16]
3053 sub sp, sp, outgoing_args_size */
3054 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3055 cfun->machine->frame.final_adjust
3056 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3059 cfun->machine->frame.laid_out = true;
3062 /* Return true if the register REGNO is saved on entry to
3063 the current function. */
3065 static bool
3066 aarch64_register_saved_on_entry (int regno)
3068 return cfun->machine->frame.reg_offset[regno] >= 0;
3071 /* Return the next register up from REGNO up to LIMIT for the callee
3072 to save. */
3074 static unsigned
3075 aarch64_next_callee_save (unsigned regno, unsigned limit)
3077 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3078 regno ++;
3079 return regno;
3082 /* Push the register number REGNO of mode MODE to the stack with write-back
3083 adjusting the stack by ADJUSTMENT. */
3085 static void
3086 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3087 HOST_WIDE_INT adjustment)
3089 rtx base_rtx = stack_pointer_rtx;
3090 rtx insn, reg, mem;
3092 reg = gen_rtx_REG (mode, regno);
3093 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3094 plus_constant (Pmode, base_rtx, -adjustment));
3095 mem = gen_frame_mem (mode, mem);
3097 insn = emit_move_insn (mem, reg);
3098 RTX_FRAME_RELATED_P (insn) = 1;
3101 /* Generate and return an instruction to store the pair of registers
3102 REG and REG2 of mode MODE to location BASE with write-back adjusting
3103 the stack location BASE by ADJUSTMENT. */
3105 static rtx
3106 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3107 HOST_WIDE_INT adjustment)
3109 switch (mode)
3111 case E_DImode:
3112 return gen_storewb_pairdi_di (base, base, reg, reg2,
3113 GEN_INT (-adjustment),
3114 GEN_INT (UNITS_PER_WORD - adjustment));
3115 case E_DFmode:
3116 return gen_storewb_pairdf_di (base, base, reg, reg2,
3117 GEN_INT (-adjustment),
3118 GEN_INT (UNITS_PER_WORD - adjustment));
3119 default:
3120 gcc_unreachable ();
3124 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3125 stack pointer by ADJUSTMENT. */
3127 static void
3128 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3130 rtx_insn *insn;
3131 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3133 if (regno2 == INVALID_REGNUM)
3134 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3136 rtx reg1 = gen_rtx_REG (mode, regno1);
3137 rtx reg2 = gen_rtx_REG (mode, regno2);
3139 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3140 reg2, adjustment));
3141 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3142 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3143 RTX_FRAME_RELATED_P (insn) = 1;
3146 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3147 adjusting it by ADJUSTMENT afterwards. */
3149 static rtx
3150 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3151 HOST_WIDE_INT adjustment)
3153 switch (mode)
3155 case E_DImode:
3156 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3157 GEN_INT (UNITS_PER_WORD));
3158 case E_DFmode:
3159 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3160 GEN_INT (UNITS_PER_WORD));
3161 default:
3162 gcc_unreachable ();
3166 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3167 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3168 into CFI_OPS. */
3170 static void
3171 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3172 rtx *cfi_ops)
3174 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3175 rtx reg1 = gen_rtx_REG (mode, regno1);
3177 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3179 if (regno2 == INVALID_REGNUM)
3181 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3182 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3183 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3185 else
3187 rtx reg2 = gen_rtx_REG (mode, regno2);
3188 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3189 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3190 reg2, adjustment));
3194 /* Generate and return a store pair instruction of mode MODE to store
3195 register REG1 to MEM1 and register REG2 to MEM2. */
3197 static rtx
3198 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3199 rtx reg2)
3201 switch (mode)
3203 case E_DImode:
3204 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3206 case E_DFmode:
3207 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3209 default:
3210 gcc_unreachable ();
3214 /* Generate and regurn a load pair isntruction of mode MODE to load register
3215 REG1 from MEM1 and register REG2 from MEM2. */
3217 static rtx
3218 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3219 rtx mem2)
3221 switch (mode)
3223 case E_DImode:
3224 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3226 case E_DFmode:
3227 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3229 default:
3230 gcc_unreachable ();
3234 /* Return TRUE if return address signing should be enabled for the current
3235 function, otherwise return FALSE. */
3237 bool
3238 aarch64_return_address_signing_enabled (void)
3240 /* This function should only be called after frame laid out. */
3241 gcc_assert (cfun->machine->frame.laid_out);
3243 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3244 if it's LR is pushed onto stack. */
3245 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3246 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3247 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3250 /* Emit code to save the callee-saved registers from register number START
3251 to LIMIT to the stack at the location starting at offset START_OFFSET,
3252 skipping any write-back candidates if SKIP_WB is true. */
3254 static void
3255 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3256 unsigned start, unsigned limit, bool skip_wb)
3258 rtx_insn *insn;
3259 unsigned regno;
3260 unsigned regno2;
3262 for (regno = aarch64_next_callee_save (start, limit);
3263 regno <= limit;
3264 regno = aarch64_next_callee_save (regno + 1, limit))
3266 rtx reg, mem;
3267 HOST_WIDE_INT offset;
3269 if (skip_wb
3270 && (regno == cfun->machine->frame.wb_candidate1
3271 || regno == cfun->machine->frame.wb_candidate2))
3272 continue;
3274 if (cfun->machine->reg_is_wrapped_separately[regno])
3275 continue;
3277 reg = gen_rtx_REG (mode, regno);
3278 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3279 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3280 offset));
3282 regno2 = aarch64_next_callee_save (regno + 1, limit);
3284 if (regno2 <= limit
3285 && !cfun->machine->reg_is_wrapped_separately[regno2]
3286 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3287 == cfun->machine->frame.reg_offset[regno2]))
3290 rtx reg2 = gen_rtx_REG (mode, regno2);
3291 rtx mem2;
3293 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3294 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3295 offset));
3296 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3297 reg2));
3299 /* The first part of a frame-related parallel insn is
3300 always assumed to be relevant to the frame
3301 calculations; subsequent parts, are only
3302 frame-related if explicitly marked. */
3303 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3304 regno = regno2;
3306 else
3307 insn = emit_move_insn (mem, reg);
3309 RTX_FRAME_RELATED_P (insn) = 1;
3313 /* Emit code to restore the callee registers of mode MODE from register
3314 number START up to and including LIMIT. Restore from the stack offset
3315 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3316 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3318 static void
3319 aarch64_restore_callee_saves (machine_mode mode,
3320 HOST_WIDE_INT start_offset, unsigned start,
3321 unsigned limit, bool skip_wb, rtx *cfi_ops)
3323 rtx base_rtx = stack_pointer_rtx;
3324 unsigned regno;
3325 unsigned regno2;
3326 HOST_WIDE_INT offset;
3328 for (regno = aarch64_next_callee_save (start, limit);
3329 regno <= limit;
3330 regno = aarch64_next_callee_save (regno + 1, limit))
3332 if (cfun->machine->reg_is_wrapped_separately[regno])
3333 continue;
3335 rtx reg, mem;
3337 if (skip_wb
3338 && (regno == cfun->machine->frame.wb_candidate1
3339 || regno == cfun->machine->frame.wb_candidate2))
3340 continue;
3342 reg = gen_rtx_REG (mode, regno);
3343 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3344 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3346 regno2 = aarch64_next_callee_save (regno + 1, limit);
3348 if (regno2 <= limit
3349 && !cfun->machine->reg_is_wrapped_separately[regno2]
3350 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3351 == cfun->machine->frame.reg_offset[regno2]))
3353 rtx reg2 = gen_rtx_REG (mode, regno2);
3354 rtx mem2;
3356 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3357 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3358 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3360 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3361 regno = regno2;
3363 else
3364 emit_move_insn (reg, mem);
3365 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3369 static inline bool
3370 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3371 HOST_WIDE_INT offset)
3373 return offset >= -256 && offset < 256;
3376 static inline bool
3377 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3379 return (offset >= 0
3380 && offset < 4096 * GET_MODE_SIZE (mode)
3381 && offset % GET_MODE_SIZE (mode) == 0);
3384 bool
3385 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3387 return (offset >= -64 * GET_MODE_SIZE (mode)
3388 && offset < 64 * GET_MODE_SIZE (mode)
3389 && offset % GET_MODE_SIZE (mode) == 0);
3392 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3394 static sbitmap
3395 aarch64_get_separate_components (void)
3397 aarch64_layout_frame ();
3399 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3400 bitmap_clear (components);
3402 /* The registers we need saved to the frame. */
3403 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3404 if (aarch64_register_saved_on_entry (regno))
3406 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3407 if (!frame_pointer_needed)
3408 offset += cfun->machine->frame.frame_size
3409 - cfun->machine->frame.hard_fp_offset;
3410 /* Check that we can access the stack slot of the register with one
3411 direct load with no adjustments needed. */
3412 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3413 bitmap_set_bit (components, regno);
3416 /* Don't mess with the hard frame pointer. */
3417 if (frame_pointer_needed)
3418 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3420 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3421 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3422 /* If aarch64_layout_frame has chosen registers to store/restore with
3423 writeback don't interfere with them to avoid having to output explicit
3424 stack adjustment instructions. */
3425 if (reg2 != INVALID_REGNUM)
3426 bitmap_clear_bit (components, reg2);
3427 if (reg1 != INVALID_REGNUM)
3428 bitmap_clear_bit (components, reg1);
3430 bitmap_clear_bit (components, LR_REGNUM);
3431 bitmap_clear_bit (components, SP_REGNUM);
3433 return components;
3436 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3438 static sbitmap
3439 aarch64_components_for_bb (basic_block bb)
3441 bitmap in = DF_LIVE_IN (bb);
3442 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3443 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3445 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3446 bitmap_clear (components);
3448 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3449 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3450 if ((!call_used_regs[regno])
3451 && (bitmap_bit_p (in, regno)
3452 || bitmap_bit_p (gen, regno)
3453 || bitmap_bit_p (kill, regno)))
3454 bitmap_set_bit (components, regno);
3456 return components;
3459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3460 Nothing to do for aarch64. */
3462 static void
3463 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3467 /* Return the next set bit in BMP from START onwards. Return the total number
3468 of bits in BMP if no set bit is found at or after START. */
3470 static unsigned int
3471 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3473 unsigned int nbits = SBITMAP_SIZE (bmp);
3474 if (start == nbits)
3475 return start;
3477 gcc_assert (start < nbits);
3478 for (unsigned int i = start; i < nbits; i++)
3479 if (bitmap_bit_p (bmp, i))
3480 return i;
3482 return nbits;
3485 /* Do the work for aarch64_emit_prologue_components and
3486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3488 for these components or the epilogue sequence. That is, it determines
3489 whether we should emit stores or loads and what kind of CFA notes to attach
3490 to the insns. Otherwise the logic for the two sequences is very
3491 similar. */
3493 static void
3494 aarch64_process_components (sbitmap components, bool prologue_p)
3496 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3497 ? HARD_FRAME_POINTER_REGNUM
3498 : STACK_POINTER_REGNUM);
3500 unsigned last_regno = SBITMAP_SIZE (components);
3501 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3502 rtx_insn *insn = NULL;
3504 while (regno != last_regno)
3506 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3507 so DFmode for the vector registers is enough. */
3508 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3509 rtx reg = gen_rtx_REG (mode, regno);
3510 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3511 if (!frame_pointer_needed)
3512 offset += cfun->machine->frame.frame_size
3513 - cfun->machine->frame.hard_fp_offset;
3514 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3515 rtx mem = gen_frame_mem (mode, addr);
3517 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3518 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3519 /* No more registers to handle after REGNO.
3520 Emit a single save/restore and exit. */
3521 if (regno2 == last_regno)
3523 insn = emit_insn (set);
3524 RTX_FRAME_RELATED_P (insn) = 1;
3525 if (prologue_p)
3526 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3527 else
3528 add_reg_note (insn, REG_CFA_RESTORE, reg);
3529 break;
3532 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3533 /* The next register is not of the same class or its offset is not
3534 mergeable with the current one into a pair. */
3535 if (!satisfies_constraint_Ump (mem)
3536 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3537 || (offset2 - cfun->machine->frame.reg_offset[regno])
3538 != GET_MODE_SIZE (mode))
3540 insn = emit_insn (set);
3541 RTX_FRAME_RELATED_P (insn) = 1;
3542 if (prologue_p)
3543 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3544 else
3545 add_reg_note (insn, REG_CFA_RESTORE, reg);
3547 regno = regno2;
3548 continue;
3551 /* REGNO2 can be saved/restored in a pair with REGNO. */
3552 rtx reg2 = gen_rtx_REG (mode, regno2);
3553 if (!frame_pointer_needed)
3554 offset2 += cfun->machine->frame.frame_size
3555 - cfun->machine->frame.hard_fp_offset;
3556 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3557 rtx mem2 = gen_frame_mem (mode, addr2);
3558 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3559 : gen_rtx_SET (reg2, mem2);
3561 if (prologue_p)
3562 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3563 else
3564 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3566 RTX_FRAME_RELATED_P (insn) = 1;
3567 if (prologue_p)
3569 add_reg_note (insn, REG_CFA_OFFSET, set);
3570 add_reg_note (insn, REG_CFA_OFFSET, set2);
3572 else
3574 add_reg_note (insn, REG_CFA_RESTORE, reg);
3575 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3578 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3582 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3584 static void
3585 aarch64_emit_prologue_components (sbitmap components)
3587 aarch64_process_components (components, true);
3590 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3592 static void
3593 aarch64_emit_epilogue_components (sbitmap components)
3595 aarch64_process_components (components, false);
3598 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3600 static void
3601 aarch64_set_handled_components (sbitmap components)
3603 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3604 if (bitmap_bit_p (components, regno))
3605 cfun->machine->reg_is_wrapped_separately[regno] = true;
3608 /* AArch64 stack frames generated by this compiler look like:
3610 +-------------------------------+
3612 | incoming stack arguments |
3614 +-------------------------------+
3615 | | <-- incoming stack pointer (aligned)
3616 | callee-allocated save area |
3617 | for register varargs |
3619 +-------------------------------+
3620 | local variables | <-- frame_pointer_rtx
3622 +-------------------------------+
3623 | padding0 | \
3624 +-------------------------------+ |
3625 | callee-saved registers | | frame.saved_regs_size
3626 +-------------------------------+ |
3627 | LR' | |
3628 +-------------------------------+ |
3629 | FP' | / <- hard_frame_pointer_rtx (aligned)
3630 +-------------------------------+
3631 | dynamic allocation |
3632 +-------------------------------+
3633 | padding |
3634 +-------------------------------+
3635 | outgoing stack arguments | <-- arg_pointer
3637 +-------------------------------+
3638 | | <-- stack_pointer_rtx (aligned)
3640 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3641 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3642 unchanged. */
3644 /* Generate the prologue instructions for entry into a function.
3645 Establish the stack frame by decreasing the stack pointer with a
3646 properly calculated size and, if necessary, create a frame record
3647 filled with the values of LR and previous frame pointer. The
3648 current FP is also set up if it is in use. */
3650 void
3651 aarch64_expand_prologue (void)
3653 aarch64_layout_frame ();
3655 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3656 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3657 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3658 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3659 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3660 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3661 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3662 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
3663 rtx_insn *insn;
3665 /* Sign return address for functions. */
3666 if (aarch64_return_address_signing_enabled ())
3668 insn = emit_insn (gen_pacisp ());
3669 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3670 RTX_FRAME_RELATED_P (insn) = 1;
3673 if (flag_stack_usage_info)
3674 current_function_static_stack_size = frame_size;
3676 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3678 if (crtl->is_leaf && !cfun->calls_alloca)
3680 if (frame_size > PROBE_INTERVAL
3681 && frame_size > get_stack_check_protect ())
3682 aarch64_emit_probe_stack_range (get_stack_check_protect (),
3683 (frame_size
3684 - get_stack_check_protect ()));
3686 else if (frame_size > 0)
3687 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
3690 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3692 if (callee_adjust != 0)
3693 aarch64_push_regs (reg1, reg2, callee_adjust);
3695 if (emit_frame_chain)
3697 if (callee_adjust == 0)
3698 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3699 R30_REGNUM, false);
3700 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3701 stack_pointer_rtx,
3702 GEN_INT (callee_offset)));
3703 RTX_FRAME_RELATED_P (insn) = frame_pointer_needed;
3704 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3707 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3708 callee_adjust != 0 || emit_frame_chain);
3709 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3710 callee_adjust != 0 || emit_frame_chain);
3711 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3714 /* Return TRUE if we can use a simple_return insn.
3716 This function checks whether the callee saved stack is empty, which
3717 means no restore actions are need. The pro_and_epilogue will use
3718 this to check whether shrink-wrapping opt is feasible. */
3720 bool
3721 aarch64_use_return_insn_p (void)
3723 if (!reload_completed)
3724 return false;
3726 if (crtl->profile)
3727 return false;
3729 aarch64_layout_frame ();
3731 return cfun->machine->frame.frame_size == 0;
3734 /* Generate the epilogue instructions for returning from a function.
3735 This is almost exactly the reverse of the prolog sequence, except
3736 that we need to insert barriers to avoid scheduling loads that read
3737 from a deallocated stack, and we optimize the unwind records by
3738 emitting them all together if possible. */
3739 void
3740 aarch64_expand_epilogue (bool for_sibcall)
3742 aarch64_layout_frame ();
3744 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3745 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3746 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3747 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3748 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3749 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3750 rtx cfi_ops = NULL;
3751 rtx_insn *insn;
3753 /* We need to add memory barrier to prevent read from deallocated stack. */
3754 bool need_barrier_p = (get_frame_size ()
3755 + cfun->machine->frame.saved_varargs_size) != 0;
3757 /* Emit a barrier to prevent loads from a deallocated stack. */
3758 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3759 || crtl->calls_eh_return)
3761 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3762 need_barrier_p = false;
3765 /* Restore the stack pointer from the frame pointer if it may not
3766 be the same as the stack pointer. */
3767 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3769 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3770 hard_frame_pointer_rtx,
3771 GEN_INT (-callee_offset)));
3772 /* If writeback is used when restoring callee-saves, the CFA
3773 is restored on the instruction doing the writeback. */
3774 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3776 else
3777 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3779 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3780 callee_adjust != 0, &cfi_ops);
3781 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3782 callee_adjust != 0, &cfi_ops);
3784 if (need_barrier_p)
3785 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3787 if (callee_adjust != 0)
3788 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3790 if (callee_adjust != 0 || initial_adjust > 65536)
3792 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3793 insn = get_last_insn ();
3794 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3795 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3796 RTX_FRAME_RELATED_P (insn) = 1;
3797 cfi_ops = NULL;
3800 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3802 if (cfi_ops)
3804 /* Emit delayed restores and reset the CFA to be SP. */
3805 insn = get_last_insn ();
3806 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3807 REG_NOTES (insn) = cfi_ops;
3808 RTX_FRAME_RELATED_P (insn) = 1;
3811 /* We prefer to emit the combined return/authenticate instruction RETAA,
3812 however there are three cases in which we must instead emit an explicit
3813 authentication instruction.
3815 1) Sibcalls don't return in a normal way, so if we're about to call one
3816 we must authenticate.
3818 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3819 generating code for !TARGET_ARMV8_3 we can't use it and must
3820 explicitly authenticate.
3822 3) On an eh_return path we make extra stack adjustments to update the
3823 canonical frame address to be the exception handler's CFA. We want
3824 to authenticate using the CFA of the function which calls eh_return.
3826 if (aarch64_return_address_signing_enabled ()
3827 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3829 insn = emit_insn (gen_autisp ());
3830 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3831 RTX_FRAME_RELATED_P (insn) = 1;
3834 /* Stack adjustment for exception handler. */
3835 if (crtl->calls_eh_return)
3837 /* We need to unwind the stack by the offset computed by
3838 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3839 to be SP; letting the CFA move during this adjustment
3840 is just as correct as retaining the CFA from the body
3841 of the function. Therefore, do nothing special. */
3842 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3845 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3846 if (!for_sibcall)
3847 emit_jump_insn (ret_rtx);
3850 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3851 normally or return to a previous frame after unwinding.
3853 An EH return uses a single shared return sequence. The epilogue is
3854 exactly like a normal epilogue except that it has an extra input
3855 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3856 that must be applied after the frame has been destroyed. An extra label
3857 is inserted before the epilogue which initializes this register to zero,
3858 and this is the entry point for a normal return.
3860 An actual EH return updates the return address, initializes the stack
3861 adjustment and jumps directly into the epilogue (bypassing the zeroing
3862 of the adjustment). Since the return address is typically saved on the
3863 stack when a function makes a call, the saved LR must be updated outside
3864 the epilogue.
3866 This poses problems as the store is generated well before the epilogue,
3867 so the offset of LR is not known yet. Also optimizations will remove the
3868 store as it appears dead, even after the epilogue is generated (as the
3869 base or offset for loading LR is different in many cases).
3871 To avoid these problems this implementation forces the frame pointer
3872 in eh_return functions so that the location of LR is fixed and known early.
3873 It also marks the store volatile, so no optimization is permitted to
3874 remove the store. */
3876 aarch64_eh_return_handler_rtx (void)
3878 rtx tmp = gen_frame_mem (Pmode,
3879 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3881 /* Mark the store volatile, so no optimization is permitted to remove it. */
3882 MEM_VOLATILE_P (tmp) = true;
3883 return tmp;
3886 /* Output code to add DELTA to the first argument, and then jump
3887 to FUNCTION. Used for C++ multiple inheritance. */
3888 static void
3889 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3890 HOST_WIDE_INT delta,
3891 HOST_WIDE_INT vcall_offset,
3892 tree function)
3894 /* The this pointer is always in x0. Note that this differs from
3895 Arm where the this pointer maybe bumped to r1 if r0 is required
3896 to return a pointer to an aggregate. On AArch64 a result value
3897 pointer will be in x8. */
3898 int this_regno = R0_REGNUM;
3899 rtx this_rtx, temp0, temp1, addr, funexp;
3900 rtx_insn *insn;
3902 reload_completed = 1;
3903 emit_note (NOTE_INSN_PROLOGUE_END);
3905 if (vcall_offset == 0)
3906 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3907 else
3909 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3911 this_rtx = gen_rtx_REG (Pmode, this_regno);
3912 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3913 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3915 addr = this_rtx;
3916 if (delta != 0)
3918 if (delta >= -256 && delta < 256)
3919 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3920 plus_constant (Pmode, this_rtx, delta));
3921 else
3922 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3925 if (Pmode == ptr_mode)
3926 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3927 else
3928 aarch64_emit_move (temp0,
3929 gen_rtx_ZERO_EXTEND (Pmode,
3930 gen_rtx_MEM (ptr_mode, addr)));
3932 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3933 addr = plus_constant (Pmode, temp0, vcall_offset);
3934 else
3936 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3937 Pmode);
3938 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3941 if (Pmode == ptr_mode)
3942 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3943 else
3944 aarch64_emit_move (temp1,
3945 gen_rtx_SIGN_EXTEND (Pmode,
3946 gen_rtx_MEM (ptr_mode, addr)));
3948 emit_insn (gen_add2_insn (this_rtx, temp1));
3951 /* Generate a tail call to the target function. */
3952 if (!TREE_USED (function))
3954 assemble_external (function);
3955 TREE_USED (function) = 1;
3957 funexp = XEXP (DECL_RTL (function), 0);
3958 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3959 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3960 SIBLING_CALL_P (insn) = 1;
3962 insn = get_insns ();
3963 shorten_branches (insn);
3964 final_start_function (insn, file, 1);
3965 final (insn, file, 1);
3966 final_end_function ();
3968 /* Stop pretending to be a post-reload pass. */
3969 reload_completed = 0;
3972 static bool
3973 aarch64_tls_referenced_p (rtx x)
3975 if (!TARGET_HAVE_TLS)
3976 return false;
3977 subrtx_iterator::array_type array;
3978 FOR_EACH_SUBRTX (iter, array, x, ALL)
3980 const_rtx x = *iter;
3981 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3982 return true;
3983 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3984 TLS offsets, not real symbol references. */
3985 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3986 iter.skip_subrtxes ();
3988 return false;
3992 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3993 a left shift of 0 or 12 bits. */
3994 bool
3995 aarch64_uimm12_shift (HOST_WIDE_INT val)
3997 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3998 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
4003 /* Return true if val is an immediate that can be loaded into a
4004 register by a MOVZ instruction. */
4005 static bool
4006 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
4008 if (GET_MODE_SIZE (mode) > 4)
4010 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4011 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4012 return 1;
4014 else
4016 /* Ignore sign extension. */
4017 val &= (HOST_WIDE_INT) 0xffffffff;
4019 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4020 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4023 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4025 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4027 0x0000000100000001ull,
4028 0x0001000100010001ull,
4029 0x0101010101010101ull,
4030 0x1111111111111111ull,
4031 0x5555555555555555ull,
4035 /* Return true if val is a valid bitmask immediate. */
4037 bool
4038 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4040 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4041 int bits;
4043 /* Check for a single sequence of one bits and return quickly if so.
4044 The special cases of all ones and all zeroes returns false. */
4045 val = (unsigned HOST_WIDE_INT) val_in;
4046 tmp = val + (val & -val);
4048 if (tmp == (tmp & -tmp))
4049 return (val + 1) > 1;
4051 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4052 if (mode == SImode)
4053 val = (val << 32) | (val & 0xffffffff);
4055 /* Invert if the immediate doesn't start with a zero bit - this means we
4056 only need to search for sequences of one bits. */
4057 if (val & 1)
4058 val = ~val;
4060 /* Find the first set bit and set tmp to val with the first sequence of one
4061 bits removed. Return success if there is a single sequence of ones. */
4062 first_one = val & -val;
4063 tmp = val & (val + first_one);
4065 if (tmp == 0)
4066 return true;
4068 /* Find the next set bit and compute the difference in bit position. */
4069 next_one = tmp & -tmp;
4070 bits = clz_hwi (first_one) - clz_hwi (next_one);
4071 mask = val ^ tmp;
4073 /* Check the bit position difference is a power of 2, and that the first
4074 sequence of one bits fits within 'bits' bits. */
4075 if ((mask >> bits) != 0 || bits != (bits & -bits))
4076 return false;
4078 /* Check the sequence of one bits is repeated 64/bits times. */
4079 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4082 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4083 Assumed precondition: VAL_IN Is not zero. */
4085 unsigned HOST_WIDE_INT
4086 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4088 int lowest_bit_set = ctz_hwi (val_in);
4089 int highest_bit_set = floor_log2 (val_in);
4090 gcc_assert (val_in != 0);
4092 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4093 (HOST_WIDE_INT_1U << lowest_bit_set));
4096 /* Create constant where bits outside of lowest bit set to highest bit set
4097 are set to 1. */
4099 unsigned HOST_WIDE_INT
4100 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4102 return val_in | ~aarch64_and_split_imm1 (val_in);
4105 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4107 bool
4108 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4110 scalar_int_mode int_mode;
4111 if (!is_a <scalar_int_mode> (mode, &int_mode))
4112 return false;
4114 if (aarch64_bitmask_imm (val_in, int_mode))
4115 return false;
4117 if (aarch64_move_imm (val_in, int_mode))
4118 return false;
4120 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4122 return aarch64_bitmask_imm (imm2, int_mode);
4125 /* Return true if val is an immediate that can be loaded into a
4126 register in a single instruction. */
4127 bool
4128 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4130 scalar_int_mode int_mode;
4131 if (!is_a <scalar_int_mode> (mode, &int_mode))
4132 return false;
4134 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4135 return 1;
4136 return aarch64_bitmask_imm (val, int_mode);
4139 static bool
4140 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4142 rtx base, offset;
4144 if (GET_CODE (x) == HIGH)
4145 return true;
4147 split_const (x, &base, &offset);
4148 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4150 if (aarch64_classify_symbol (base, offset)
4151 != SYMBOL_FORCE_TO_MEM)
4152 return true;
4153 else
4154 /* Avoid generating a 64-bit relocation in ILP32; leave
4155 to aarch64_expand_mov_immediate to handle it properly. */
4156 return mode != ptr_mode;
4159 return aarch64_tls_referenced_p (x);
4162 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4163 The expansion for a table switch is quite expensive due to the number
4164 of instructions, the table lookup and hard to predict indirect jump.
4165 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4166 set, otherwise use tables for > 16 cases as a tradeoff between size and
4167 performance. When optimizing for size, use the default setting. */
4169 static unsigned int
4170 aarch64_case_values_threshold (void)
4172 /* Use the specified limit for the number of cases before using jump
4173 tables at higher optimization levels. */
4174 if (optimize > 2
4175 && selected_cpu->tune->max_case_values != 0)
4176 return selected_cpu->tune->max_case_values;
4177 else
4178 return optimize_size ? default_case_values_threshold () : 17;
4181 /* Return true if register REGNO is a valid index register.
4182 STRICT_P is true if REG_OK_STRICT is in effect. */
4184 bool
4185 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4187 if (!HARD_REGISTER_NUM_P (regno))
4189 if (!strict_p)
4190 return true;
4192 if (!reg_renumber)
4193 return false;
4195 regno = reg_renumber[regno];
4197 return GP_REGNUM_P (regno);
4200 /* Return true if register REGNO is a valid base register for mode MODE.
4201 STRICT_P is true if REG_OK_STRICT is in effect. */
4203 bool
4204 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4206 if (!HARD_REGISTER_NUM_P (regno))
4208 if (!strict_p)
4209 return true;
4211 if (!reg_renumber)
4212 return false;
4214 regno = reg_renumber[regno];
4217 /* The fake registers will be eliminated to either the stack or
4218 hard frame pointer, both of which are usually valid base registers.
4219 Reload deals with the cases where the eliminated form isn't valid. */
4220 return (GP_REGNUM_P (regno)
4221 || regno == SP_REGNUM
4222 || regno == FRAME_POINTER_REGNUM
4223 || regno == ARG_POINTER_REGNUM);
4226 /* Return true if X is a valid base register for mode MODE.
4227 STRICT_P is true if REG_OK_STRICT is in effect. */
4229 static bool
4230 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4232 if (!strict_p
4233 && GET_CODE (x) == SUBREG
4234 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4235 x = SUBREG_REG (x);
4237 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4240 /* Return true if address offset is a valid index. If it is, fill in INFO
4241 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4243 static bool
4244 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4245 machine_mode mode, bool strict_p)
4247 enum aarch64_address_type type;
4248 rtx index;
4249 int shift;
4251 /* (reg:P) */
4252 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4253 && GET_MODE (x) == Pmode)
4255 type = ADDRESS_REG_REG;
4256 index = x;
4257 shift = 0;
4259 /* (sign_extend:DI (reg:SI)) */
4260 else if ((GET_CODE (x) == SIGN_EXTEND
4261 || GET_CODE (x) == ZERO_EXTEND)
4262 && GET_MODE (x) == DImode
4263 && GET_MODE (XEXP (x, 0)) == SImode)
4265 type = (GET_CODE (x) == SIGN_EXTEND)
4266 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267 index = XEXP (x, 0);
4268 shift = 0;
4270 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4271 else if (GET_CODE (x) == MULT
4272 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274 && GET_MODE (XEXP (x, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x, 1)))
4278 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280 index = XEXP (XEXP (x, 0), 0);
4281 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4283 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4284 else if (GET_CODE (x) == ASHIFT
4285 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4286 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4287 && GET_MODE (XEXP (x, 0)) == DImode
4288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4289 && CONST_INT_P (XEXP (x, 1)))
4291 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293 index = XEXP (XEXP (x, 0), 0);
4294 shift = INTVAL (XEXP (x, 1));
4296 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4297 else if ((GET_CODE (x) == SIGN_EXTRACT
4298 || GET_CODE (x) == ZERO_EXTRACT)
4299 && GET_MODE (x) == DImode
4300 && GET_CODE (XEXP (x, 0)) == MULT
4301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4302 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4304 type = (GET_CODE (x) == SIGN_EXTRACT)
4305 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4306 index = XEXP (XEXP (x, 0), 0);
4307 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4308 if (INTVAL (XEXP (x, 1)) != 32 + shift
4309 || INTVAL (XEXP (x, 2)) != 0)
4310 shift = -1;
4312 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4313 (const_int 0xffffffff<<shift)) */
4314 else if (GET_CODE (x) == AND
4315 && GET_MODE (x) == DImode
4316 && GET_CODE (XEXP (x, 0)) == MULT
4317 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4318 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4319 && CONST_INT_P (XEXP (x, 1)))
4321 type = ADDRESS_REG_UXTW;
4322 index = XEXP (XEXP (x, 0), 0);
4323 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4324 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4325 shift = -1;
4327 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4328 else if ((GET_CODE (x) == SIGN_EXTRACT
4329 || GET_CODE (x) == ZERO_EXTRACT)
4330 && GET_MODE (x) == DImode
4331 && GET_CODE (XEXP (x, 0)) == ASHIFT
4332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4333 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4335 type = (GET_CODE (x) == SIGN_EXTRACT)
4336 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4337 index = XEXP (XEXP (x, 0), 0);
4338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4339 if (INTVAL (XEXP (x, 1)) != 32 + shift
4340 || INTVAL (XEXP (x, 2)) != 0)
4341 shift = -1;
4343 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4344 (const_int 0xffffffff<<shift)) */
4345 else if (GET_CODE (x) == AND
4346 && GET_MODE (x) == DImode
4347 && GET_CODE (XEXP (x, 0)) == ASHIFT
4348 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4349 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4350 && CONST_INT_P (XEXP (x, 1)))
4352 type = ADDRESS_REG_UXTW;
4353 index = XEXP (XEXP (x, 0), 0);
4354 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4355 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4356 shift = -1;
4358 /* (mult:P (reg:P) (const_int scale)) */
4359 else if (GET_CODE (x) == MULT
4360 && GET_MODE (x) == Pmode
4361 && GET_MODE (XEXP (x, 0)) == Pmode
4362 && CONST_INT_P (XEXP (x, 1)))
4364 type = ADDRESS_REG_REG;
4365 index = XEXP (x, 0);
4366 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4368 /* (ashift:P (reg:P) (const_int shift)) */
4369 else if (GET_CODE (x) == ASHIFT
4370 && GET_MODE (x) == Pmode
4371 && GET_MODE (XEXP (x, 0)) == Pmode
4372 && CONST_INT_P (XEXP (x, 1)))
4374 type = ADDRESS_REG_REG;
4375 index = XEXP (x, 0);
4376 shift = INTVAL (XEXP (x, 1));
4378 else
4379 return false;
4381 if (!strict_p
4382 && GET_CODE (index) == SUBREG
4383 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4384 index = SUBREG_REG (index);
4386 if ((shift == 0 ||
4387 (shift > 0 && shift <= 3
4388 && (1 << shift) == GET_MODE_SIZE (mode)))
4389 && REG_P (index)
4390 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4392 info->type = type;
4393 info->offset = index;
4394 info->shift = shift;
4395 return true;
4398 return false;
4401 /* Return true if MODE is one of the modes for which we
4402 support LDP/STP operations. */
4404 static bool
4405 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4407 return mode == SImode || mode == DImode
4408 || mode == SFmode || mode == DFmode
4409 || (aarch64_vector_mode_supported_p (mode)
4410 && GET_MODE_SIZE (mode) == 8);
4413 /* Return true if REGNO is a virtual pointer register, or an eliminable
4414 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4415 include stack_pointer or hard_frame_pointer. */
4416 static bool
4417 virt_or_elim_regno_p (unsigned regno)
4419 return ((regno >= FIRST_VIRTUAL_REGISTER
4420 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4421 || regno == FRAME_POINTER_REGNUM
4422 || regno == ARG_POINTER_REGNUM);
4425 /* Return true if X is a valid address for machine mode MODE. If it is,
4426 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4427 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4429 static bool
4430 aarch64_classify_address (struct aarch64_address_info *info,
4431 rtx x, machine_mode mode,
4432 RTX_CODE outer_code, bool strict_p)
4434 enum rtx_code code = GET_CODE (x);
4435 rtx op0, op1;
4437 /* On BE, we use load/store pair for all large int mode load/stores.
4438 TI/TFmode may also use a load/store pair. */
4439 bool load_store_pair_p = (outer_code == PARALLEL
4440 || mode == TImode
4441 || mode == TFmode
4442 || (BYTES_BIG_ENDIAN
4443 && aarch64_vect_struct_mode_p (mode)));
4445 bool allow_reg_index_p =
4446 !load_store_pair_p
4447 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4448 && !aarch64_vect_struct_mode_p (mode);
4450 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4451 REG addressing. */
4452 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4453 && (code != POST_INC && code != REG))
4454 return false;
4456 switch (code)
4458 case REG:
4459 case SUBREG:
4460 info->type = ADDRESS_REG_IMM;
4461 info->base = x;
4462 info->offset = const0_rtx;
4463 return aarch64_base_register_rtx_p (x, strict_p);
4465 case PLUS:
4466 op0 = XEXP (x, 0);
4467 op1 = XEXP (x, 1);
4469 if (! strict_p
4470 && REG_P (op0)
4471 && virt_or_elim_regno_p (REGNO (op0))
4472 && CONST_INT_P (op1))
4474 info->type = ADDRESS_REG_IMM;
4475 info->base = op0;
4476 info->offset = op1;
4478 return true;
4481 if (GET_MODE_SIZE (mode) != 0
4482 && CONST_INT_P (op1)
4483 && aarch64_base_register_rtx_p (op0, strict_p))
4485 HOST_WIDE_INT offset = INTVAL (op1);
4487 info->type = ADDRESS_REG_IMM;
4488 info->base = op0;
4489 info->offset = op1;
4491 /* TImode and TFmode values are allowed in both pairs of X
4492 registers and individual Q registers. The available
4493 address modes are:
4494 X,X: 7-bit signed scaled offset
4495 Q: 9-bit signed offset
4496 We conservatively require an offset representable in either mode.
4497 When performing the check for pairs of X registers i.e. LDP/STP
4498 pass down DImode since that is the natural size of the LDP/STP
4499 instruction memory accesses. */
4500 if (mode == TImode || mode == TFmode)
4501 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4502 && (offset_9bit_signed_unscaled_p (mode, offset)
4503 || offset_12bit_unsigned_scaled_p (mode, offset)));
4505 /* A 7bit offset check because OImode will emit a ldp/stp
4506 instruction (only big endian will get here).
4507 For ldp/stp instructions, the offset is scaled for the size of a
4508 single element of the pair. */
4509 if (mode == OImode)
4510 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4512 /* Three 9/12 bit offsets checks because CImode will emit three
4513 ldr/str instructions (only big endian will get here). */
4514 if (mode == CImode)
4515 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4516 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4517 || offset_12bit_unsigned_scaled_p (V16QImode,
4518 offset + 32)));
4520 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4521 instructions (only big endian will get here). */
4522 if (mode == XImode)
4523 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4524 && aarch64_offset_7bit_signed_scaled_p (TImode,
4525 offset + 32));
4527 if (load_store_pair_p)
4528 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4529 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4530 else
4531 return (offset_9bit_signed_unscaled_p (mode, offset)
4532 || offset_12bit_unsigned_scaled_p (mode, offset));
4535 if (allow_reg_index_p)
4537 /* Look for base + (scaled/extended) index register. */
4538 if (aarch64_base_register_rtx_p (op0, strict_p)
4539 && aarch64_classify_index (info, op1, mode, strict_p))
4541 info->base = op0;
4542 return true;
4544 if (aarch64_base_register_rtx_p (op1, strict_p)
4545 && aarch64_classify_index (info, op0, mode, strict_p))
4547 info->base = op1;
4548 return true;
4552 return false;
4554 case POST_INC:
4555 case POST_DEC:
4556 case PRE_INC:
4557 case PRE_DEC:
4558 info->type = ADDRESS_REG_WB;
4559 info->base = XEXP (x, 0);
4560 info->offset = NULL_RTX;
4561 return aarch64_base_register_rtx_p (info->base, strict_p);
4563 case POST_MODIFY:
4564 case PRE_MODIFY:
4565 info->type = ADDRESS_REG_WB;
4566 info->base = XEXP (x, 0);
4567 if (GET_CODE (XEXP (x, 1)) == PLUS
4568 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4569 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4570 && aarch64_base_register_rtx_p (info->base, strict_p))
4572 HOST_WIDE_INT offset;
4573 info->offset = XEXP (XEXP (x, 1), 1);
4574 offset = INTVAL (info->offset);
4576 /* TImode and TFmode values are allowed in both pairs of X
4577 registers and individual Q registers. The available
4578 address modes are:
4579 X,X: 7-bit signed scaled offset
4580 Q: 9-bit signed offset
4581 We conservatively require an offset representable in either mode.
4583 if (mode == TImode || mode == TFmode)
4584 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4585 && offset_9bit_signed_unscaled_p (mode, offset));
4587 if (load_store_pair_p)
4588 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4589 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4590 else
4591 return offset_9bit_signed_unscaled_p (mode, offset);
4593 return false;
4595 case CONST:
4596 case SYMBOL_REF:
4597 case LABEL_REF:
4598 /* load literal: pc-relative constant pool entry. Only supported
4599 for SI mode or larger. */
4600 info->type = ADDRESS_SYMBOLIC;
4602 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4604 rtx sym, addend;
4606 split_const (x, &sym, &addend);
4607 return ((GET_CODE (sym) == LABEL_REF
4608 || (GET_CODE (sym) == SYMBOL_REF
4609 && CONSTANT_POOL_ADDRESS_P (sym)
4610 && aarch64_pcrelative_literal_loads)));
4612 return false;
4614 case LO_SUM:
4615 info->type = ADDRESS_LO_SUM;
4616 info->base = XEXP (x, 0);
4617 info->offset = XEXP (x, 1);
4618 if (allow_reg_index_p
4619 && aarch64_base_register_rtx_p (info->base, strict_p))
4621 rtx sym, offs;
4622 split_const (info->offset, &sym, &offs);
4623 if (GET_CODE (sym) == SYMBOL_REF
4624 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4626 /* The symbol and offset must be aligned to the access size. */
4627 unsigned int align;
4628 unsigned int ref_size;
4630 if (CONSTANT_POOL_ADDRESS_P (sym))
4631 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4632 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4634 tree exp = SYMBOL_REF_DECL (sym);
4635 align = TYPE_ALIGN (TREE_TYPE (exp));
4636 align = aarch64_constant_alignment (exp, align);
4638 else if (SYMBOL_REF_DECL (sym))
4639 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4640 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4641 && SYMBOL_REF_BLOCK (sym) != NULL)
4642 align = SYMBOL_REF_BLOCK (sym)->alignment;
4643 else
4644 align = BITS_PER_UNIT;
4646 ref_size = GET_MODE_SIZE (mode);
4647 if (ref_size == 0)
4648 ref_size = GET_MODE_SIZE (DImode);
4650 return ((INTVAL (offs) & (ref_size - 1)) == 0
4651 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4654 return false;
4656 default:
4657 return false;
4661 /* Return true if the address X is valid for a PRFM instruction.
4662 STRICT_P is true if we should do strict checking with
4663 aarch64_classify_address. */
4665 bool
4666 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4668 struct aarch64_address_info addr;
4670 /* PRFM accepts the same addresses as DImode... */
4671 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4672 if (!res)
4673 return false;
4675 /* ... except writeback forms. */
4676 return addr.type != ADDRESS_REG_WB;
4679 bool
4680 aarch64_symbolic_address_p (rtx x)
4682 rtx offset;
4684 split_const (x, &x, &offset);
4685 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4688 /* Classify the base of symbolic expression X. */
4690 enum aarch64_symbol_type
4691 aarch64_classify_symbolic_expression (rtx x)
4693 rtx offset;
4695 split_const (x, &x, &offset);
4696 return aarch64_classify_symbol (x, offset);
4700 /* Return TRUE if X is a legitimate address for accessing memory in
4701 mode MODE. */
4702 static bool
4703 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4705 struct aarch64_address_info addr;
4707 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4710 /* Return TRUE if X is a legitimate address for accessing memory in
4711 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4712 pair operation. */
4713 bool
4714 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4715 RTX_CODE outer_code, bool strict_p)
4717 struct aarch64_address_info addr;
4719 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4722 /* Split an out-of-range address displacement into a base and offset.
4723 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4724 to increase opportunities for sharing the base address of different sizes.
4725 Unaligned accesses use the signed 9-bit range, TImode/TFmode use
4726 the intersection of signed scaled 7-bit and signed 9-bit offset. */
4727 static bool
4728 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4730 HOST_WIDE_INT offset = INTVAL (*disp);
4731 HOST_WIDE_INT base;
4733 if (mode == TImode || mode == TFmode)
4734 base = (offset + 0x100) & ~0x1f8;
4735 else if ((offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4736 base = (offset + 0x100) & ~0x1ff;
4737 else
4738 base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4740 *off = GEN_INT (base);
4741 *disp = GEN_INT (offset - base);
4742 return true;
4745 /* Return the binary representation of floating point constant VALUE in INTVAL.
4746 If the value cannot be converted, return false without setting INTVAL.
4747 The conversion is done in the given MODE. */
4748 bool
4749 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4752 /* We make a general exception for 0. */
4753 if (aarch64_float_const_zero_rtx_p (value))
4755 *intval = 0;
4756 return true;
4759 machine_mode mode = GET_MODE (value);
4760 if (GET_CODE (value) != CONST_DOUBLE
4761 || !SCALAR_FLOAT_MODE_P (mode)
4762 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4763 /* Only support up to DF mode. */
4764 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4765 return false;
4767 unsigned HOST_WIDE_INT ival = 0;
4769 long res[2];
4770 real_to_target (res,
4771 CONST_DOUBLE_REAL_VALUE (value),
4772 REAL_MODE_FORMAT (mode));
4774 if (mode == DFmode)
4776 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4777 ival = zext_hwi (res[order], 32);
4778 ival |= (zext_hwi (res[1 - order], 32) << 32);
4780 else
4781 ival = zext_hwi (res[0], 32);
4783 *intval = ival;
4784 return true;
4787 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4788 single MOV(+MOVK) followed by an FMOV. */
4789 bool
4790 aarch64_float_const_rtx_p (rtx x)
4792 machine_mode mode = GET_MODE (x);
4793 if (mode == VOIDmode)
4794 return false;
4796 /* Determine whether it's cheaper to write float constants as
4797 mov/movk pairs over ldr/adrp pairs. */
4798 unsigned HOST_WIDE_INT ival;
4800 if (GET_CODE (x) == CONST_DOUBLE
4801 && SCALAR_FLOAT_MODE_P (mode)
4802 && aarch64_reinterpret_float_as_int (x, &ival))
4804 scalar_int_mode imode = (mode == HFmode
4805 ? SImode
4806 : int_mode_for_mode (mode).require ());
4807 int num_instr = aarch64_internal_mov_immediate
4808 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4809 return num_instr < 3;
4812 return false;
4815 /* Return TRUE if rtx X is immediate constant 0.0 */
4816 bool
4817 aarch64_float_const_zero_rtx_p (rtx x)
4819 if (GET_MODE (x) == VOIDmode)
4820 return false;
4822 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4823 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4824 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4827 /* Return TRUE if rtx X is immediate constant that fits in a single
4828 MOVI immediate operation. */
4829 bool
4830 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4832 if (!TARGET_SIMD)
4833 return false;
4835 machine_mode vmode;
4836 scalar_int_mode imode;
4837 unsigned HOST_WIDE_INT ival;
4839 if (GET_CODE (x) == CONST_DOUBLE
4840 && SCALAR_FLOAT_MODE_P (mode))
4842 if (!aarch64_reinterpret_float_as_int (x, &ival))
4843 return false;
4845 /* We make a general exception for 0. */
4846 if (aarch64_float_const_zero_rtx_p (x))
4847 return true;
4849 imode = int_mode_for_mode (mode).require ();
4851 else if (GET_CODE (x) == CONST_INT
4852 && is_a <scalar_int_mode> (mode, &imode))
4853 ival = INTVAL (x);
4854 else
4855 return false;
4857 /* use a 64 bit mode for everything except for DI/DF mode, where we use
4858 a 128 bit vector mode. */
4859 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4861 vmode = aarch64_simd_container_mode (imode, width);
4862 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4864 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4868 /* Return the fixed registers used for condition codes. */
4870 static bool
4871 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4873 *p1 = CC_REGNUM;
4874 *p2 = INVALID_REGNUM;
4875 return true;
4878 /* This function is used by the call expanders of the machine description.
4879 RESULT is the register in which the result is returned. It's NULL for
4880 "call" and "sibcall".
4881 MEM is the location of the function call.
4882 SIBCALL indicates whether this function call is normal call or sibling call.
4883 It will generate different pattern accordingly. */
4885 void
4886 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4888 rtx call, callee, tmp;
4889 rtvec vec;
4890 machine_mode mode;
4892 gcc_assert (MEM_P (mem));
4893 callee = XEXP (mem, 0);
4894 mode = GET_MODE (callee);
4895 gcc_assert (mode == Pmode);
4897 /* Decide if we should generate indirect calls by loading the
4898 address of the callee into a register before performing
4899 the branch-and-link. */
4900 if (SYMBOL_REF_P (callee)
4901 ? (aarch64_is_long_call_p (callee)
4902 || aarch64_is_noplt_call_p (callee))
4903 : !REG_P (callee))
4904 XEXP (mem, 0) = force_reg (mode, callee);
4906 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4908 if (result != NULL_RTX)
4909 call = gen_rtx_SET (result, call);
4911 if (sibcall)
4912 tmp = ret_rtx;
4913 else
4914 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4916 vec = gen_rtvec (2, call, tmp);
4917 call = gen_rtx_PARALLEL (VOIDmode, vec);
4919 aarch64_emit_call_insn (call);
4922 /* Emit call insn with PAT and do aarch64-specific handling. */
4924 void
4925 aarch64_emit_call_insn (rtx pat)
4927 rtx insn = emit_call_insn (pat);
4929 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4930 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4931 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4934 machine_mode
4935 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4937 /* All floating point compares return CCFP if it is an equality
4938 comparison, and CCFPE otherwise. */
4939 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4941 switch (code)
4943 case EQ:
4944 case NE:
4945 case UNORDERED:
4946 case ORDERED:
4947 case UNLT:
4948 case UNLE:
4949 case UNGT:
4950 case UNGE:
4951 case UNEQ:
4952 case LTGT:
4953 return CCFPmode;
4955 case LT:
4956 case LE:
4957 case GT:
4958 case GE:
4959 return CCFPEmode;
4961 default:
4962 gcc_unreachable ();
4966 /* Equality comparisons of short modes against zero can be performed
4967 using the TST instruction with the appropriate bitmask. */
4968 if (y == const0_rtx && REG_P (x)
4969 && (code == EQ || code == NE)
4970 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4971 return CC_NZmode;
4973 /* Similarly, comparisons of zero_extends from shorter modes can
4974 be performed using an ANDS with an immediate mask. */
4975 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4976 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4977 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4978 && (code == EQ || code == NE))
4979 return CC_NZmode;
4981 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4982 && y == const0_rtx
4983 && (code == EQ || code == NE || code == LT || code == GE)
4984 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4985 || GET_CODE (x) == NEG
4986 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4987 && CONST_INT_P (XEXP (x, 2)))))
4988 return CC_NZmode;
4990 /* A compare with a shifted operand. Because of canonicalization,
4991 the comparison will have to be swapped when we emit the assembly
4992 code. */
4993 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4994 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4995 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4996 || GET_CODE (x) == LSHIFTRT
4997 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4998 return CC_SWPmode;
5000 /* Similarly for a negated operand, but we can only do this for
5001 equalities. */
5002 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5003 && (REG_P (y) || GET_CODE (y) == SUBREG)
5004 && (code == EQ || code == NE)
5005 && GET_CODE (x) == NEG)
5006 return CC_Zmode;
5008 /* A test for unsigned overflow. */
5009 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
5010 && code == NE
5011 && GET_CODE (x) == PLUS
5012 && GET_CODE (y) == ZERO_EXTEND)
5013 return CC_Cmode;
5015 /* For everything else, return CCmode. */
5016 return CCmode;
5019 static int
5020 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5023 aarch64_get_condition_code (rtx x)
5025 machine_mode mode = GET_MODE (XEXP (x, 0));
5026 enum rtx_code comp_code = GET_CODE (x);
5028 if (GET_MODE_CLASS (mode) != MODE_CC)
5029 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5030 return aarch64_get_condition_code_1 (mode, comp_code);
5033 static int
5034 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5036 switch (mode)
5038 case E_CCFPmode:
5039 case E_CCFPEmode:
5040 switch (comp_code)
5042 case GE: return AARCH64_GE;
5043 case GT: return AARCH64_GT;
5044 case LE: return AARCH64_LS;
5045 case LT: return AARCH64_MI;
5046 case NE: return AARCH64_NE;
5047 case EQ: return AARCH64_EQ;
5048 case ORDERED: return AARCH64_VC;
5049 case UNORDERED: return AARCH64_VS;
5050 case UNLT: return AARCH64_LT;
5051 case UNLE: return AARCH64_LE;
5052 case UNGT: return AARCH64_HI;
5053 case UNGE: return AARCH64_PL;
5054 default: return -1;
5056 break;
5058 case E_CCmode:
5059 switch (comp_code)
5061 case NE: return AARCH64_NE;
5062 case EQ: return AARCH64_EQ;
5063 case GE: return AARCH64_GE;
5064 case GT: return AARCH64_GT;
5065 case LE: return AARCH64_LE;
5066 case LT: return AARCH64_LT;
5067 case GEU: return AARCH64_CS;
5068 case GTU: return AARCH64_HI;
5069 case LEU: return AARCH64_LS;
5070 case LTU: return AARCH64_CC;
5071 default: return -1;
5073 break;
5075 case E_CC_SWPmode:
5076 switch (comp_code)
5078 case NE: return AARCH64_NE;
5079 case EQ: return AARCH64_EQ;
5080 case GE: return AARCH64_LE;
5081 case GT: return AARCH64_LT;
5082 case LE: return AARCH64_GE;
5083 case LT: return AARCH64_GT;
5084 case GEU: return AARCH64_LS;
5085 case GTU: return AARCH64_CC;
5086 case LEU: return AARCH64_CS;
5087 case LTU: return AARCH64_HI;
5088 default: return -1;
5090 break;
5092 case E_CC_NZmode:
5093 switch (comp_code)
5095 case NE: return AARCH64_NE;
5096 case EQ: return AARCH64_EQ;
5097 case GE: return AARCH64_PL;
5098 case LT: return AARCH64_MI;
5099 default: return -1;
5101 break;
5103 case E_CC_Zmode:
5104 switch (comp_code)
5106 case NE: return AARCH64_NE;
5107 case EQ: return AARCH64_EQ;
5108 default: return -1;
5110 break;
5112 case E_CC_Cmode:
5113 switch (comp_code)
5115 case NE: return AARCH64_CS;
5116 case EQ: return AARCH64_CC;
5117 default: return -1;
5119 break;
5121 default:
5122 return -1;
5125 return -1;
5128 bool
5129 aarch64_const_vec_all_same_in_range_p (rtx x,
5130 HOST_WIDE_INT minval,
5131 HOST_WIDE_INT maxval)
5133 HOST_WIDE_INT firstval;
5134 int count, i;
5136 if (GET_CODE (x) != CONST_VECTOR
5137 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5138 return false;
5140 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5141 if (firstval < minval || firstval > maxval)
5142 return false;
5144 count = CONST_VECTOR_NUNITS (x);
5145 for (i = 1; i < count; i++)
5146 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5147 return false;
5149 return true;
5152 bool
5153 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5155 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5159 /* N Z C V. */
5160 #define AARCH64_CC_V 1
5161 #define AARCH64_CC_C (1 << 1)
5162 #define AARCH64_CC_Z (1 << 2)
5163 #define AARCH64_CC_N (1 << 3)
5165 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5166 static const int aarch64_nzcv_codes[] =
5168 0, /* EQ, Z == 1. */
5169 AARCH64_CC_Z, /* NE, Z == 0. */
5170 0, /* CS, C == 1. */
5171 AARCH64_CC_C, /* CC, C == 0. */
5172 0, /* MI, N == 1. */
5173 AARCH64_CC_N, /* PL, N == 0. */
5174 0, /* VS, V == 1. */
5175 AARCH64_CC_V, /* VC, V == 0. */
5176 0, /* HI, C ==1 && Z == 0. */
5177 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5178 AARCH64_CC_V, /* GE, N == V. */
5179 0, /* LT, N != V. */
5180 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5181 0, /* LE, !(Z == 0 && N == V). */
5182 0, /* AL, Any. */
5183 0 /* NV, Any. */
5186 /* Print operand X to file F in a target specific manner according to CODE.
5187 The acceptable formatting commands given by CODE are:
5188 'c': An integer or symbol address without a preceding #
5189 sign.
5190 'e': Print the sign/zero-extend size as a character 8->b,
5191 16->h, 32->w.
5192 'p': Prints N such that 2^N == X (X must be power of 2 and
5193 const int).
5194 'P': Print the number of non-zero bits in X (a const_int).
5195 'H': Print the higher numbered register of a pair (TImode)
5196 of regs.
5197 'm': Print a condition (eq, ne, etc).
5198 'M': Same as 'm', but invert condition.
5199 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5200 'S/T/U/V': Print a FP/SIMD register name for a register list.
5201 The register printed is the FP/SIMD register name
5202 of X + 0/1/2/3 for S/T/U/V.
5203 'R': Print a scalar FP/SIMD register name + 1.
5204 'X': Print bottom 16 bits of integer constant in hex.
5205 'w/x': Print a general register name or the zero register
5206 (32-bit or 64-bit).
5207 '0': Print a normal operand, if it's a general register,
5208 then we assume DImode.
5209 'k': Print NZCV for conditional compare instructions.
5210 'A': Output address constant representing the first
5211 argument of X, specifying a relocation offset
5212 if appropriate.
5213 'L': Output constant address specified by X
5214 with a relocation offset if appropriate.
5215 'G': Prints address of X, specifying a PC relative
5216 relocation mode if appropriate. */
5218 static void
5219 aarch64_print_operand (FILE *f, rtx x, int code)
5221 switch (code)
5223 case 'c':
5224 switch (GET_CODE (x))
5226 case CONST_INT:
5227 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5228 break;
5230 case SYMBOL_REF:
5231 output_addr_const (f, x);
5232 break;
5234 case CONST:
5235 if (GET_CODE (XEXP (x, 0)) == PLUS
5236 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5238 output_addr_const (f, x);
5239 break;
5241 /* Fall through. */
5243 default:
5244 output_operand_lossage ("Unsupported operand for code '%c'", code);
5246 break;
5248 case 'e':
5250 int n;
5252 if (!CONST_INT_P (x)
5253 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5255 output_operand_lossage ("invalid operand for '%%%c'", code);
5256 return;
5259 switch (n)
5261 case 3:
5262 fputc ('b', f);
5263 break;
5264 case 4:
5265 fputc ('h', f);
5266 break;
5267 case 5:
5268 fputc ('w', f);
5269 break;
5270 default:
5271 output_operand_lossage ("invalid operand for '%%%c'", code);
5272 return;
5275 break;
5277 case 'p':
5279 int n;
5281 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5283 output_operand_lossage ("invalid operand for '%%%c'", code);
5284 return;
5287 asm_fprintf (f, "%d", n);
5289 break;
5291 case 'P':
5292 if (!CONST_INT_P (x))
5294 output_operand_lossage ("invalid operand for '%%%c'", code);
5295 return;
5298 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5299 break;
5301 case 'H':
5302 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5304 output_operand_lossage ("invalid operand for '%%%c'", code);
5305 return;
5308 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5309 break;
5311 case 'M':
5312 case 'm':
5314 int cond_code;
5315 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5316 if (x == const_true_rtx)
5318 if (code == 'M')
5319 fputs ("nv", f);
5320 return;
5323 if (!COMPARISON_P (x))
5325 output_operand_lossage ("invalid operand for '%%%c'", code);
5326 return;
5329 cond_code = aarch64_get_condition_code (x);
5330 gcc_assert (cond_code >= 0);
5331 if (code == 'M')
5332 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5333 fputs (aarch64_condition_codes[cond_code], f);
5335 break;
5337 case 'b':
5338 case 'h':
5339 case 's':
5340 case 'd':
5341 case 'q':
5342 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5344 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5345 return;
5347 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5348 break;
5350 case 'S':
5351 case 'T':
5352 case 'U':
5353 case 'V':
5354 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5356 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5357 return;
5359 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5360 break;
5362 case 'R':
5363 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5365 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5366 return;
5368 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5369 break;
5371 case 'X':
5372 if (!CONST_INT_P (x))
5374 output_operand_lossage ("invalid operand for '%%%c'", code);
5375 return;
5377 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5378 break;
5380 case 'w':
5381 case 'x':
5382 if (x == const0_rtx
5383 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5385 asm_fprintf (f, "%czr", code);
5386 break;
5389 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5391 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5392 break;
5395 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5397 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5398 break;
5401 /* Fall through */
5403 case 0:
5404 if (x == NULL)
5406 output_operand_lossage ("missing operand");
5407 return;
5410 switch (GET_CODE (x))
5412 case REG:
5413 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5414 break;
5416 case MEM:
5417 output_address (GET_MODE (x), XEXP (x, 0));
5418 /* Check all memory references are Pmode - even with ILP32. */
5419 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5420 break;
5422 case CONST:
5423 case LABEL_REF:
5424 case SYMBOL_REF:
5425 output_addr_const (asm_out_file, x);
5426 break;
5428 case CONST_INT:
5429 asm_fprintf (f, "%wd", INTVAL (x));
5430 break;
5432 case CONST_VECTOR:
5433 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5435 gcc_assert (
5436 aarch64_const_vec_all_same_in_range_p (x,
5437 HOST_WIDE_INT_MIN,
5438 HOST_WIDE_INT_MAX));
5439 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5441 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5443 fputc ('0', f);
5445 else
5446 gcc_unreachable ();
5447 break;
5449 case CONST_DOUBLE:
5450 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5451 be getting CONST_DOUBLEs holding integers. */
5452 gcc_assert (GET_MODE (x) != VOIDmode);
5453 if (aarch64_float_const_zero_rtx_p (x))
5455 fputc ('0', f);
5456 break;
5458 else if (aarch64_float_const_representable_p (x))
5460 #define buf_size 20
5461 char float_buf[buf_size] = {'\0'};
5462 real_to_decimal_for_mode (float_buf,
5463 CONST_DOUBLE_REAL_VALUE (x),
5464 buf_size, buf_size,
5465 1, GET_MODE (x));
5466 asm_fprintf (asm_out_file, "%s", float_buf);
5467 break;
5468 #undef buf_size
5470 output_operand_lossage ("invalid constant");
5471 return;
5472 default:
5473 output_operand_lossage ("invalid operand");
5474 return;
5476 break;
5478 case 'A':
5479 if (GET_CODE (x) == HIGH)
5480 x = XEXP (x, 0);
5482 switch (aarch64_classify_symbolic_expression (x))
5484 case SYMBOL_SMALL_GOT_4G:
5485 asm_fprintf (asm_out_file, ":got:");
5486 break;
5488 case SYMBOL_SMALL_TLSGD:
5489 asm_fprintf (asm_out_file, ":tlsgd:");
5490 break;
5492 case SYMBOL_SMALL_TLSDESC:
5493 asm_fprintf (asm_out_file, ":tlsdesc:");
5494 break;
5496 case SYMBOL_SMALL_TLSIE:
5497 asm_fprintf (asm_out_file, ":gottprel:");
5498 break;
5500 case SYMBOL_TLSLE24:
5501 asm_fprintf (asm_out_file, ":tprel:");
5502 break;
5504 case SYMBOL_TINY_GOT:
5505 gcc_unreachable ();
5506 break;
5508 default:
5509 break;
5511 output_addr_const (asm_out_file, x);
5512 break;
5514 case 'L':
5515 switch (aarch64_classify_symbolic_expression (x))
5517 case SYMBOL_SMALL_GOT_4G:
5518 asm_fprintf (asm_out_file, ":lo12:");
5519 break;
5521 case SYMBOL_SMALL_TLSGD:
5522 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5523 break;
5525 case SYMBOL_SMALL_TLSDESC:
5526 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5527 break;
5529 case SYMBOL_SMALL_TLSIE:
5530 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5531 break;
5533 case SYMBOL_TLSLE12:
5534 asm_fprintf (asm_out_file, ":tprel_lo12:");
5535 break;
5537 case SYMBOL_TLSLE24:
5538 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5539 break;
5541 case SYMBOL_TINY_GOT:
5542 asm_fprintf (asm_out_file, ":got:");
5543 break;
5545 case SYMBOL_TINY_TLSIE:
5546 asm_fprintf (asm_out_file, ":gottprel:");
5547 break;
5549 default:
5550 break;
5552 output_addr_const (asm_out_file, x);
5553 break;
5555 case 'G':
5556 switch (aarch64_classify_symbolic_expression (x))
5558 case SYMBOL_TLSLE24:
5559 asm_fprintf (asm_out_file, ":tprel_hi12:");
5560 break;
5561 default:
5562 break;
5564 output_addr_const (asm_out_file, x);
5565 break;
5567 case 'k':
5569 HOST_WIDE_INT cond_code;
5571 if (!CONST_INT_P (x))
5573 output_operand_lossage ("invalid operand for '%%%c'", code);
5574 return;
5577 cond_code = INTVAL (x);
5578 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5579 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5581 break;
5583 default:
5584 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5585 return;
5589 static void
5590 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5592 struct aarch64_address_info addr;
5594 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5595 switch (addr.type)
5597 case ADDRESS_REG_IMM:
5598 if (addr.offset == const0_rtx)
5599 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5600 else
5601 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5602 INTVAL (addr.offset));
5603 return;
5605 case ADDRESS_REG_REG:
5606 if (addr.shift == 0)
5607 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5608 reg_names [REGNO (addr.offset)]);
5609 else
5610 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5611 reg_names [REGNO (addr.offset)], addr.shift);
5612 return;
5614 case ADDRESS_REG_UXTW:
5615 if (addr.shift == 0)
5616 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5617 REGNO (addr.offset) - R0_REGNUM);
5618 else
5619 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5620 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5621 return;
5623 case ADDRESS_REG_SXTW:
5624 if (addr.shift == 0)
5625 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5626 REGNO (addr.offset) - R0_REGNUM);
5627 else
5628 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5629 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5630 return;
5632 case ADDRESS_REG_WB:
5633 switch (GET_CODE (x))
5635 case PRE_INC:
5636 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5637 GET_MODE_SIZE (mode));
5638 return;
5639 case POST_INC:
5640 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5641 GET_MODE_SIZE (mode));
5642 return;
5643 case PRE_DEC:
5644 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5645 GET_MODE_SIZE (mode));
5646 return;
5647 case POST_DEC:
5648 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5649 GET_MODE_SIZE (mode));
5650 return;
5651 case PRE_MODIFY:
5652 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5653 INTVAL (addr.offset));
5654 return;
5655 case POST_MODIFY:
5656 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5657 INTVAL (addr.offset));
5658 return;
5659 default:
5660 break;
5662 break;
5664 case ADDRESS_LO_SUM:
5665 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5666 output_addr_const (f, addr.offset);
5667 asm_fprintf (f, "]");
5668 return;
5670 case ADDRESS_SYMBOLIC:
5671 break;
5674 output_addr_const (f, x);
5677 bool
5678 aarch64_label_mentioned_p (rtx x)
5680 const char *fmt;
5681 int i;
5683 if (GET_CODE (x) == LABEL_REF)
5684 return true;
5686 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5687 referencing instruction, but they are constant offsets, not
5688 symbols. */
5689 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5690 return false;
5692 fmt = GET_RTX_FORMAT (GET_CODE (x));
5693 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5695 if (fmt[i] == 'E')
5697 int j;
5699 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5700 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5701 return 1;
5703 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5704 return 1;
5707 return 0;
5710 /* Implement REGNO_REG_CLASS. */
5712 enum reg_class
5713 aarch64_regno_regclass (unsigned regno)
5715 if (GP_REGNUM_P (regno))
5716 return GENERAL_REGS;
5718 if (regno == SP_REGNUM)
5719 return STACK_REG;
5721 if (regno == FRAME_POINTER_REGNUM
5722 || regno == ARG_POINTER_REGNUM)
5723 return POINTER_REGS;
5725 if (FP_REGNUM_P (regno))
5726 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5728 return NO_REGS;
5731 static rtx
5732 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5734 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5735 where mask is selected by alignment and size of the offset.
5736 We try to pick as large a range for the offset as possible to
5737 maximize the chance of a CSE. However, for aligned addresses
5738 we limit the range to 4k so that structures with different sized
5739 elements are likely to use the same base. We need to be careful
5740 not to split a CONST for some forms of address expression, otherwise
5741 it will generate sub-optimal code. */
5743 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5745 rtx base = XEXP (x, 0);
5746 rtx offset_rtx = XEXP (x, 1);
5747 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5749 if (GET_CODE (base) == PLUS)
5751 rtx op0 = XEXP (base, 0);
5752 rtx op1 = XEXP (base, 1);
5754 /* Force any scaling into a temp for CSE. */
5755 op0 = force_reg (Pmode, op0);
5756 op1 = force_reg (Pmode, op1);
5758 /* Let the pointer register be in op0. */
5759 if (REG_POINTER (op1))
5760 std::swap (op0, op1);
5762 /* If the pointer is virtual or frame related, then we know that
5763 virtual register instantiation or register elimination is going
5764 to apply a second constant. We want the two constants folded
5765 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5766 if (virt_or_elim_regno_p (REGNO (op0)))
5768 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5769 NULL_RTX, true, OPTAB_DIRECT);
5770 return gen_rtx_PLUS (Pmode, base, op1);
5773 /* Otherwise, in order to encourage CSE (and thence loop strength
5774 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5775 base = expand_binop (Pmode, add_optab, op0, op1,
5776 NULL_RTX, true, OPTAB_DIRECT);
5777 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5780 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5781 HOST_WIDE_INT base_offset;
5782 if (GET_MODE_SIZE (mode) > 16)
5783 base_offset = (offset + 0x400) & ~0x7f0;
5784 /* For offsets aren't a multiple of the access size, the limit is
5785 -256...255. */
5786 else if (offset & (GET_MODE_SIZE (mode) - 1))
5788 base_offset = (offset + 0x100) & ~0x1ff;
5790 /* BLKmode typically uses LDP of X-registers. */
5791 if (mode == BLKmode)
5792 base_offset = (offset + 512) & ~0x3ff;
5794 /* Small negative offsets are supported. */
5795 else if (IN_RANGE (offset, -256, 0))
5796 base_offset = 0;
5797 else if (mode == TImode || mode == TFmode)
5798 base_offset = (offset + 0x100) & ~0x1ff;
5799 /* Use 12-bit offset by access size. */
5800 else
5801 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5803 if (base_offset != 0)
5805 base = plus_constant (Pmode, base, base_offset);
5806 base = force_operand (base, NULL_RTX);
5807 return plus_constant (Pmode, base, offset - base_offset);
5811 return x;
5814 /* Return the reload icode required for a constant pool in mode. */
5815 static enum insn_code
5816 aarch64_constant_pool_reload_icode (machine_mode mode)
5818 switch (mode)
5820 case E_SFmode:
5821 return CODE_FOR_aarch64_reload_movcpsfdi;
5823 case E_DFmode:
5824 return CODE_FOR_aarch64_reload_movcpdfdi;
5826 case E_TFmode:
5827 return CODE_FOR_aarch64_reload_movcptfdi;
5829 case E_V8QImode:
5830 return CODE_FOR_aarch64_reload_movcpv8qidi;
5832 case E_V16QImode:
5833 return CODE_FOR_aarch64_reload_movcpv16qidi;
5835 case E_V4HImode:
5836 return CODE_FOR_aarch64_reload_movcpv4hidi;
5838 case E_V8HImode:
5839 return CODE_FOR_aarch64_reload_movcpv8hidi;
5841 case E_V2SImode:
5842 return CODE_FOR_aarch64_reload_movcpv2sidi;
5844 case E_V4SImode:
5845 return CODE_FOR_aarch64_reload_movcpv4sidi;
5847 case E_V2DImode:
5848 return CODE_FOR_aarch64_reload_movcpv2didi;
5850 case E_V2DFmode:
5851 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5853 default:
5854 gcc_unreachable ();
5857 gcc_unreachable ();
5859 static reg_class_t
5860 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5861 reg_class_t rclass,
5862 machine_mode mode,
5863 secondary_reload_info *sri)
5866 /* If we have to disable direct literal pool loads and stores because the
5867 function is too big, then we need a scratch register. */
5868 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5869 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5870 || targetm.vector_mode_supported_p (GET_MODE (x)))
5871 && !aarch64_pcrelative_literal_loads)
5873 sri->icode = aarch64_constant_pool_reload_icode (mode);
5874 return NO_REGS;
5877 /* Without the TARGET_SIMD instructions we cannot move a Q register
5878 to a Q register directly. We need a scratch. */
5879 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5880 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5881 && reg_class_subset_p (rclass, FP_REGS))
5883 if (mode == TFmode)
5884 sri->icode = CODE_FOR_aarch64_reload_movtf;
5885 else if (mode == TImode)
5886 sri->icode = CODE_FOR_aarch64_reload_movti;
5887 return NO_REGS;
5890 /* A TFmode or TImode memory access should be handled via an FP_REGS
5891 because AArch64 has richer addressing modes for LDR/STR instructions
5892 than LDP/STP instructions. */
5893 if (TARGET_FLOAT && rclass == GENERAL_REGS
5894 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5895 return FP_REGS;
5897 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5898 return GENERAL_REGS;
5900 return NO_REGS;
5903 static bool
5904 aarch64_can_eliminate (const int from, const int to)
5906 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5907 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5909 if (frame_pointer_needed)
5911 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5912 return true;
5913 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5914 return false;
5915 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5916 && !cfun->calls_alloca)
5917 return true;
5918 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5919 return true;
5921 return false;
5923 else
5925 /* If we decided that we didn't need a leaf frame pointer but then used
5926 LR in the function, then we'll want a frame pointer after all, so
5927 prevent this elimination to ensure a frame pointer is used. */
5928 if (to == STACK_POINTER_REGNUM
5929 && flag_omit_frame_pointer == 2
5930 && flag_omit_leaf_frame_pointer
5931 && df_regs_ever_live_p (LR_REGNUM))
5932 return false;
5935 return true;
5938 HOST_WIDE_INT
5939 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5941 aarch64_layout_frame ();
5943 if (to == HARD_FRAME_POINTER_REGNUM)
5945 if (from == ARG_POINTER_REGNUM)
5946 return cfun->machine->frame.hard_fp_offset;
5948 if (from == FRAME_POINTER_REGNUM)
5949 return cfun->machine->frame.hard_fp_offset
5950 - cfun->machine->frame.locals_offset;
5953 if (to == STACK_POINTER_REGNUM)
5955 if (from == FRAME_POINTER_REGNUM)
5956 return cfun->machine->frame.frame_size
5957 - cfun->machine->frame.locals_offset;
5960 return cfun->machine->frame.frame_size;
5963 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5964 previous frame. */
5967 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5969 if (count != 0)
5970 return const0_rtx;
5971 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5975 static void
5976 aarch64_asm_trampoline_template (FILE *f)
5978 if (TARGET_ILP32)
5980 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5981 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5983 else
5985 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5986 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5988 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5989 assemble_aligned_integer (4, const0_rtx);
5990 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5991 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5994 static void
5995 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5997 rtx fnaddr, mem, a_tramp;
5998 const int tramp_code_sz = 16;
6000 /* Don't need to copy the trailing D-words, we fill those in below. */
6001 emit_block_move (m_tramp, assemble_trampoline_template (),
6002 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
6003 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
6004 fnaddr = XEXP (DECL_RTL (fndecl), 0);
6005 if (GET_MODE (fnaddr) != ptr_mode)
6006 fnaddr = convert_memory_address (ptr_mode, fnaddr);
6007 emit_move_insn (mem, fnaddr);
6009 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
6010 emit_move_insn (mem, chain_value);
6012 /* XXX We should really define a "clear_cache" pattern and use
6013 gen_clear_cache(). */
6014 a_tramp = XEXP (m_tramp, 0);
6015 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
6016 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6017 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6018 ptr_mode);
6021 static unsigned char
6022 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6024 switch (regclass)
6026 case CALLER_SAVE_REGS:
6027 case POINTER_REGS:
6028 case GENERAL_REGS:
6029 case ALL_REGS:
6030 case POINTER_AND_FP_REGS:
6031 case FP_REGS:
6032 case FP_LO_REGS:
6033 return
6034 aarch64_vector_mode_p (mode)
6035 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6036 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6037 case STACK_REG:
6038 return 1;
6040 case NO_REGS:
6041 return 0;
6043 default:
6044 break;
6046 gcc_unreachable ();
6049 static reg_class_t
6050 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6052 if (regclass == POINTER_REGS)
6053 return GENERAL_REGS;
6055 if (regclass == STACK_REG)
6057 if (REG_P(x)
6058 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6059 return regclass;
6061 return NO_REGS;
6064 /* Register eliminiation can result in a request for
6065 SP+constant->FP_REGS. We cannot support such operations which
6066 use SP as source and an FP_REG as destination, so reject out
6067 right now. */
6068 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6070 rtx lhs = XEXP (x, 0);
6072 /* Look through a possible SUBREG introduced by ILP32. */
6073 if (GET_CODE (lhs) == SUBREG)
6074 lhs = SUBREG_REG (lhs);
6076 gcc_assert (REG_P (lhs));
6077 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6078 POINTER_REGS));
6079 return NO_REGS;
6082 return regclass;
6085 void
6086 aarch64_asm_output_labelref (FILE* f, const char *name)
6088 asm_fprintf (f, "%U%s", name);
6091 static void
6092 aarch64_elf_asm_constructor (rtx symbol, int priority)
6094 if (priority == DEFAULT_INIT_PRIORITY)
6095 default_ctor_section_asm_out_constructor (symbol, priority);
6096 else
6098 section *s;
6099 /* While priority is known to be in range [0, 65535], so 18 bytes
6100 would be enough, the compiler might not know that. To avoid
6101 -Wformat-truncation false positive, use a larger size. */
6102 char buf[23];
6103 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6104 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6105 switch_to_section (s);
6106 assemble_align (POINTER_SIZE);
6107 assemble_aligned_integer (POINTER_BYTES, symbol);
6111 static void
6112 aarch64_elf_asm_destructor (rtx symbol, int priority)
6114 if (priority == DEFAULT_INIT_PRIORITY)
6115 default_dtor_section_asm_out_destructor (symbol, priority);
6116 else
6118 section *s;
6119 /* While priority is known to be in range [0, 65535], so 18 bytes
6120 would be enough, the compiler might not know that. To avoid
6121 -Wformat-truncation false positive, use a larger size. */
6122 char buf[23];
6123 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6124 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6125 switch_to_section (s);
6126 assemble_align (POINTER_SIZE);
6127 assemble_aligned_integer (POINTER_BYTES, symbol);
6131 const char*
6132 aarch64_output_casesi (rtx *operands)
6134 char buf[100];
6135 char label[100];
6136 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6137 int index;
6138 static const char *const patterns[4][2] =
6141 "ldrb\t%w3, [%0,%w1,uxtw]",
6142 "add\t%3, %4, %w3, sxtb #2"
6145 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6146 "add\t%3, %4, %w3, sxth #2"
6149 "ldr\t%w3, [%0,%w1,uxtw #2]",
6150 "add\t%3, %4, %w3, sxtw #2"
6152 /* We assume that DImode is only generated when not optimizing and
6153 that we don't really need 64-bit address offsets. That would
6154 imply an object file with 8GB of code in a single function! */
6156 "ldr\t%w3, [%0,%w1,uxtw #2]",
6157 "add\t%3, %4, %w3, sxtw #2"
6161 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6163 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6164 index = exact_log2 (GET_MODE_SIZE (mode));
6166 gcc_assert (index >= 0 && index <= 3);
6168 /* Need to implement table size reduction, by chaning the code below. */
6169 output_asm_insn (patterns[index][0], operands);
6170 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6171 snprintf (buf, sizeof (buf),
6172 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6173 output_asm_insn (buf, operands);
6174 output_asm_insn (patterns[index][1], operands);
6175 output_asm_insn ("br\t%3", operands);
6176 assemble_label (asm_out_file, label);
6177 return "";
6181 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6182 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6183 operator. */
6186 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6188 if (shift >= 0 && shift <= 3)
6190 int size;
6191 for (size = 8; size <= 32; size *= 2)
6193 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6194 if (mask == bits << shift)
6195 return size;
6198 return 0;
6201 /* Constant pools are per function only when PC relative
6202 literal loads are true or we are in the large memory
6203 model. */
6205 static inline bool
6206 aarch64_can_use_per_function_literal_pools_p (void)
6208 return (aarch64_pcrelative_literal_loads
6209 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6212 static bool
6213 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6215 /* Fixme:: In an ideal world this would work similar
6216 to the logic in aarch64_select_rtx_section but this
6217 breaks bootstrap in gcc go. For now we workaround
6218 this by returning false here. */
6219 return false;
6222 /* Select appropriate section for constants depending
6223 on where we place literal pools. */
6225 static section *
6226 aarch64_select_rtx_section (machine_mode mode,
6227 rtx x,
6228 unsigned HOST_WIDE_INT align)
6230 if (aarch64_can_use_per_function_literal_pools_p ())
6231 return function_section (current_function_decl);
6233 return default_elf_select_rtx_section (mode, x, align);
6236 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6237 void
6238 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6239 HOST_WIDE_INT offset)
6241 /* When using per-function literal pools, we must ensure that any code
6242 section is aligned to the minimal instruction length, lest we get
6243 errors from the assembler re "unaligned instructions". */
6244 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6245 ASM_OUTPUT_ALIGN (f, 2);
6248 /* Costs. */
6250 /* Helper function for rtx cost calculation. Strip a shift expression
6251 from X. Returns the inner operand if successful, or the original
6252 expression on failure. */
6253 static rtx
6254 aarch64_strip_shift (rtx x)
6256 rtx op = x;
6258 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6259 we can convert both to ROR during final output. */
6260 if ((GET_CODE (op) == ASHIFT
6261 || GET_CODE (op) == ASHIFTRT
6262 || GET_CODE (op) == LSHIFTRT
6263 || GET_CODE (op) == ROTATERT
6264 || GET_CODE (op) == ROTATE)
6265 && CONST_INT_P (XEXP (op, 1)))
6266 return XEXP (op, 0);
6268 if (GET_CODE (op) == MULT
6269 && CONST_INT_P (XEXP (op, 1))
6270 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6271 return XEXP (op, 0);
6273 return x;
6276 /* Helper function for rtx cost calculation. Strip an extend
6277 expression from X. Returns the inner operand if successful, or the
6278 original expression on failure. We deal with a number of possible
6279 canonicalization variations here. If STRIP_SHIFT is true, then
6280 we can strip off a shift also. */
6281 static rtx
6282 aarch64_strip_extend (rtx x, bool strip_shift)
6284 scalar_int_mode mode;
6285 rtx op = x;
6287 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6288 return op;
6290 /* Zero and sign extraction of a widened value. */
6291 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6292 && XEXP (op, 2) == const0_rtx
6293 && GET_CODE (XEXP (op, 0)) == MULT
6294 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6295 XEXP (op, 1)))
6296 return XEXP (XEXP (op, 0), 0);
6298 /* It can also be represented (for zero-extend) as an AND with an
6299 immediate. */
6300 if (GET_CODE (op) == AND
6301 && GET_CODE (XEXP (op, 0)) == MULT
6302 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6303 && CONST_INT_P (XEXP (op, 1))
6304 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6305 INTVAL (XEXP (op, 1))) != 0)
6306 return XEXP (XEXP (op, 0), 0);
6308 /* Now handle extended register, as this may also have an optional
6309 left shift by 1..4. */
6310 if (strip_shift
6311 && GET_CODE (op) == ASHIFT
6312 && CONST_INT_P (XEXP (op, 1))
6313 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6314 op = XEXP (op, 0);
6316 if (GET_CODE (op) == ZERO_EXTEND
6317 || GET_CODE (op) == SIGN_EXTEND)
6318 op = XEXP (op, 0);
6320 if (op != x)
6321 return op;
6323 return x;
6326 /* Return true iff CODE is a shift supported in combination
6327 with arithmetic instructions. */
6329 static bool
6330 aarch64_shift_p (enum rtx_code code)
6332 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6336 /* Return true iff X is a cheap shift without a sign extend. */
6338 static bool
6339 aarch64_cheap_mult_shift_p (rtx x)
6341 rtx op0, op1;
6343 op0 = XEXP (x, 0);
6344 op1 = XEXP (x, 1);
6346 if (!(aarch64_tune_params.extra_tuning_flags
6347 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6348 return false;
6350 if (GET_CODE (op0) == SIGN_EXTEND)
6351 return false;
6353 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6354 && UINTVAL (op1) <= 4)
6355 return true;
6357 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6358 return false;
6360 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6362 if (l2 > 0 && l2 <= 4)
6363 return true;
6365 return false;
6368 /* Helper function for rtx cost calculation. Calculate the cost of
6369 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6370 Return the calculated cost of the expression, recursing manually in to
6371 operands where needed. */
6373 static int
6374 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6376 rtx op0, op1;
6377 const struct cpu_cost_table *extra_cost
6378 = aarch64_tune_params.insn_extra_cost;
6379 int cost = 0;
6380 bool compound_p = (outer == PLUS || outer == MINUS);
6381 machine_mode mode = GET_MODE (x);
6383 gcc_checking_assert (code == MULT);
6385 op0 = XEXP (x, 0);
6386 op1 = XEXP (x, 1);
6388 if (VECTOR_MODE_P (mode))
6389 mode = GET_MODE_INNER (mode);
6391 /* Integer multiply/fma. */
6392 if (GET_MODE_CLASS (mode) == MODE_INT)
6394 /* The multiply will be canonicalized as a shift, cost it as such. */
6395 if (aarch64_shift_p (GET_CODE (x))
6396 || (CONST_INT_P (op1)
6397 && exact_log2 (INTVAL (op1)) > 0))
6399 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6400 || GET_CODE (op0) == SIGN_EXTEND;
6401 if (speed)
6403 if (compound_p)
6405 /* If the shift is considered cheap,
6406 then don't add any cost. */
6407 if (aarch64_cheap_mult_shift_p (x))
6409 else if (REG_P (op1))
6410 /* ARITH + shift-by-register. */
6411 cost += extra_cost->alu.arith_shift_reg;
6412 else if (is_extend)
6413 /* ARITH + extended register. We don't have a cost field
6414 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6415 cost += extra_cost->alu.extend_arith;
6416 else
6417 /* ARITH + shift-by-immediate. */
6418 cost += extra_cost->alu.arith_shift;
6420 else
6421 /* LSL (immediate). */
6422 cost += extra_cost->alu.shift;
6425 /* Strip extends as we will have costed them in the case above. */
6426 if (is_extend)
6427 op0 = aarch64_strip_extend (op0, true);
6429 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6431 return cost;
6434 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6435 compound and let the below cases handle it. After all, MNEG is a
6436 special-case alias of MSUB. */
6437 if (GET_CODE (op0) == NEG)
6439 op0 = XEXP (op0, 0);
6440 compound_p = true;
6443 /* Integer multiplies or FMAs have zero/sign extending variants. */
6444 if ((GET_CODE (op0) == ZERO_EXTEND
6445 && GET_CODE (op1) == ZERO_EXTEND)
6446 || (GET_CODE (op0) == SIGN_EXTEND
6447 && GET_CODE (op1) == SIGN_EXTEND))
6449 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6450 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6452 if (speed)
6454 if (compound_p)
6455 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6456 cost += extra_cost->mult[0].extend_add;
6457 else
6458 /* MUL/SMULL/UMULL. */
6459 cost += extra_cost->mult[0].extend;
6462 return cost;
6465 /* This is either an integer multiply or a MADD. In both cases
6466 we want to recurse and cost the operands. */
6467 cost += rtx_cost (op0, mode, MULT, 0, speed);
6468 cost += rtx_cost (op1, mode, MULT, 1, speed);
6470 if (speed)
6472 if (compound_p)
6473 /* MADD/MSUB. */
6474 cost += extra_cost->mult[mode == DImode].add;
6475 else
6476 /* MUL. */
6477 cost += extra_cost->mult[mode == DImode].simple;
6480 return cost;
6482 else
6484 if (speed)
6486 /* Floating-point FMA/FMUL can also support negations of the
6487 operands, unless the rounding mode is upward or downward in
6488 which case FNMUL is different than FMUL with operand negation. */
6489 bool neg0 = GET_CODE (op0) == NEG;
6490 bool neg1 = GET_CODE (op1) == NEG;
6491 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6493 if (neg0)
6494 op0 = XEXP (op0, 0);
6495 if (neg1)
6496 op1 = XEXP (op1, 0);
6499 if (compound_p)
6500 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6501 cost += extra_cost->fp[mode == DFmode].fma;
6502 else
6503 /* FMUL/FNMUL. */
6504 cost += extra_cost->fp[mode == DFmode].mult;
6507 cost += rtx_cost (op0, mode, MULT, 0, speed);
6508 cost += rtx_cost (op1, mode, MULT, 1, speed);
6509 return cost;
6513 static int
6514 aarch64_address_cost (rtx x,
6515 machine_mode mode,
6516 addr_space_t as ATTRIBUTE_UNUSED,
6517 bool speed)
6519 enum rtx_code c = GET_CODE (x);
6520 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6521 struct aarch64_address_info info;
6522 int cost = 0;
6523 info.shift = 0;
6525 if (!aarch64_classify_address (&info, x, mode, c, false))
6527 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6529 /* This is a CONST or SYMBOL ref which will be split
6530 in a different way depending on the code model in use.
6531 Cost it through the generic infrastructure. */
6532 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6533 /* Divide through by the cost of one instruction to
6534 bring it to the same units as the address costs. */
6535 cost_symbol_ref /= COSTS_N_INSNS (1);
6536 /* The cost is then the cost of preparing the address,
6537 followed by an immediate (possibly 0) offset. */
6538 return cost_symbol_ref + addr_cost->imm_offset;
6540 else
6542 /* This is most likely a jump table from a case
6543 statement. */
6544 return addr_cost->register_offset;
6548 switch (info.type)
6550 case ADDRESS_LO_SUM:
6551 case ADDRESS_SYMBOLIC:
6552 case ADDRESS_REG_IMM:
6553 cost += addr_cost->imm_offset;
6554 break;
6556 case ADDRESS_REG_WB:
6557 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6558 cost += addr_cost->pre_modify;
6559 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6560 cost += addr_cost->post_modify;
6561 else
6562 gcc_unreachable ();
6564 break;
6566 case ADDRESS_REG_REG:
6567 cost += addr_cost->register_offset;
6568 break;
6570 case ADDRESS_REG_SXTW:
6571 cost += addr_cost->register_sextend;
6572 break;
6574 case ADDRESS_REG_UXTW:
6575 cost += addr_cost->register_zextend;
6576 break;
6578 default:
6579 gcc_unreachable ();
6583 if (info.shift > 0)
6585 /* For the sake of calculating the cost of the shifted register
6586 component, we can treat same sized modes in the same way. */
6587 switch (GET_MODE_BITSIZE (mode))
6589 case 16:
6590 cost += addr_cost->addr_scale_costs.hi;
6591 break;
6593 case 32:
6594 cost += addr_cost->addr_scale_costs.si;
6595 break;
6597 case 64:
6598 cost += addr_cost->addr_scale_costs.di;
6599 break;
6601 /* We can't tell, or this is a 128-bit vector. */
6602 default:
6603 cost += addr_cost->addr_scale_costs.ti;
6604 break;
6608 return cost;
6611 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6612 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6613 to be taken. */
6616 aarch64_branch_cost (bool speed_p, bool predictable_p)
6618 /* When optimizing for speed, use the cost of unpredictable branches. */
6619 const struct cpu_branch_cost *branch_costs =
6620 aarch64_tune_params.branch_costs;
6622 if (!speed_p || predictable_p)
6623 return branch_costs->predictable;
6624 else
6625 return branch_costs->unpredictable;
6628 /* Return true if the RTX X in mode MODE is a zero or sign extract
6629 usable in an ADD or SUB (extended register) instruction. */
6630 static bool
6631 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6633 /* Catch add with a sign extract.
6634 This is add_<optab><mode>_multp2. */
6635 if (GET_CODE (x) == SIGN_EXTRACT
6636 || GET_CODE (x) == ZERO_EXTRACT)
6638 rtx op0 = XEXP (x, 0);
6639 rtx op1 = XEXP (x, 1);
6640 rtx op2 = XEXP (x, 2);
6642 if (GET_CODE (op0) == MULT
6643 && CONST_INT_P (op1)
6644 && op2 == const0_rtx
6645 && CONST_INT_P (XEXP (op0, 1))
6646 && aarch64_is_extend_from_extract (mode,
6647 XEXP (op0, 1),
6648 op1))
6650 return true;
6653 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6654 No shift. */
6655 else if (GET_CODE (x) == SIGN_EXTEND
6656 || GET_CODE (x) == ZERO_EXTEND)
6657 return REG_P (XEXP (x, 0));
6659 return false;
6662 static bool
6663 aarch64_frint_unspec_p (unsigned int u)
6665 switch (u)
6667 case UNSPEC_FRINTZ:
6668 case UNSPEC_FRINTP:
6669 case UNSPEC_FRINTM:
6670 case UNSPEC_FRINTA:
6671 case UNSPEC_FRINTN:
6672 case UNSPEC_FRINTX:
6673 case UNSPEC_FRINTI:
6674 return true;
6676 default:
6677 return false;
6681 /* Return true iff X is an rtx that will match an extr instruction
6682 i.e. as described in the *extr<mode>5_insn family of patterns.
6683 OP0 and OP1 will be set to the operands of the shifts involved
6684 on success and will be NULL_RTX otherwise. */
6686 static bool
6687 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6689 rtx op0, op1;
6690 scalar_int_mode mode;
6691 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6692 return false;
6694 *res_op0 = NULL_RTX;
6695 *res_op1 = NULL_RTX;
6697 if (GET_CODE (x) != IOR)
6698 return false;
6700 op0 = XEXP (x, 0);
6701 op1 = XEXP (x, 1);
6703 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6704 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6706 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6707 if (GET_CODE (op1) == ASHIFT)
6708 std::swap (op0, op1);
6710 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6711 return false;
6713 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6714 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6716 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6717 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6719 *res_op0 = XEXP (op0, 0);
6720 *res_op1 = XEXP (op1, 0);
6721 return true;
6725 return false;
6728 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6729 storing it in *COST. Result is true if the total cost of the operation
6730 has now been calculated. */
6731 static bool
6732 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6734 rtx inner;
6735 rtx comparator;
6736 enum rtx_code cmpcode;
6738 if (COMPARISON_P (op0))
6740 inner = XEXP (op0, 0);
6741 comparator = XEXP (op0, 1);
6742 cmpcode = GET_CODE (op0);
6744 else
6746 inner = op0;
6747 comparator = const0_rtx;
6748 cmpcode = NE;
6751 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6753 /* Conditional branch. */
6754 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6755 return true;
6756 else
6758 if (cmpcode == NE || cmpcode == EQ)
6760 if (comparator == const0_rtx)
6762 /* TBZ/TBNZ/CBZ/CBNZ. */
6763 if (GET_CODE (inner) == ZERO_EXTRACT)
6764 /* TBZ/TBNZ. */
6765 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6766 ZERO_EXTRACT, 0, speed);
6767 else
6768 /* CBZ/CBNZ. */
6769 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6771 return true;
6774 else if (cmpcode == LT || cmpcode == GE)
6776 /* TBZ/TBNZ. */
6777 if (comparator == const0_rtx)
6778 return true;
6782 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6784 /* CCMP. */
6785 if (GET_CODE (op1) == COMPARE)
6787 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6788 if (XEXP (op1, 1) == const0_rtx)
6789 *cost += 1;
6790 if (speed)
6792 machine_mode mode = GET_MODE (XEXP (op1, 0));
6793 const struct cpu_cost_table *extra_cost
6794 = aarch64_tune_params.insn_extra_cost;
6796 if (GET_MODE_CLASS (mode) == MODE_INT)
6797 *cost += extra_cost->alu.arith;
6798 else
6799 *cost += extra_cost->fp[mode == DFmode].compare;
6801 return true;
6804 /* It's a conditional operation based on the status flags,
6805 so it must be some flavor of CSEL. */
6807 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6808 if (GET_CODE (op1) == NEG
6809 || GET_CODE (op1) == NOT
6810 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6811 op1 = XEXP (op1, 0);
6812 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6814 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6815 op1 = XEXP (op1, 0);
6816 op2 = XEXP (op2, 0);
6819 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6820 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6821 return true;
6824 /* We don't know what this is, cost all operands. */
6825 return false;
6828 /* Check whether X is a bitfield operation of the form shift + extend that
6829 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6830 operand to which the bitfield operation is applied. Otherwise return
6831 NULL_RTX. */
6833 static rtx
6834 aarch64_extend_bitfield_pattern_p (rtx x)
6836 rtx_code outer_code = GET_CODE (x);
6837 machine_mode outer_mode = GET_MODE (x);
6839 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6840 && outer_mode != SImode && outer_mode != DImode)
6841 return NULL_RTX;
6843 rtx inner = XEXP (x, 0);
6844 rtx_code inner_code = GET_CODE (inner);
6845 machine_mode inner_mode = GET_MODE (inner);
6846 rtx op = NULL_RTX;
6848 switch (inner_code)
6850 case ASHIFT:
6851 if (CONST_INT_P (XEXP (inner, 1))
6852 && (inner_mode == QImode || inner_mode == HImode))
6853 op = XEXP (inner, 0);
6854 break;
6855 case LSHIFTRT:
6856 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6857 && (inner_mode == QImode || inner_mode == HImode))
6858 op = XEXP (inner, 0);
6859 break;
6860 case ASHIFTRT:
6861 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6862 && (inner_mode == QImode || inner_mode == HImode))
6863 op = XEXP (inner, 0);
6864 break;
6865 default:
6866 break;
6869 return op;
6872 /* Return true if the mask and a shift amount from an RTX of the form
6873 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6874 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6876 bool
6877 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6878 rtx shft_amnt)
6880 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6881 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6882 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6883 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6886 /* Calculate the cost of calculating X, storing it in *COST. Result
6887 is true if the total cost of the operation has now been calculated. */
6888 static bool
6889 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6890 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6892 rtx op0, op1, op2;
6893 const struct cpu_cost_table *extra_cost
6894 = aarch64_tune_params.insn_extra_cost;
6895 int code = GET_CODE (x);
6896 scalar_int_mode int_mode;
6898 /* By default, assume that everything has equivalent cost to the
6899 cheapest instruction. Any additional costs are applied as a delta
6900 above this default. */
6901 *cost = COSTS_N_INSNS (1);
6903 switch (code)
6905 case SET:
6906 /* The cost depends entirely on the operands to SET. */
6907 *cost = 0;
6908 op0 = SET_DEST (x);
6909 op1 = SET_SRC (x);
6911 switch (GET_CODE (op0))
6913 case MEM:
6914 if (speed)
6916 rtx address = XEXP (op0, 0);
6917 if (VECTOR_MODE_P (mode))
6918 *cost += extra_cost->ldst.storev;
6919 else if (GET_MODE_CLASS (mode) == MODE_INT)
6920 *cost += extra_cost->ldst.store;
6921 else if (mode == SFmode)
6922 *cost += extra_cost->ldst.storef;
6923 else if (mode == DFmode)
6924 *cost += extra_cost->ldst.stored;
6926 *cost +=
6927 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6928 0, speed));
6931 *cost += rtx_cost (op1, mode, SET, 1, speed);
6932 return true;
6934 case SUBREG:
6935 if (! REG_P (SUBREG_REG (op0)))
6936 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6938 /* Fall through. */
6939 case REG:
6940 /* The cost is one per vector-register copied. */
6941 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6943 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6944 / GET_MODE_SIZE (V4SImode);
6945 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6947 /* const0_rtx is in general free, but we will use an
6948 instruction to set a register to 0. */
6949 else if (REG_P (op1) || op1 == const0_rtx)
6951 /* The cost is 1 per register copied. */
6952 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6953 / UNITS_PER_WORD;
6954 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6956 else
6957 /* Cost is just the cost of the RHS of the set. */
6958 *cost += rtx_cost (op1, mode, SET, 1, speed);
6959 return true;
6961 case ZERO_EXTRACT:
6962 case SIGN_EXTRACT:
6963 /* Bit-field insertion. Strip any redundant widening of
6964 the RHS to meet the width of the target. */
6965 if (GET_CODE (op1) == SUBREG)
6966 op1 = SUBREG_REG (op1);
6967 if ((GET_CODE (op1) == ZERO_EXTEND
6968 || GET_CODE (op1) == SIGN_EXTEND)
6969 && CONST_INT_P (XEXP (op0, 1))
6970 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6971 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6972 op1 = XEXP (op1, 0);
6974 if (CONST_INT_P (op1))
6976 /* MOV immediate is assumed to always be cheap. */
6977 *cost = COSTS_N_INSNS (1);
6979 else
6981 /* BFM. */
6982 if (speed)
6983 *cost += extra_cost->alu.bfi;
6984 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6987 return true;
6989 default:
6990 /* We can't make sense of this, assume default cost. */
6991 *cost = COSTS_N_INSNS (1);
6992 return false;
6994 return false;
6996 case CONST_INT:
6997 /* If an instruction can incorporate a constant within the
6998 instruction, the instruction's expression avoids calling
6999 rtx_cost() on the constant. If rtx_cost() is called on a
7000 constant, then it is usually because the constant must be
7001 moved into a register by one or more instructions.
7003 The exception is constant 0, which can be expressed
7004 as XZR/WZR and is therefore free. The exception to this is
7005 if we have (set (reg) (const0_rtx)) in which case we must cost
7006 the move. However, we can catch that when we cost the SET, so
7007 we don't need to consider that here. */
7008 if (x == const0_rtx)
7009 *cost = 0;
7010 else
7012 /* To an approximation, building any other constant is
7013 proportionally expensive to the number of instructions
7014 required to build that constant. This is true whether we
7015 are compiling for SPEED or otherwise. */
7016 if (!is_a <scalar_int_mode> (mode, &int_mode))
7017 int_mode = word_mode;
7018 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7019 (NULL_RTX, x, false, int_mode));
7021 return true;
7023 case CONST_DOUBLE:
7025 /* First determine number of instructions to do the move
7026 as an integer constant. */
7027 if (!aarch64_float_const_representable_p (x)
7028 && !aarch64_can_const_movi_rtx_p (x, mode)
7029 && aarch64_float_const_rtx_p (x))
7031 unsigned HOST_WIDE_INT ival;
7032 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7033 gcc_assert (succeed);
7035 scalar_int_mode imode = (mode == HFmode
7036 ? SImode
7037 : int_mode_for_mode (mode).require ());
7038 int ncost = aarch64_internal_mov_immediate
7039 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7040 *cost += COSTS_N_INSNS (ncost);
7041 return true;
7044 if (speed)
7046 /* mov[df,sf]_aarch64. */
7047 if (aarch64_float_const_representable_p (x))
7048 /* FMOV (scalar immediate). */
7049 *cost += extra_cost->fp[mode == DFmode].fpconst;
7050 else if (!aarch64_float_const_zero_rtx_p (x))
7052 /* This will be a load from memory. */
7053 if (mode == DFmode)
7054 *cost += extra_cost->ldst.loadd;
7055 else
7056 *cost += extra_cost->ldst.loadf;
7058 else
7059 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7060 or MOV v0.s[0], wzr - neither of which are modeled by the
7061 cost tables. Just use the default cost. */
7066 return true;
7068 case MEM:
7069 if (speed)
7071 /* For loads we want the base cost of a load, plus an
7072 approximation for the additional cost of the addressing
7073 mode. */
7074 rtx address = XEXP (x, 0);
7075 if (VECTOR_MODE_P (mode))
7076 *cost += extra_cost->ldst.loadv;
7077 else if (GET_MODE_CLASS (mode) == MODE_INT)
7078 *cost += extra_cost->ldst.load;
7079 else if (mode == SFmode)
7080 *cost += extra_cost->ldst.loadf;
7081 else if (mode == DFmode)
7082 *cost += extra_cost->ldst.loadd;
7084 *cost +=
7085 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7086 0, speed));
7089 return true;
7091 case NEG:
7092 op0 = XEXP (x, 0);
7094 if (VECTOR_MODE_P (mode))
7096 if (speed)
7098 /* FNEG. */
7099 *cost += extra_cost->vect.alu;
7101 return false;
7104 if (GET_MODE_CLASS (mode) == MODE_INT)
7106 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7107 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7109 /* CSETM. */
7110 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7111 return true;
7114 /* Cost this as SUB wzr, X. */
7115 op0 = CONST0_RTX (mode);
7116 op1 = XEXP (x, 0);
7117 goto cost_minus;
7120 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7122 /* Support (neg(fma...)) as a single instruction only if
7123 sign of zeros is unimportant. This matches the decision
7124 making in aarch64.md. */
7125 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7127 /* FNMADD. */
7128 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7129 return true;
7131 if (GET_CODE (op0) == MULT)
7133 /* FNMUL. */
7134 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7135 return true;
7137 if (speed)
7138 /* FNEG. */
7139 *cost += extra_cost->fp[mode == DFmode].neg;
7140 return false;
7143 return false;
7145 case CLRSB:
7146 case CLZ:
7147 if (speed)
7149 if (VECTOR_MODE_P (mode))
7150 *cost += extra_cost->vect.alu;
7151 else
7152 *cost += extra_cost->alu.clz;
7155 return false;
7157 case COMPARE:
7158 op0 = XEXP (x, 0);
7159 op1 = XEXP (x, 1);
7161 if (op1 == const0_rtx
7162 && GET_CODE (op0) == AND)
7164 x = op0;
7165 mode = GET_MODE (op0);
7166 goto cost_logic;
7169 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7171 /* TODO: A write to the CC flags possibly costs extra, this
7172 needs encoding in the cost tables. */
7174 mode = GET_MODE (op0);
7175 /* ANDS. */
7176 if (GET_CODE (op0) == AND)
7178 x = op0;
7179 goto cost_logic;
7182 if (GET_CODE (op0) == PLUS)
7184 /* ADDS (and CMN alias). */
7185 x = op0;
7186 goto cost_plus;
7189 if (GET_CODE (op0) == MINUS)
7191 /* SUBS. */
7192 x = op0;
7193 goto cost_minus;
7196 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7197 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7198 && CONST_INT_P (XEXP (op0, 2)))
7200 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7201 Handle it here directly rather than going to cost_logic
7202 since we know the immediate generated for the TST is valid
7203 so we can avoid creating an intermediate rtx for it only
7204 for costing purposes. */
7205 if (speed)
7206 *cost += extra_cost->alu.logical;
7208 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7209 ZERO_EXTRACT, 0, speed);
7210 return true;
7213 if (GET_CODE (op1) == NEG)
7215 /* CMN. */
7216 if (speed)
7217 *cost += extra_cost->alu.arith;
7219 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7220 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7221 return true;
7224 /* CMP.
7226 Compare can freely swap the order of operands, and
7227 canonicalization puts the more complex operation first.
7228 But the integer MINUS logic expects the shift/extend
7229 operation in op1. */
7230 if (! (REG_P (op0)
7231 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7233 op0 = XEXP (x, 1);
7234 op1 = XEXP (x, 0);
7236 goto cost_minus;
7239 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7241 /* FCMP. */
7242 if (speed)
7243 *cost += extra_cost->fp[mode == DFmode].compare;
7245 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7247 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7248 /* FCMP supports constant 0.0 for no extra cost. */
7249 return true;
7251 return false;
7254 if (VECTOR_MODE_P (mode))
7256 /* Vector compare. */
7257 if (speed)
7258 *cost += extra_cost->vect.alu;
7260 if (aarch64_float_const_zero_rtx_p (op1))
7262 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7263 cost. */
7264 return true;
7266 return false;
7268 return false;
7270 case MINUS:
7272 op0 = XEXP (x, 0);
7273 op1 = XEXP (x, 1);
7275 cost_minus:
7276 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7278 /* Detect valid immediates. */
7279 if ((GET_MODE_CLASS (mode) == MODE_INT
7280 || (GET_MODE_CLASS (mode) == MODE_CC
7281 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7282 && CONST_INT_P (op1)
7283 && aarch64_uimm12_shift (INTVAL (op1)))
7285 if (speed)
7286 /* SUB(S) (immediate). */
7287 *cost += extra_cost->alu.arith;
7288 return true;
7291 /* Look for SUB (extended register). */
7292 if (is_a <scalar_int_mode> (mode, &int_mode)
7293 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7295 if (speed)
7296 *cost += extra_cost->alu.extend_arith;
7298 op1 = aarch64_strip_extend (op1, true);
7299 *cost += rtx_cost (op1, VOIDmode,
7300 (enum rtx_code) GET_CODE (op1), 0, speed);
7301 return true;
7304 rtx new_op1 = aarch64_strip_extend (op1, false);
7306 /* Cost this as an FMA-alike operation. */
7307 if ((GET_CODE (new_op1) == MULT
7308 || aarch64_shift_p (GET_CODE (new_op1)))
7309 && code != COMPARE)
7311 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7312 (enum rtx_code) code,
7313 speed);
7314 return true;
7317 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7319 if (speed)
7321 if (VECTOR_MODE_P (mode))
7323 /* Vector SUB. */
7324 *cost += extra_cost->vect.alu;
7326 else if (GET_MODE_CLASS (mode) == MODE_INT)
7328 /* SUB(S). */
7329 *cost += extra_cost->alu.arith;
7331 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7333 /* FSUB. */
7334 *cost += extra_cost->fp[mode == DFmode].addsub;
7337 return true;
7340 case PLUS:
7342 rtx new_op0;
7344 op0 = XEXP (x, 0);
7345 op1 = XEXP (x, 1);
7347 cost_plus:
7348 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7349 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7351 /* CSINC. */
7352 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7353 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7354 return true;
7357 if (GET_MODE_CLASS (mode) == MODE_INT
7358 && CONST_INT_P (op1)
7359 && aarch64_uimm12_shift (INTVAL (op1)))
7361 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7363 if (speed)
7364 /* ADD (immediate). */
7365 *cost += extra_cost->alu.arith;
7366 return true;
7369 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7371 /* Look for ADD (extended register). */
7372 if (is_a <scalar_int_mode> (mode, &int_mode)
7373 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7375 if (speed)
7376 *cost += extra_cost->alu.extend_arith;
7378 op0 = aarch64_strip_extend (op0, true);
7379 *cost += rtx_cost (op0, VOIDmode,
7380 (enum rtx_code) GET_CODE (op0), 0, speed);
7381 return true;
7384 /* Strip any extend, leave shifts behind as we will
7385 cost them through mult_cost. */
7386 new_op0 = aarch64_strip_extend (op0, false);
7388 if (GET_CODE (new_op0) == MULT
7389 || aarch64_shift_p (GET_CODE (new_op0)))
7391 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7392 speed);
7393 return true;
7396 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7398 if (speed)
7400 if (VECTOR_MODE_P (mode))
7402 /* Vector ADD. */
7403 *cost += extra_cost->vect.alu;
7405 else if (GET_MODE_CLASS (mode) == MODE_INT)
7407 /* ADD. */
7408 *cost += extra_cost->alu.arith;
7410 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7412 /* FADD. */
7413 *cost += extra_cost->fp[mode == DFmode].addsub;
7416 return true;
7419 case BSWAP:
7420 *cost = COSTS_N_INSNS (1);
7422 if (speed)
7424 if (VECTOR_MODE_P (mode))
7425 *cost += extra_cost->vect.alu;
7426 else
7427 *cost += extra_cost->alu.rev;
7429 return false;
7431 case IOR:
7432 if (aarch_rev16_p (x))
7434 *cost = COSTS_N_INSNS (1);
7436 if (speed)
7438 if (VECTOR_MODE_P (mode))
7439 *cost += extra_cost->vect.alu;
7440 else
7441 *cost += extra_cost->alu.rev;
7443 return true;
7446 if (aarch64_extr_rtx_p (x, &op0, &op1))
7448 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7449 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7450 if (speed)
7451 *cost += extra_cost->alu.shift;
7453 return true;
7455 /* Fall through. */
7456 case XOR:
7457 case AND:
7458 cost_logic:
7459 op0 = XEXP (x, 0);
7460 op1 = XEXP (x, 1);
7462 if (VECTOR_MODE_P (mode))
7464 if (speed)
7465 *cost += extra_cost->vect.alu;
7466 return true;
7469 if (code == AND
7470 && GET_CODE (op0) == MULT
7471 && CONST_INT_P (XEXP (op0, 1))
7472 && CONST_INT_P (op1)
7473 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7474 INTVAL (op1)) != 0)
7476 /* This is a UBFM/SBFM. */
7477 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7478 if (speed)
7479 *cost += extra_cost->alu.bfx;
7480 return true;
7483 if (is_int_mode (mode, &int_mode))
7485 if (CONST_INT_P (op1))
7487 /* We have a mask + shift version of a UBFIZ
7488 i.e. the *andim_ashift<mode>_bfiz pattern. */
7489 if (GET_CODE (op0) == ASHIFT
7490 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7491 XEXP (op0, 1)))
7493 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7494 (enum rtx_code) code, 0, speed);
7495 if (speed)
7496 *cost += extra_cost->alu.bfx;
7498 return true;
7500 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7502 /* We possibly get the immediate for free, this is not
7503 modelled. */
7504 *cost += rtx_cost (op0, int_mode,
7505 (enum rtx_code) code, 0, speed);
7506 if (speed)
7507 *cost += extra_cost->alu.logical;
7509 return true;
7512 else
7514 rtx new_op0 = op0;
7516 /* Handle ORN, EON, or BIC. */
7517 if (GET_CODE (op0) == NOT)
7518 op0 = XEXP (op0, 0);
7520 new_op0 = aarch64_strip_shift (op0);
7522 /* If we had a shift on op0 then this is a logical-shift-
7523 by-register/immediate operation. Otherwise, this is just
7524 a logical operation. */
7525 if (speed)
7527 if (new_op0 != op0)
7529 /* Shift by immediate. */
7530 if (CONST_INT_P (XEXP (op0, 1)))
7531 *cost += extra_cost->alu.log_shift;
7532 else
7533 *cost += extra_cost->alu.log_shift_reg;
7535 else
7536 *cost += extra_cost->alu.logical;
7539 /* In both cases we want to cost both operands. */
7540 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7541 0, speed);
7542 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7543 1, speed);
7545 return true;
7548 return false;
7550 case NOT:
7551 x = XEXP (x, 0);
7552 op0 = aarch64_strip_shift (x);
7554 if (VECTOR_MODE_P (mode))
7556 /* Vector NOT. */
7557 *cost += extra_cost->vect.alu;
7558 return false;
7561 /* MVN-shifted-reg. */
7562 if (op0 != x)
7564 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7566 if (speed)
7567 *cost += extra_cost->alu.log_shift;
7569 return true;
7571 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7572 Handle the second form here taking care that 'a' in the above can
7573 be a shift. */
7574 else if (GET_CODE (op0) == XOR)
7576 rtx newop0 = XEXP (op0, 0);
7577 rtx newop1 = XEXP (op0, 1);
7578 rtx op0_stripped = aarch64_strip_shift (newop0);
7580 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7581 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7583 if (speed)
7585 if (op0_stripped != newop0)
7586 *cost += extra_cost->alu.log_shift;
7587 else
7588 *cost += extra_cost->alu.logical;
7591 return true;
7593 /* MVN. */
7594 if (speed)
7595 *cost += extra_cost->alu.logical;
7597 return false;
7599 case ZERO_EXTEND:
7601 op0 = XEXP (x, 0);
7602 /* If a value is written in SI mode, then zero extended to DI
7603 mode, the operation will in general be free as a write to
7604 a 'w' register implicitly zeroes the upper bits of an 'x'
7605 register. However, if this is
7607 (set (reg) (zero_extend (reg)))
7609 we must cost the explicit register move. */
7610 if (mode == DImode
7611 && GET_MODE (op0) == SImode
7612 && outer == SET)
7614 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7616 /* If OP_COST is non-zero, then the cost of the zero extend
7617 is effectively the cost of the inner operation. Otherwise
7618 we have a MOV instruction and we take the cost from the MOV
7619 itself. This is true independently of whether we are
7620 optimizing for space or time. */
7621 if (op_cost)
7622 *cost = op_cost;
7624 return true;
7626 else if (MEM_P (op0))
7628 /* All loads can zero extend to any size for free. */
7629 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7630 return true;
7633 op0 = aarch64_extend_bitfield_pattern_p (x);
7634 if (op0)
7636 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7637 if (speed)
7638 *cost += extra_cost->alu.bfx;
7639 return true;
7642 if (speed)
7644 if (VECTOR_MODE_P (mode))
7646 /* UMOV. */
7647 *cost += extra_cost->vect.alu;
7649 else
7651 /* We generate an AND instead of UXTB/UXTH. */
7652 *cost += extra_cost->alu.logical;
7655 return false;
7657 case SIGN_EXTEND:
7658 if (MEM_P (XEXP (x, 0)))
7660 /* LDRSH. */
7661 if (speed)
7663 rtx address = XEXP (XEXP (x, 0), 0);
7664 *cost += extra_cost->ldst.load_sign_extend;
7666 *cost +=
7667 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7668 0, speed));
7670 return true;
7673 op0 = aarch64_extend_bitfield_pattern_p (x);
7674 if (op0)
7676 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7677 if (speed)
7678 *cost += extra_cost->alu.bfx;
7679 return true;
7682 if (speed)
7684 if (VECTOR_MODE_P (mode))
7685 *cost += extra_cost->vect.alu;
7686 else
7687 *cost += extra_cost->alu.extend;
7689 return false;
7691 case ASHIFT:
7692 op0 = XEXP (x, 0);
7693 op1 = XEXP (x, 1);
7695 if (CONST_INT_P (op1))
7697 if (speed)
7699 if (VECTOR_MODE_P (mode))
7701 /* Vector shift (immediate). */
7702 *cost += extra_cost->vect.alu;
7704 else
7706 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7707 aliases. */
7708 *cost += extra_cost->alu.shift;
7712 /* We can incorporate zero/sign extend for free. */
7713 if (GET_CODE (op0) == ZERO_EXTEND
7714 || GET_CODE (op0) == SIGN_EXTEND)
7715 op0 = XEXP (op0, 0);
7717 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7718 return true;
7720 else
7722 if (VECTOR_MODE_P (mode))
7724 if (speed)
7725 /* Vector shift (register). */
7726 *cost += extra_cost->vect.alu;
7728 else
7730 if (speed)
7731 /* LSLV. */
7732 *cost += extra_cost->alu.shift_reg;
7734 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7735 && CONST_INT_P (XEXP (op1, 1))
7736 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7738 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7739 /* We already demanded XEXP (op1, 0) to be REG_P, so
7740 don't recurse into it. */
7741 return true;
7744 return false; /* All arguments need to be in registers. */
7747 case ROTATE:
7748 case ROTATERT:
7749 case LSHIFTRT:
7750 case ASHIFTRT:
7751 op0 = XEXP (x, 0);
7752 op1 = XEXP (x, 1);
7754 if (CONST_INT_P (op1))
7756 /* ASR (immediate) and friends. */
7757 if (speed)
7759 if (VECTOR_MODE_P (mode))
7760 *cost += extra_cost->vect.alu;
7761 else
7762 *cost += extra_cost->alu.shift;
7765 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7766 return true;
7768 else
7770 if (VECTOR_MODE_P (mode))
7772 if (speed)
7773 /* Vector shift (register). */
7774 *cost += extra_cost->vect.alu;
7776 else
7778 if (speed)
7779 /* ASR (register) and friends. */
7780 *cost += extra_cost->alu.shift_reg;
7782 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7783 && CONST_INT_P (XEXP (op1, 1))
7784 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7786 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7787 /* We already demanded XEXP (op1, 0) to be REG_P, so
7788 don't recurse into it. */
7789 return true;
7792 return false; /* All arguments need to be in registers. */
7795 case SYMBOL_REF:
7797 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7798 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7800 /* LDR. */
7801 if (speed)
7802 *cost += extra_cost->ldst.load;
7804 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7805 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7807 /* ADRP, followed by ADD. */
7808 *cost += COSTS_N_INSNS (1);
7809 if (speed)
7810 *cost += 2 * extra_cost->alu.arith;
7812 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7813 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7815 /* ADR. */
7816 if (speed)
7817 *cost += extra_cost->alu.arith;
7820 if (flag_pic)
7822 /* One extra load instruction, after accessing the GOT. */
7823 *cost += COSTS_N_INSNS (1);
7824 if (speed)
7825 *cost += extra_cost->ldst.load;
7827 return true;
7829 case HIGH:
7830 case LO_SUM:
7831 /* ADRP/ADD (immediate). */
7832 if (speed)
7833 *cost += extra_cost->alu.arith;
7834 return true;
7836 case ZERO_EXTRACT:
7837 case SIGN_EXTRACT:
7838 /* UBFX/SBFX. */
7839 if (speed)
7841 if (VECTOR_MODE_P (mode))
7842 *cost += extra_cost->vect.alu;
7843 else
7844 *cost += extra_cost->alu.bfx;
7847 /* We can trust that the immediates used will be correct (there
7848 are no by-register forms), so we need only cost op0. */
7849 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7850 return true;
7852 case MULT:
7853 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7854 /* aarch64_rtx_mult_cost always handles recursion to its
7855 operands. */
7856 return true;
7858 case MOD:
7859 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7860 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7861 an unconditional negate. This case should only ever be reached through
7862 the set_smod_pow2_cheap check in expmed.c. */
7863 if (CONST_INT_P (XEXP (x, 1))
7864 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7865 && (mode == SImode || mode == DImode))
7867 /* We expand to 4 instructions. Reset the baseline. */
7868 *cost = COSTS_N_INSNS (4);
7870 if (speed)
7871 *cost += 2 * extra_cost->alu.logical
7872 + 2 * extra_cost->alu.arith;
7874 return true;
7877 /* Fall-through. */
7878 case UMOD:
7879 if (speed)
7881 /* Slighly prefer UMOD over SMOD. */
7882 if (VECTOR_MODE_P (mode))
7883 *cost += extra_cost->vect.alu;
7884 else if (GET_MODE_CLASS (mode) == MODE_INT)
7885 *cost += (extra_cost->mult[mode == DImode].add
7886 + extra_cost->mult[mode == DImode].idiv
7887 + (code == MOD ? 1 : 0));
7889 return false; /* All arguments need to be in registers. */
7891 case DIV:
7892 case UDIV:
7893 case SQRT:
7894 if (speed)
7896 if (VECTOR_MODE_P (mode))
7897 *cost += extra_cost->vect.alu;
7898 else if (GET_MODE_CLASS (mode) == MODE_INT)
7899 /* There is no integer SQRT, so only DIV and UDIV can get
7900 here. */
7901 *cost += (extra_cost->mult[mode == DImode].idiv
7902 /* Slighly prefer UDIV over SDIV. */
7903 + (code == DIV ? 1 : 0));
7904 else
7905 *cost += extra_cost->fp[mode == DFmode].div;
7907 return false; /* All arguments need to be in registers. */
7909 case IF_THEN_ELSE:
7910 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7911 XEXP (x, 2), cost, speed);
7913 case EQ:
7914 case NE:
7915 case GT:
7916 case GTU:
7917 case LT:
7918 case LTU:
7919 case GE:
7920 case GEU:
7921 case LE:
7922 case LEU:
7924 return false; /* All arguments must be in registers. */
7926 case FMA:
7927 op0 = XEXP (x, 0);
7928 op1 = XEXP (x, 1);
7929 op2 = XEXP (x, 2);
7931 if (speed)
7933 if (VECTOR_MODE_P (mode))
7934 *cost += extra_cost->vect.alu;
7935 else
7936 *cost += extra_cost->fp[mode == DFmode].fma;
7939 /* FMSUB, FNMADD, and FNMSUB are free. */
7940 if (GET_CODE (op0) == NEG)
7941 op0 = XEXP (op0, 0);
7943 if (GET_CODE (op2) == NEG)
7944 op2 = XEXP (op2, 0);
7946 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7947 and the by-element operand as operand 0. */
7948 if (GET_CODE (op1) == NEG)
7949 op1 = XEXP (op1, 0);
7951 /* Catch vector-by-element operations. The by-element operand can
7952 either be (vec_duplicate (vec_select (x))) or just
7953 (vec_select (x)), depending on whether we are multiplying by
7954 a vector or a scalar.
7956 Canonicalization is not very good in these cases, FMA4 will put the
7957 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7958 if (GET_CODE (op0) == VEC_DUPLICATE)
7959 op0 = XEXP (op0, 0);
7960 else if (GET_CODE (op1) == VEC_DUPLICATE)
7961 op1 = XEXP (op1, 0);
7963 if (GET_CODE (op0) == VEC_SELECT)
7964 op0 = XEXP (op0, 0);
7965 else if (GET_CODE (op1) == VEC_SELECT)
7966 op1 = XEXP (op1, 0);
7968 /* If the remaining parameters are not registers,
7969 get the cost to put them into registers. */
7970 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7971 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7972 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7973 return true;
7975 case FLOAT:
7976 case UNSIGNED_FLOAT:
7977 if (speed)
7978 *cost += extra_cost->fp[mode == DFmode].fromint;
7979 return false;
7981 case FLOAT_EXTEND:
7982 if (speed)
7984 if (VECTOR_MODE_P (mode))
7986 /*Vector truncate. */
7987 *cost += extra_cost->vect.alu;
7989 else
7990 *cost += extra_cost->fp[mode == DFmode].widen;
7992 return false;
7994 case FLOAT_TRUNCATE:
7995 if (speed)
7997 if (VECTOR_MODE_P (mode))
7999 /*Vector conversion. */
8000 *cost += extra_cost->vect.alu;
8002 else
8003 *cost += extra_cost->fp[mode == DFmode].narrow;
8005 return false;
8007 case FIX:
8008 case UNSIGNED_FIX:
8009 x = XEXP (x, 0);
8010 /* Strip the rounding part. They will all be implemented
8011 by the fcvt* family of instructions anyway. */
8012 if (GET_CODE (x) == UNSPEC)
8014 unsigned int uns_code = XINT (x, 1);
8016 if (uns_code == UNSPEC_FRINTA
8017 || uns_code == UNSPEC_FRINTM
8018 || uns_code == UNSPEC_FRINTN
8019 || uns_code == UNSPEC_FRINTP
8020 || uns_code == UNSPEC_FRINTZ)
8021 x = XVECEXP (x, 0, 0);
8024 if (speed)
8026 if (VECTOR_MODE_P (mode))
8027 *cost += extra_cost->vect.alu;
8028 else
8029 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8032 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8033 fixed-point fcvt. */
8034 if (GET_CODE (x) == MULT
8035 && ((VECTOR_MODE_P (mode)
8036 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8037 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8039 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8040 0, speed);
8041 return true;
8044 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8045 return true;
8047 case ABS:
8048 if (VECTOR_MODE_P (mode))
8050 /* ABS (vector). */
8051 if (speed)
8052 *cost += extra_cost->vect.alu;
8054 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8056 op0 = XEXP (x, 0);
8058 /* FABD, which is analogous to FADD. */
8059 if (GET_CODE (op0) == MINUS)
8061 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8062 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8063 if (speed)
8064 *cost += extra_cost->fp[mode == DFmode].addsub;
8066 return true;
8068 /* Simple FABS is analogous to FNEG. */
8069 if (speed)
8070 *cost += extra_cost->fp[mode == DFmode].neg;
8072 else
8074 /* Integer ABS will either be split to
8075 two arithmetic instructions, or will be an ABS
8076 (scalar), which we don't model. */
8077 *cost = COSTS_N_INSNS (2);
8078 if (speed)
8079 *cost += 2 * extra_cost->alu.arith;
8081 return false;
8083 case SMAX:
8084 case SMIN:
8085 if (speed)
8087 if (VECTOR_MODE_P (mode))
8088 *cost += extra_cost->vect.alu;
8089 else
8091 /* FMAXNM/FMINNM/FMAX/FMIN.
8092 TODO: This may not be accurate for all implementations, but
8093 we do not model this in the cost tables. */
8094 *cost += extra_cost->fp[mode == DFmode].addsub;
8097 return false;
8099 case UNSPEC:
8100 /* The floating point round to integer frint* instructions. */
8101 if (aarch64_frint_unspec_p (XINT (x, 1)))
8103 if (speed)
8104 *cost += extra_cost->fp[mode == DFmode].roundint;
8106 return false;
8109 if (XINT (x, 1) == UNSPEC_RBIT)
8111 if (speed)
8112 *cost += extra_cost->alu.rev;
8114 return false;
8116 break;
8118 case TRUNCATE:
8120 /* Decompose <su>muldi3_highpart. */
8121 if (/* (truncate:DI */
8122 mode == DImode
8123 /* (lshiftrt:TI */
8124 && GET_MODE (XEXP (x, 0)) == TImode
8125 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8126 /* (mult:TI */
8127 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8128 /* (ANY_EXTEND:TI (reg:DI))
8129 (ANY_EXTEND:TI (reg:DI))) */
8130 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8131 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8132 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8133 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8134 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8135 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8136 /* (const_int 64) */
8137 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8138 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8140 /* UMULH/SMULH. */
8141 if (speed)
8142 *cost += extra_cost->mult[mode == DImode].extend;
8143 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8144 mode, MULT, 0, speed);
8145 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8146 mode, MULT, 1, speed);
8147 return true;
8150 /* Fall through. */
8151 default:
8152 break;
8155 if (dump_file
8156 && flag_aarch64_verbose_cost)
8157 fprintf (dump_file,
8158 "\nFailed to cost RTX. Assuming default cost.\n");
8160 return true;
8163 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8164 calculated for X. This cost is stored in *COST. Returns true
8165 if the total cost of X was calculated. */
8166 static bool
8167 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8168 int param, int *cost, bool speed)
8170 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8172 if (dump_file
8173 && flag_aarch64_verbose_cost)
8175 print_rtl_single (dump_file, x);
8176 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8177 speed ? "Hot" : "Cold",
8178 *cost, result ? "final" : "partial");
8181 return result;
8184 static int
8185 aarch64_register_move_cost (machine_mode mode,
8186 reg_class_t from_i, reg_class_t to_i)
8188 enum reg_class from = (enum reg_class) from_i;
8189 enum reg_class to = (enum reg_class) to_i;
8190 const struct cpu_regmove_cost *regmove_cost
8191 = aarch64_tune_params.regmove_cost;
8193 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8194 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8195 to = GENERAL_REGS;
8197 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8198 from = GENERAL_REGS;
8200 /* Moving between GPR and stack cost is the same as GP2GP. */
8201 if ((from == GENERAL_REGS && to == STACK_REG)
8202 || (to == GENERAL_REGS && from == STACK_REG))
8203 return regmove_cost->GP2GP;
8205 /* To/From the stack register, we move via the gprs. */
8206 if (to == STACK_REG || from == STACK_REG)
8207 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8208 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8210 if (GET_MODE_SIZE (mode) == 16)
8212 /* 128-bit operations on general registers require 2 instructions. */
8213 if (from == GENERAL_REGS && to == GENERAL_REGS)
8214 return regmove_cost->GP2GP * 2;
8215 else if (from == GENERAL_REGS)
8216 return regmove_cost->GP2FP * 2;
8217 else if (to == GENERAL_REGS)
8218 return regmove_cost->FP2GP * 2;
8220 /* When AdvSIMD instructions are disabled it is not possible to move
8221 a 128-bit value directly between Q registers. This is handled in
8222 secondary reload. A general register is used as a scratch to move
8223 the upper DI value and the lower DI value is moved directly,
8224 hence the cost is the sum of three moves. */
8225 if (! TARGET_SIMD)
8226 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8228 return regmove_cost->FP2FP;
8231 if (from == GENERAL_REGS && to == GENERAL_REGS)
8232 return regmove_cost->GP2GP;
8233 else if (from == GENERAL_REGS)
8234 return regmove_cost->GP2FP;
8235 else if (to == GENERAL_REGS)
8236 return regmove_cost->FP2GP;
8238 return regmove_cost->FP2FP;
8241 static int
8242 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8243 reg_class_t rclass ATTRIBUTE_UNUSED,
8244 bool in ATTRIBUTE_UNUSED)
8246 return aarch64_tune_params.memmov_cost;
8249 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8250 to optimize 1.0/sqrt. */
8252 static bool
8253 use_rsqrt_p (machine_mode mode)
8255 return (!flag_trapping_math
8256 && flag_unsafe_math_optimizations
8257 && ((aarch64_tune_params.approx_modes->recip_sqrt
8258 & AARCH64_APPROX_MODE (mode))
8259 || flag_mrecip_low_precision_sqrt));
8262 /* Function to decide when to use the approximate reciprocal square root
8263 builtin. */
8265 static tree
8266 aarch64_builtin_reciprocal (tree fndecl)
8268 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8270 if (!use_rsqrt_p (mode))
8271 return NULL_TREE;
8272 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8275 typedef rtx (*rsqrte_type) (rtx, rtx);
8277 /* Select reciprocal square root initial estimate insn depending on machine
8278 mode. */
8280 static rsqrte_type
8281 get_rsqrte_type (machine_mode mode)
8283 switch (mode)
8285 case E_DFmode: return gen_aarch64_rsqrtedf;
8286 case E_SFmode: return gen_aarch64_rsqrtesf;
8287 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8288 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8289 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8290 default: gcc_unreachable ();
8294 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8296 /* Select reciprocal square root series step insn depending on machine mode. */
8298 static rsqrts_type
8299 get_rsqrts_type (machine_mode mode)
8301 switch (mode)
8303 case E_DFmode: return gen_aarch64_rsqrtsdf;
8304 case E_SFmode: return gen_aarch64_rsqrtssf;
8305 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8306 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8307 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8308 default: gcc_unreachable ();
8312 /* Emit instruction sequence to compute either the approximate square root
8313 or its approximate reciprocal, depending on the flag RECP, and return
8314 whether the sequence was emitted or not. */
8316 bool
8317 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8319 machine_mode mode = GET_MODE (dst);
8321 if (GET_MODE_INNER (mode) == HFmode)
8323 gcc_assert (!recp);
8324 return false;
8327 if (!recp)
8329 if (!(flag_mlow_precision_sqrt
8330 || (aarch64_tune_params.approx_modes->sqrt
8331 & AARCH64_APPROX_MODE (mode))))
8332 return false;
8334 if (flag_finite_math_only
8335 || flag_trapping_math
8336 || !flag_unsafe_math_optimizations
8337 || optimize_function_for_size_p (cfun))
8338 return false;
8340 else
8341 /* Caller assumes we cannot fail. */
8342 gcc_assert (use_rsqrt_p (mode));
8344 machine_mode mmsk = mode_for_int_vector (mode).require ();
8345 rtx xmsk = gen_reg_rtx (mmsk);
8346 if (!recp)
8347 /* When calculating the approximate square root, compare the
8348 argument with 0.0 and create a mask. */
8349 emit_insn (gen_rtx_SET (xmsk,
8350 gen_rtx_NEG (mmsk,
8351 gen_rtx_EQ (mmsk, src,
8352 CONST0_RTX (mode)))));
8354 /* Estimate the approximate reciprocal square root. */
8355 rtx xdst = gen_reg_rtx (mode);
8356 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8358 /* Iterate over the series twice for SF and thrice for DF. */
8359 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8361 /* Optionally iterate over the series once less for faster performance
8362 while sacrificing the accuracy. */
8363 if ((recp && flag_mrecip_low_precision_sqrt)
8364 || (!recp && flag_mlow_precision_sqrt))
8365 iterations--;
8367 /* Iterate over the series to calculate the approximate reciprocal square
8368 root. */
8369 rtx x1 = gen_reg_rtx (mode);
8370 while (iterations--)
8372 rtx x2 = gen_reg_rtx (mode);
8373 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8375 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8377 if (iterations > 0)
8378 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8381 if (!recp)
8383 /* Qualify the approximate reciprocal square root when the argument is
8384 0.0 by squashing the intermediary result to 0.0. */
8385 rtx xtmp = gen_reg_rtx (mmsk);
8386 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8387 gen_rtx_SUBREG (mmsk, xdst, 0)));
8388 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8390 /* Calculate the approximate square root. */
8391 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8394 /* Finalize the approximation. */
8395 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8397 return true;
8400 typedef rtx (*recpe_type) (rtx, rtx);
8402 /* Select reciprocal initial estimate insn depending on machine mode. */
8404 static recpe_type
8405 get_recpe_type (machine_mode mode)
8407 switch (mode)
8409 case E_SFmode: return (gen_aarch64_frecpesf);
8410 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8411 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8412 case E_DFmode: return (gen_aarch64_frecpedf);
8413 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8414 default: gcc_unreachable ();
8418 typedef rtx (*recps_type) (rtx, rtx, rtx);
8420 /* Select reciprocal series step insn depending on machine mode. */
8422 static recps_type
8423 get_recps_type (machine_mode mode)
8425 switch (mode)
8427 case E_SFmode: return (gen_aarch64_frecpssf);
8428 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8429 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8430 case E_DFmode: return (gen_aarch64_frecpsdf);
8431 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8432 default: gcc_unreachable ();
8436 /* Emit the instruction sequence to compute the approximation for the division
8437 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8439 bool
8440 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8442 machine_mode mode = GET_MODE (quo);
8444 if (GET_MODE_INNER (mode) == HFmode)
8445 return false;
8447 bool use_approx_division_p = (flag_mlow_precision_div
8448 || (aarch64_tune_params.approx_modes->division
8449 & AARCH64_APPROX_MODE (mode)));
8451 if (!flag_finite_math_only
8452 || flag_trapping_math
8453 || !flag_unsafe_math_optimizations
8454 || optimize_function_for_size_p (cfun)
8455 || !use_approx_division_p)
8456 return false;
8458 /* Estimate the approximate reciprocal. */
8459 rtx xrcp = gen_reg_rtx (mode);
8460 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8462 /* Iterate over the series twice for SF and thrice for DF. */
8463 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8465 /* Optionally iterate over the series once less for faster performance,
8466 while sacrificing the accuracy. */
8467 if (flag_mlow_precision_div)
8468 iterations--;
8470 /* Iterate over the series to calculate the approximate reciprocal. */
8471 rtx xtmp = gen_reg_rtx (mode);
8472 while (iterations--)
8474 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8476 if (iterations > 0)
8477 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8480 if (num != CONST1_RTX (mode))
8482 /* As the approximate reciprocal of DEN is already calculated, only
8483 calculate the approximate division when NUM is not 1.0. */
8484 rtx xnum = force_reg (mode, num);
8485 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8488 /* Finalize the approximation. */
8489 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8490 return true;
8493 /* Return the number of instructions that can be issued per cycle. */
8494 static int
8495 aarch64_sched_issue_rate (void)
8497 return aarch64_tune_params.issue_rate;
8500 static int
8501 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8503 int issue_rate = aarch64_sched_issue_rate ();
8505 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8509 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8510 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8511 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8513 static int
8514 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8515 int ready_index)
8517 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8521 /* Vectorizer cost model target hooks. */
8523 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8524 static int
8525 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8526 tree vectype,
8527 int misalign ATTRIBUTE_UNUSED)
8529 unsigned elements;
8530 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8531 bool fp = false;
8533 if (vectype != NULL)
8534 fp = FLOAT_TYPE_P (vectype);
8536 switch (type_of_cost)
8538 case scalar_stmt:
8539 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8541 case scalar_load:
8542 return costs->scalar_load_cost;
8544 case scalar_store:
8545 return costs->scalar_store_cost;
8547 case vector_stmt:
8548 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8550 case vector_load:
8551 return costs->vec_align_load_cost;
8553 case vector_store:
8554 return costs->vec_store_cost;
8556 case vec_to_scalar:
8557 return costs->vec_to_scalar_cost;
8559 case scalar_to_vec:
8560 return costs->scalar_to_vec_cost;
8562 case unaligned_load:
8563 case vector_gather_load:
8564 return costs->vec_unalign_load_cost;
8566 case unaligned_store:
8567 case vector_scatter_store:
8568 return costs->vec_unalign_store_cost;
8570 case cond_branch_taken:
8571 return costs->cond_taken_branch_cost;
8573 case cond_branch_not_taken:
8574 return costs->cond_not_taken_branch_cost;
8576 case vec_perm:
8577 return costs->vec_permute_cost;
8579 case vec_promote_demote:
8580 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8582 case vec_construct:
8583 elements = TYPE_VECTOR_SUBPARTS (vectype);
8584 return elements / 2 + 1;
8586 default:
8587 gcc_unreachable ();
8591 /* Implement targetm.vectorize.add_stmt_cost. */
8592 static unsigned
8593 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8594 struct _stmt_vec_info *stmt_info, int misalign,
8595 enum vect_cost_model_location where)
8597 unsigned *cost = (unsigned *) data;
8598 unsigned retval = 0;
8600 if (flag_vect_cost_model)
8602 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8603 int stmt_cost =
8604 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8606 /* Statements in an inner loop relative to the loop being
8607 vectorized are weighted more heavily. The value here is
8608 arbitrary and could potentially be improved with analysis. */
8609 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8610 count *= 50; /* FIXME */
8612 retval = (unsigned) (count * stmt_cost);
8613 cost[where] += retval;
8616 return retval;
8619 static void initialize_aarch64_code_model (struct gcc_options *);
8621 /* Parse the TO_PARSE string and put the architecture struct that it
8622 selects into RES and the architectural features into ISA_FLAGS.
8623 Return an aarch64_parse_opt_result describing the parse result.
8624 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8626 static enum aarch64_parse_opt_result
8627 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8628 unsigned long *isa_flags)
8630 char *ext;
8631 const struct processor *arch;
8632 char *str = (char *) alloca (strlen (to_parse) + 1);
8633 size_t len;
8635 strcpy (str, to_parse);
8637 ext = strchr (str, '+');
8639 if (ext != NULL)
8640 len = ext - str;
8641 else
8642 len = strlen (str);
8644 if (len == 0)
8645 return AARCH64_PARSE_MISSING_ARG;
8648 /* Loop through the list of supported ARCHes to find a match. */
8649 for (arch = all_architectures; arch->name != NULL; arch++)
8651 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8653 unsigned long isa_temp = arch->flags;
8655 if (ext != NULL)
8657 /* TO_PARSE string contains at least one extension. */
8658 enum aarch64_parse_opt_result ext_res
8659 = aarch64_parse_extension (ext, &isa_temp);
8661 if (ext_res != AARCH64_PARSE_OK)
8662 return ext_res;
8664 /* Extension parsing was successful. Confirm the result
8665 arch and ISA flags. */
8666 *res = arch;
8667 *isa_flags = isa_temp;
8668 return AARCH64_PARSE_OK;
8672 /* ARCH name not found in list. */
8673 return AARCH64_PARSE_INVALID_ARG;
8676 /* Parse the TO_PARSE string and put the result tuning in RES and the
8677 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8678 describing the parse result. If there is an error parsing, RES and
8679 ISA_FLAGS are left unchanged. */
8681 static enum aarch64_parse_opt_result
8682 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8683 unsigned long *isa_flags)
8685 char *ext;
8686 const struct processor *cpu;
8687 char *str = (char *) alloca (strlen (to_parse) + 1);
8688 size_t len;
8690 strcpy (str, to_parse);
8692 ext = strchr (str, '+');
8694 if (ext != NULL)
8695 len = ext - str;
8696 else
8697 len = strlen (str);
8699 if (len == 0)
8700 return AARCH64_PARSE_MISSING_ARG;
8703 /* Loop through the list of supported CPUs to find a match. */
8704 for (cpu = all_cores; cpu->name != NULL; cpu++)
8706 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8708 unsigned long isa_temp = cpu->flags;
8711 if (ext != NULL)
8713 /* TO_PARSE string contains at least one extension. */
8714 enum aarch64_parse_opt_result ext_res
8715 = aarch64_parse_extension (ext, &isa_temp);
8717 if (ext_res != AARCH64_PARSE_OK)
8718 return ext_res;
8720 /* Extension parsing was successfull. Confirm the result
8721 cpu and ISA flags. */
8722 *res = cpu;
8723 *isa_flags = isa_temp;
8724 return AARCH64_PARSE_OK;
8728 /* CPU name not found in list. */
8729 return AARCH64_PARSE_INVALID_ARG;
8732 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8733 Return an aarch64_parse_opt_result describing the parse result.
8734 If the parsing fails the RES does not change. */
8736 static enum aarch64_parse_opt_result
8737 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8739 const struct processor *cpu;
8740 char *str = (char *) alloca (strlen (to_parse) + 1);
8742 strcpy (str, to_parse);
8744 /* Loop through the list of supported CPUs to find a match. */
8745 for (cpu = all_cores; cpu->name != NULL; cpu++)
8747 if (strcmp (cpu->name, str) == 0)
8749 *res = cpu;
8750 return AARCH64_PARSE_OK;
8754 /* CPU name not found in list. */
8755 return AARCH64_PARSE_INVALID_ARG;
8758 /* Parse TOKEN, which has length LENGTH to see if it is an option
8759 described in FLAG. If it is, return the index bit for that fusion type.
8760 If not, error (printing OPTION_NAME) and return zero. */
8762 static unsigned int
8763 aarch64_parse_one_option_token (const char *token,
8764 size_t length,
8765 const struct aarch64_flag_desc *flag,
8766 const char *option_name)
8768 for (; flag->name != NULL; flag++)
8770 if (length == strlen (flag->name)
8771 && !strncmp (flag->name, token, length))
8772 return flag->flag;
8775 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8776 return 0;
8779 /* Parse OPTION which is a comma-separated list of flags to enable.
8780 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8781 default state we inherit from the CPU tuning structures. OPTION_NAME
8782 gives the top-level option we are parsing in the -moverride string,
8783 for use in error messages. */
8785 static unsigned int
8786 aarch64_parse_boolean_options (const char *option,
8787 const struct aarch64_flag_desc *flags,
8788 unsigned int initial_state,
8789 const char *option_name)
8791 const char separator = '.';
8792 const char* specs = option;
8793 const char* ntoken = option;
8794 unsigned int found_flags = initial_state;
8796 while ((ntoken = strchr (specs, separator)))
8798 size_t token_length = ntoken - specs;
8799 unsigned token_ops = aarch64_parse_one_option_token (specs,
8800 token_length,
8801 flags,
8802 option_name);
8803 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8804 in the token stream, reset the supported operations. So:
8806 adrp+add.cmp+branch.none.adrp+add
8808 would have the result of turning on only adrp+add fusion. */
8809 if (!token_ops)
8810 found_flags = 0;
8812 found_flags |= token_ops;
8813 specs = ++ntoken;
8816 /* We ended with a comma, print something. */
8817 if (!(*specs))
8819 error ("%s string ill-formed\n", option_name);
8820 return 0;
8823 /* We still have one more token to parse. */
8824 size_t token_length = strlen (specs);
8825 unsigned token_ops = aarch64_parse_one_option_token (specs,
8826 token_length,
8827 flags,
8828 option_name);
8829 if (!token_ops)
8830 found_flags = 0;
8832 found_flags |= token_ops;
8833 return found_flags;
8836 /* Support for overriding instruction fusion. */
8838 static void
8839 aarch64_parse_fuse_string (const char *fuse_string,
8840 struct tune_params *tune)
8842 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8843 aarch64_fusible_pairs,
8844 tune->fusible_ops,
8845 "fuse=");
8848 /* Support for overriding other tuning flags. */
8850 static void
8851 aarch64_parse_tune_string (const char *tune_string,
8852 struct tune_params *tune)
8854 tune->extra_tuning_flags
8855 = aarch64_parse_boolean_options (tune_string,
8856 aarch64_tuning_flags,
8857 tune->extra_tuning_flags,
8858 "tune=");
8861 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8862 we understand. If it is, extract the option string and handoff to
8863 the appropriate function. */
8865 void
8866 aarch64_parse_one_override_token (const char* token,
8867 size_t length,
8868 struct tune_params *tune)
8870 const struct aarch64_tuning_override_function *fn
8871 = aarch64_tuning_override_functions;
8873 const char *option_part = strchr (token, '=');
8874 if (!option_part)
8876 error ("tuning string missing in option (%s)", token);
8877 return;
8880 /* Get the length of the option name. */
8881 length = option_part - token;
8882 /* Skip the '=' to get to the option string. */
8883 option_part++;
8885 for (; fn->name != NULL; fn++)
8887 if (!strncmp (fn->name, token, length))
8889 fn->parse_override (option_part, tune);
8890 return;
8894 error ("unknown tuning option (%s)",token);
8895 return;
8898 /* A checking mechanism for the implementation of the tls size. */
8900 static void
8901 initialize_aarch64_tls_size (struct gcc_options *opts)
8903 if (aarch64_tls_size == 0)
8904 aarch64_tls_size = 24;
8906 switch (opts->x_aarch64_cmodel_var)
8908 case AARCH64_CMODEL_TINY:
8909 /* Both the default and maximum TLS size allowed under tiny is 1M which
8910 needs two instructions to address, so we clamp the size to 24. */
8911 if (aarch64_tls_size > 24)
8912 aarch64_tls_size = 24;
8913 break;
8914 case AARCH64_CMODEL_SMALL:
8915 /* The maximum TLS size allowed under small is 4G. */
8916 if (aarch64_tls_size > 32)
8917 aarch64_tls_size = 32;
8918 break;
8919 case AARCH64_CMODEL_LARGE:
8920 /* The maximum TLS size allowed under large is 16E.
8921 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8922 if (aarch64_tls_size > 48)
8923 aarch64_tls_size = 48;
8924 break;
8925 default:
8926 gcc_unreachable ();
8929 return;
8932 /* Parse STRING looking for options in the format:
8933 string :: option:string
8934 option :: name=substring
8935 name :: {a-z}
8936 substring :: defined by option. */
8938 static void
8939 aarch64_parse_override_string (const char* input_string,
8940 struct tune_params* tune)
8942 const char separator = ':';
8943 size_t string_length = strlen (input_string) + 1;
8944 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8945 char *string = string_root;
8946 strncpy (string, input_string, string_length);
8947 string[string_length - 1] = '\0';
8949 char* ntoken = string;
8951 while ((ntoken = strchr (string, separator)))
8953 size_t token_length = ntoken - string;
8954 /* Make this substring look like a string. */
8955 *ntoken = '\0';
8956 aarch64_parse_one_override_token (string, token_length, tune);
8957 string = ++ntoken;
8960 /* One last option to parse. */
8961 aarch64_parse_one_override_token (string, strlen (string), tune);
8962 free (string_root);
8966 static void
8967 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8969 /* PR 70044: We have to be careful about being called multiple times for the
8970 same function. This means all changes should be repeatable. */
8972 /* If the frame pointer is enabled, set it to a special value that behaves
8973 similar to frame pointer omission. If we don't do this all leaf functions
8974 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
8975 If flag_omit_frame_pointer has this special value, we must force the
8976 frame pointer if not in a leaf function. We also need to force it in a
8977 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
8978 if (opts->x_flag_omit_frame_pointer == 0)
8979 opts->x_flag_omit_frame_pointer = 2;
8981 /* If not optimizing for size, set the default
8982 alignment to what the target wants. */
8983 if (!opts->x_optimize_size)
8985 if (opts->x_align_loops <= 0)
8986 opts->x_align_loops = aarch64_tune_params.loop_align;
8987 if (opts->x_align_jumps <= 0)
8988 opts->x_align_jumps = aarch64_tune_params.jump_align;
8989 if (opts->x_align_functions <= 0)
8990 opts->x_align_functions = aarch64_tune_params.function_align;
8993 /* We default to no pc-relative literal loads. */
8995 aarch64_pcrelative_literal_loads = false;
8997 /* If -mpc-relative-literal-loads is set on the command line, this
8998 implies that the user asked for PC relative literal loads. */
8999 if (opts->x_pcrelative_literal_loads == 1)
9000 aarch64_pcrelative_literal_loads = true;
9002 /* In the tiny memory model it makes no sense to disallow PC relative
9003 literal pool loads. */
9004 if (aarch64_cmodel == AARCH64_CMODEL_TINY
9005 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9006 aarch64_pcrelative_literal_loads = true;
9008 /* When enabling the lower precision Newton series for the square root, also
9009 enable it for the reciprocal square root, since the latter is an
9010 intermediary step for the former. */
9011 if (flag_mlow_precision_sqrt)
9012 flag_mrecip_low_precision_sqrt = true;
9015 /* 'Unpack' up the internal tuning structs and update the options
9016 in OPTS. The caller must have set up selected_tune and selected_arch
9017 as all the other target-specific codegen decisions are
9018 derived from them. */
9020 void
9021 aarch64_override_options_internal (struct gcc_options *opts)
9023 aarch64_tune_flags = selected_tune->flags;
9024 aarch64_tune = selected_tune->sched_core;
9025 /* Make a copy of the tuning parameters attached to the core, which
9026 we may later overwrite. */
9027 aarch64_tune_params = *(selected_tune->tune);
9028 aarch64_architecture_version = selected_arch->architecture_version;
9030 if (opts->x_aarch64_override_tune_string)
9031 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9032 &aarch64_tune_params);
9034 /* This target defaults to strict volatile bitfields. */
9035 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9036 opts->x_flag_strict_volatile_bitfields = 1;
9038 initialize_aarch64_code_model (opts);
9039 initialize_aarch64_tls_size (opts);
9041 int queue_depth = 0;
9042 switch (aarch64_tune_params.autoprefetcher_model)
9044 case tune_params::AUTOPREFETCHER_OFF:
9045 queue_depth = -1;
9046 break;
9047 case tune_params::AUTOPREFETCHER_WEAK:
9048 queue_depth = 0;
9049 break;
9050 case tune_params::AUTOPREFETCHER_STRONG:
9051 queue_depth = max_insn_queue_index + 1;
9052 break;
9053 default:
9054 gcc_unreachable ();
9057 /* We don't mind passing in global_options_set here as we don't use
9058 the *options_set structs anyway. */
9059 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9060 queue_depth,
9061 opts->x_param_values,
9062 global_options_set.x_param_values);
9064 /* Set up parameters to be used in prefetching algorithm. Do not
9065 override the defaults unless we are tuning for a core we have
9066 researched values for. */
9067 if (aarch64_tune_params.prefetch->num_slots > 0)
9068 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9069 aarch64_tune_params.prefetch->num_slots,
9070 opts->x_param_values,
9071 global_options_set.x_param_values);
9072 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9073 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9074 aarch64_tune_params.prefetch->l1_cache_size,
9075 opts->x_param_values,
9076 global_options_set.x_param_values);
9077 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9078 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9079 aarch64_tune_params.prefetch->l1_cache_line_size,
9080 opts->x_param_values,
9081 global_options_set.x_param_values);
9082 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9083 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9084 aarch64_tune_params.prefetch->l2_cache_size,
9085 opts->x_param_values,
9086 global_options_set.x_param_values);
9088 /* Enable sw prefetching at specified optimization level for
9089 CPUS that have prefetch. Lower optimization level threshold by 1
9090 when profiling is enabled. */
9091 if (opts->x_flag_prefetch_loop_arrays < 0
9092 && !opts->x_optimize_size
9093 && aarch64_tune_params.prefetch->default_opt_level >= 0
9094 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9095 opts->x_flag_prefetch_loop_arrays = 1;
9097 aarch64_override_options_after_change_1 (opts);
9100 /* Print a hint with a suggestion for a core or architecture name that
9101 most closely resembles what the user passed in STR. ARCH is true if
9102 the user is asking for an architecture name. ARCH is false if the user
9103 is asking for a core name. */
9105 static void
9106 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9108 auto_vec<const char *> candidates;
9109 const struct processor *entry = arch ? all_architectures : all_cores;
9110 for (; entry->name != NULL; entry++)
9111 candidates.safe_push (entry->name);
9112 char *s;
9113 const char *hint = candidates_list_and_hint (str, s, candidates);
9114 if (hint)
9115 inform (input_location, "valid arguments are: %s;"
9116 " did you mean %qs?", s, hint);
9117 XDELETEVEC (s);
9120 /* Print a hint with a suggestion for a core name that most closely resembles
9121 what the user passed in STR. */
9123 inline static void
9124 aarch64_print_hint_for_core (const char *str)
9126 aarch64_print_hint_for_core_or_arch (str, false);
9129 /* Print a hint with a suggestion for an architecture name that most closely
9130 resembles what the user passed in STR. */
9132 inline static void
9133 aarch64_print_hint_for_arch (const char *str)
9135 aarch64_print_hint_for_core_or_arch (str, true);
9138 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9139 specified in STR and throw errors if appropriate. Put the results if
9140 they are valid in RES and ISA_FLAGS. Return whether the option is
9141 valid. */
9143 static bool
9144 aarch64_validate_mcpu (const char *str, const struct processor **res,
9145 unsigned long *isa_flags)
9147 enum aarch64_parse_opt_result parse_res
9148 = aarch64_parse_cpu (str, res, isa_flags);
9150 if (parse_res == AARCH64_PARSE_OK)
9151 return true;
9153 switch (parse_res)
9155 case AARCH64_PARSE_MISSING_ARG:
9156 error ("missing cpu name in %<-mcpu=%s%>", str);
9157 break;
9158 case AARCH64_PARSE_INVALID_ARG:
9159 error ("unknown value %qs for -mcpu", str);
9160 aarch64_print_hint_for_core (str);
9161 break;
9162 case AARCH64_PARSE_INVALID_FEATURE:
9163 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9164 break;
9165 default:
9166 gcc_unreachable ();
9169 return false;
9172 /* Validate a command-line -march option. Parse the arch and extensions
9173 (if any) specified in STR and throw errors if appropriate. Put the
9174 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9175 option is valid. */
9177 static bool
9178 aarch64_validate_march (const char *str, const struct processor **res,
9179 unsigned long *isa_flags)
9181 enum aarch64_parse_opt_result parse_res
9182 = aarch64_parse_arch (str, res, isa_flags);
9184 if (parse_res == AARCH64_PARSE_OK)
9185 return true;
9187 switch (parse_res)
9189 case AARCH64_PARSE_MISSING_ARG:
9190 error ("missing arch name in %<-march=%s%>", str);
9191 break;
9192 case AARCH64_PARSE_INVALID_ARG:
9193 error ("unknown value %qs for -march", str);
9194 aarch64_print_hint_for_arch (str);
9195 break;
9196 case AARCH64_PARSE_INVALID_FEATURE:
9197 error ("invalid feature modifier in %<-march=%s%>", str);
9198 break;
9199 default:
9200 gcc_unreachable ();
9203 return false;
9206 /* Validate a command-line -mtune option. Parse the cpu
9207 specified in STR and throw errors if appropriate. Put the
9208 result, if it is valid, in RES. Return whether the option is
9209 valid. */
9211 static bool
9212 aarch64_validate_mtune (const char *str, const struct processor **res)
9214 enum aarch64_parse_opt_result parse_res
9215 = aarch64_parse_tune (str, res);
9217 if (parse_res == AARCH64_PARSE_OK)
9218 return true;
9220 switch (parse_res)
9222 case AARCH64_PARSE_MISSING_ARG:
9223 error ("missing cpu name in %<-mtune=%s%>", str);
9224 break;
9225 case AARCH64_PARSE_INVALID_ARG:
9226 error ("unknown value %qs for -mtune", str);
9227 aarch64_print_hint_for_core (str);
9228 break;
9229 default:
9230 gcc_unreachable ();
9232 return false;
9235 /* Return the CPU corresponding to the enum CPU.
9236 If it doesn't specify a cpu, return the default. */
9238 static const struct processor *
9239 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9241 if (cpu != aarch64_none)
9242 return &all_cores[cpu];
9244 /* The & 0x3f is to extract the bottom 6 bits that encode the
9245 default cpu as selected by the --with-cpu GCC configure option
9246 in config.gcc.
9247 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9248 flags mechanism should be reworked to make it more sane. */
9249 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9252 /* Return the architecture corresponding to the enum ARCH.
9253 If it doesn't specify a valid architecture, return the default. */
9255 static const struct processor *
9256 aarch64_get_arch (enum aarch64_arch arch)
9258 if (arch != aarch64_no_arch)
9259 return &all_architectures[arch];
9261 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9263 return &all_architectures[cpu->arch];
9266 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9267 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9268 tuning structs. In particular it must set selected_tune and
9269 aarch64_isa_flags that define the available ISA features and tuning
9270 decisions. It must also set selected_arch as this will be used to
9271 output the .arch asm tags for each function. */
9273 static void
9274 aarch64_override_options (void)
9276 unsigned long cpu_isa = 0;
9277 unsigned long arch_isa = 0;
9278 aarch64_isa_flags = 0;
9280 bool valid_cpu = true;
9281 bool valid_tune = true;
9282 bool valid_arch = true;
9284 selected_cpu = NULL;
9285 selected_arch = NULL;
9286 selected_tune = NULL;
9288 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9289 If either of -march or -mtune is given, they override their
9290 respective component of -mcpu. */
9291 if (aarch64_cpu_string)
9292 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9293 &cpu_isa);
9295 if (aarch64_arch_string)
9296 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9297 &arch_isa);
9299 if (aarch64_tune_string)
9300 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9302 /* If the user did not specify a processor, choose the default
9303 one for them. This will be the CPU set during configuration using
9304 --with-cpu, otherwise it is "generic". */
9305 if (!selected_cpu)
9307 if (selected_arch)
9309 selected_cpu = &all_cores[selected_arch->ident];
9310 aarch64_isa_flags = arch_isa;
9311 explicit_arch = selected_arch->arch;
9313 else
9315 /* Get default configure-time CPU. */
9316 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9317 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9320 if (selected_tune)
9321 explicit_tune_core = selected_tune->ident;
9323 /* If both -mcpu and -march are specified check that they are architecturally
9324 compatible, warn if they're not and prefer the -march ISA flags. */
9325 else if (selected_arch)
9327 if (selected_arch->arch != selected_cpu->arch)
9329 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9330 all_architectures[selected_cpu->arch].name,
9331 selected_arch->name);
9333 aarch64_isa_flags = arch_isa;
9334 explicit_arch = selected_arch->arch;
9335 explicit_tune_core = selected_tune ? selected_tune->ident
9336 : selected_cpu->ident;
9338 else
9340 /* -mcpu but no -march. */
9341 aarch64_isa_flags = cpu_isa;
9342 explicit_tune_core = selected_tune ? selected_tune->ident
9343 : selected_cpu->ident;
9344 gcc_assert (selected_cpu);
9345 selected_arch = &all_architectures[selected_cpu->arch];
9346 explicit_arch = selected_arch->arch;
9349 /* Set the arch as well as we will need it when outputing
9350 the .arch directive in assembly. */
9351 if (!selected_arch)
9353 gcc_assert (selected_cpu);
9354 selected_arch = &all_architectures[selected_cpu->arch];
9357 if (!selected_tune)
9358 selected_tune = selected_cpu;
9360 #ifndef HAVE_AS_MABI_OPTION
9361 /* The compiler may have been configured with 2.23.* binutils, which does
9362 not have support for ILP32. */
9363 if (TARGET_ILP32)
9364 error ("Assembler does not support -mabi=ilp32");
9365 #endif
9367 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9368 sorry ("Return address signing is only supported for -mabi=lp64");
9370 /* Make sure we properly set up the explicit options. */
9371 if ((aarch64_cpu_string && valid_cpu)
9372 || (aarch64_tune_string && valid_tune))
9373 gcc_assert (explicit_tune_core != aarch64_none);
9375 if ((aarch64_cpu_string && valid_cpu)
9376 || (aarch64_arch_string && valid_arch))
9377 gcc_assert (explicit_arch != aarch64_no_arch);
9379 aarch64_override_options_internal (&global_options);
9381 /* Save these options as the default ones in case we push and pop them later
9382 while processing functions with potential target attributes. */
9383 target_option_default_node = target_option_current_node
9384 = build_target_option_node (&global_options);
9387 /* Implement targetm.override_options_after_change. */
9389 static void
9390 aarch64_override_options_after_change (void)
9392 aarch64_override_options_after_change_1 (&global_options);
9395 static struct machine_function *
9396 aarch64_init_machine_status (void)
9398 struct machine_function *machine;
9399 machine = ggc_cleared_alloc<machine_function> ();
9400 return machine;
9403 void
9404 aarch64_init_expanders (void)
9406 init_machine_status = aarch64_init_machine_status;
9409 /* A checking mechanism for the implementation of the various code models. */
9410 static void
9411 initialize_aarch64_code_model (struct gcc_options *opts)
9413 if (opts->x_flag_pic)
9415 switch (opts->x_aarch64_cmodel_var)
9417 case AARCH64_CMODEL_TINY:
9418 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9419 break;
9420 case AARCH64_CMODEL_SMALL:
9421 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9422 aarch64_cmodel = (flag_pic == 2
9423 ? AARCH64_CMODEL_SMALL_PIC
9424 : AARCH64_CMODEL_SMALL_SPIC);
9425 #else
9426 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9427 #endif
9428 break;
9429 case AARCH64_CMODEL_LARGE:
9430 sorry ("code model %qs with -f%s", "large",
9431 opts->x_flag_pic > 1 ? "PIC" : "pic");
9432 break;
9433 default:
9434 gcc_unreachable ();
9437 else
9438 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9441 /* Implement TARGET_OPTION_SAVE. */
9443 static void
9444 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9446 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9449 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9450 using the information saved in PTR. */
9452 static void
9453 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9455 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9456 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9457 opts->x_explicit_arch = ptr->x_explicit_arch;
9458 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9459 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9461 aarch64_override_options_internal (opts);
9464 /* Implement TARGET_OPTION_PRINT. */
9466 static void
9467 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9469 const struct processor *cpu
9470 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9471 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9472 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9473 std::string extension
9474 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9476 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9477 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9478 arch->name, extension.c_str ());
9481 static GTY(()) tree aarch64_previous_fndecl;
9483 void
9484 aarch64_reset_previous_fndecl (void)
9486 aarch64_previous_fndecl = NULL;
9489 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9490 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9491 make sure optab availability predicates are recomputed when necessary. */
9493 void
9494 aarch64_save_restore_target_globals (tree new_tree)
9496 if (TREE_TARGET_GLOBALS (new_tree))
9497 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9498 else if (new_tree == target_option_default_node)
9499 restore_target_globals (&default_target_globals);
9500 else
9501 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9504 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9505 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9506 of the function, if such exists. This function may be called multiple
9507 times on a single function so use aarch64_previous_fndecl to avoid
9508 setting up identical state. */
9510 static void
9511 aarch64_set_current_function (tree fndecl)
9513 if (!fndecl || fndecl == aarch64_previous_fndecl)
9514 return;
9516 tree old_tree = (aarch64_previous_fndecl
9517 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9518 : NULL_TREE);
9520 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9522 /* If current function has no attributes but the previous one did,
9523 use the default node. */
9524 if (!new_tree && old_tree)
9525 new_tree = target_option_default_node;
9527 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9528 the default have been handled by aarch64_save_restore_target_globals from
9529 aarch64_pragma_target_parse. */
9530 if (old_tree == new_tree)
9531 return;
9533 aarch64_previous_fndecl = fndecl;
9535 /* First set the target options. */
9536 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9538 aarch64_save_restore_target_globals (new_tree);
9541 /* Enum describing the various ways we can handle attributes.
9542 In many cases we can reuse the generic option handling machinery. */
9544 enum aarch64_attr_opt_type
9546 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9547 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9548 aarch64_attr_enum, /* Attribute sets an enum variable. */
9549 aarch64_attr_custom /* Attribute requires a custom handling function. */
9552 /* All the information needed to handle a target attribute.
9553 NAME is the name of the attribute.
9554 ATTR_TYPE specifies the type of behavior of the attribute as described
9555 in the definition of enum aarch64_attr_opt_type.
9556 ALLOW_NEG is true if the attribute supports a "no-" form.
9557 HANDLER is the function that takes the attribute string and whether
9558 it is a pragma or attribute and handles the option. It is needed only
9559 when the ATTR_TYPE is aarch64_attr_custom.
9560 OPT_NUM is the enum specifying the option that the attribute modifies.
9561 This is needed for attributes that mirror the behavior of a command-line
9562 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9563 aarch64_attr_enum. */
9565 struct aarch64_attribute_info
9567 const char *name;
9568 enum aarch64_attr_opt_type attr_type;
9569 bool allow_neg;
9570 bool (*handler) (const char *, const char *);
9571 enum opt_code opt_num;
9574 /* Handle the ARCH_STR argument to the arch= target attribute.
9575 PRAGMA_OR_ATTR is used in potential error messages. */
9577 static bool
9578 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9580 const struct processor *tmp_arch = NULL;
9581 enum aarch64_parse_opt_result parse_res
9582 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9584 if (parse_res == AARCH64_PARSE_OK)
9586 gcc_assert (tmp_arch);
9587 selected_arch = tmp_arch;
9588 explicit_arch = selected_arch->arch;
9589 return true;
9592 switch (parse_res)
9594 case AARCH64_PARSE_MISSING_ARG:
9595 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9596 break;
9597 case AARCH64_PARSE_INVALID_ARG:
9598 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9599 aarch64_print_hint_for_arch (str);
9600 break;
9601 case AARCH64_PARSE_INVALID_FEATURE:
9602 error ("invalid feature modifier %qs for 'arch' target %s",
9603 str, pragma_or_attr);
9604 break;
9605 default:
9606 gcc_unreachable ();
9609 return false;
9612 /* Handle the argument CPU_STR to the cpu= target attribute.
9613 PRAGMA_OR_ATTR is used in potential error messages. */
9615 static bool
9616 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9618 const struct processor *tmp_cpu = NULL;
9619 enum aarch64_parse_opt_result parse_res
9620 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9622 if (parse_res == AARCH64_PARSE_OK)
9624 gcc_assert (tmp_cpu);
9625 selected_tune = tmp_cpu;
9626 explicit_tune_core = selected_tune->ident;
9628 selected_arch = &all_architectures[tmp_cpu->arch];
9629 explicit_arch = selected_arch->arch;
9630 return true;
9633 switch (parse_res)
9635 case AARCH64_PARSE_MISSING_ARG:
9636 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9637 break;
9638 case AARCH64_PARSE_INVALID_ARG:
9639 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9640 aarch64_print_hint_for_core (str);
9641 break;
9642 case AARCH64_PARSE_INVALID_FEATURE:
9643 error ("invalid feature modifier %qs for 'cpu' target %s",
9644 str, pragma_or_attr);
9645 break;
9646 default:
9647 gcc_unreachable ();
9650 return false;
9653 /* Handle the argument STR to the tune= target attribute.
9654 PRAGMA_OR_ATTR is used in potential error messages. */
9656 static bool
9657 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9659 const struct processor *tmp_tune = NULL;
9660 enum aarch64_parse_opt_result parse_res
9661 = aarch64_parse_tune (str, &tmp_tune);
9663 if (parse_res == AARCH64_PARSE_OK)
9665 gcc_assert (tmp_tune);
9666 selected_tune = tmp_tune;
9667 explicit_tune_core = selected_tune->ident;
9668 return true;
9671 switch (parse_res)
9673 case AARCH64_PARSE_INVALID_ARG:
9674 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9675 aarch64_print_hint_for_core (str);
9676 break;
9677 default:
9678 gcc_unreachable ();
9681 return false;
9684 /* Parse an architecture extensions target attribute string specified in STR.
9685 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9686 if successful. Update aarch64_isa_flags to reflect the ISA features
9687 modified.
9688 PRAGMA_OR_ATTR is used in potential error messages. */
9690 static bool
9691 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9693 enum aarch64_parse_opt_result parse_res;
9694 unsigned long isa_flags = aarch64_isa_flags;
9696 /* We allow "+nothing" in the beginning to clear out all architectural
9697 features if the user wants to handpick specific features. */
9698 if (strncmp ("+nothing", str, 8) == 0)
9700 isa_flags = 0;
9701 str += 8;
9704 parse_res = aarch64_parse_extension (str, &isa_flags);
9706 if (parse_res == AARCH64_PARSE_OK)
9708 aarch64_isa_flags = isa_flags;
9709 return true;
9712 switch (parse_res)
9714 case AARCH64_PARSE_MISSING_ARG:
9715 error ("missing feature modifier in target %s %qs",
9716 pragma_or_attr, str);
9717 break;
9719 case AARCH64_PARSE_INVALID_FEATURE:
9720 error ("invalid feature modifier in target %s %qs",
9721 pragma_or_attr, str);
9722 break;
9724 default:
9725 gcc_unreachable ();
9728 return false;
9731 /* The target attributes that we support. On top of these we also support just
9732 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9733 handled explicitly in aarch64_process_one_target_attr. */
9735 static const struct aarch64_attribute_info aarch64_attributes[] =
9737 { "general-regs-only", aarch64_attr_mask, false, NULL,
9738 OPT_mgeneral_regs_only },
9739 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9740 OPT_mfix_cortex_a53_835769 },
9741 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9742 OPT_mfix_cortex_a53_843419 },
9743 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9744 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9745 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9746 OPT_momit_leaf_frame_pointer },
9747 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9748 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9749 OPT_march_ },
9750 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9751 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9752 OPT_mtune_ },
9753 { "sign-return-address", aarch64_attr_enum, false, NULL,
9754 OPT_msign_return_address_ },
9755 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9758 /* Parse ARG_STR which contains the definition of one target attribute.
9759 Show appropriate errors if any or return true if the attribute is valid.
9760 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9761 we're processing a target attribute or pragma. */
9763 static bool
9764 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9766 bool invert = false;
9768 size_t len = strlen (arg_str);
9770 if (len == 0)
9772 error ("malformed target %s", pragma_or_attr);
9773 return false;
9776 char *str_to_check = (char *) alloca (len + 1);
9777 strcpy (str_to_check, arg_str);
9779 /* Skip leading whitespace. */
9780 while (*str_to_check == ' ' || *str_to_check == '\t')
9781 str_to_check++;
9783 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9784 It is easier to detect and handle it explicitly here rather than going
9785 through the machinery for the rest of the target attributes in this
9786 function. */
9787 if (*str_to_check == '+')
9788 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9790 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9792 invert = true;
9793 str_to_check += 3;
9795 char *arg = strchr (str_to_check, '=');
9797 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9798 and point ARG to "foo". */
9799 if (arg)
9801 *arg = '\0';
9802 arg++;
9804 const struct aarch64_attribute_info *p_attr;
9805 bool found = false;
9806 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9808 /* If the names don't match up, or the user has given an argument
9809 to an attribute that doesn't accept one, or didn't give an argument
9810 to an attribute that expects one, fail to match. */
9811 if (strcmp (str_to_check, p_attr->name) != 0)
9812 continue;
9814 found = true;
9815 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9816 || p_attr->attr_type == aarch64_attr_enum;
9818 if (attr_need_arg_p ^ (arg != NULL))
9820 error ("target %s %qs does not accept an argument",
9821 pragma_or_attr, str_to_check);
9822 return false;
9825 /* If the name matches but the attribute does not allow "no-" versions
9826 then we can't match. */
9827 if (invert && !p_attr->allow_neg)
9829 error ("target %s %qs does not allow a negated form",
9830 pragma_or_attr, str_to_check);
9831 return false;
9834 switch (p_attr->attr_type)
9836 /* Has a custom handler registered.
9837 For example, cpu=, arch=, tune=. */
9838 case aarch64_attr_custom:
9839 gcc_assert (p_attr->handler);
9840 if (!p_attr->handler (arg, pragma_or_attr))
9841 return false;
9842 break;
9844 /* Either set or unset a boolean option. */
9845 case aarch64_attr_bool:
9847 struct cl_decoded_option decoded;
9849 generate_option (p_attr->opt_num, NULL, !invert,
9850 CL_TARGET, &decoded);
9851 aarch64_handle_option (&global_options, &global_options_set,
9852 &decoded, input_location);
9853 break;
9855 /* Set or unset a bit in the target_flags. aarch64_handle_option
9856 should know what mask to apply given the option number. */
9857 case aarch64_attr_mask:
9859 struct cl_decoded_option decoded;
9860 /* We only need to specify the option number.
9861 aarch64_handle_option will know which mask to apply. */
9862 decoded.opt_index = p_attr->opt_num;
9863 decoded.value = !invert;
9864 aarch64_handle_option (&global_options, &global_options_set,
9865 &decoded, input_location);
9866 break;
9868 /* Use the option setting machinery to set an option to an enum. */
9869 case aarch64_attr_enum:
9871 gcc_assert (arg);
9872 bool valid;
9873 int value;
9874 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9875 &value, CL_TARGET);
9876 if (valid)
9878 set_option (&global_options, NULL, p_attr->opt_num, value,
9879 NULL, DK_UNSPECIFIED, input_location,
9880 global_dc);
9882 else
9884 error ("target %s %s=%s is not valid",
9885 pragma_or_attr, str_to_check, arg);
9887 break;
9889 default:
9890 gcc_unreachable ();
9894 /* If we reached here we either have found an attribute and validated
9895 it or didn't match any. If we matched an attribute but its arguments
9896 were malformed we will have returned false already. */
9897 return found;
9900 /* Count how many times the character C appears in
9901 NULL-terminated string STR. */
9903 static unsigned int
9904 num_occurences_in_str (char c, char *str)
9906 unsigned int res = 0;
9907 while (*str != '\0')
9909 if (*str == c)
9910 res++;
9912 str++;
9915 return res;
9918 /* Parse the tree in ARGS that contains the target attribute information
9919 and update the global target options space. PRAGMA_OR_ATTR is a string
9920 to be used in error messages, specifying whether this is processing
9921 a target attribute or a target pragma. */
9923 bool
9924 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9926 if (TREE_CODE (args) == TREE_LIST)
9930 tree head = TREE_VALUE (args);
9931 if (head)
9933 if (!aarch64_process_target_attr (head, pragma_or_attr))
9934 return false;
9936 args = TREE_CHAIN (args);
9937 } while (args);
9939 return true;
9942 if (TREE_CODE (args) != STRING_CST)
9944 error ("attribute %<target%> argument not a string");
9945 return false;
9948 size_t len = strlen (TREE_STRING_POINTER (args));
9949 char *str_to_check = (char *) alloca (len + 1);
9950 strcpy (str_to_check, TREE_STRING_POINTER (args));
9952 if (len == 0)
9954 error ("malformed target %s value", pragma_or_attr);
9955 return false;
9958 /* Used to catch empty spaces between commas i.e.
9959 attribute ((target ("attr1,,attr2"))). */
9960 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9962 /* Handle multiple target attributes separated by ','. */
9963 char *token = strtok (str_to_check, ",");
9965 unsigned int num_attrs = 0;
9966 while (token)
9968 num_attrs++;
9969 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9971 error ("target %s %qs is invalid", pragma_or_attr, token);
9972 return false;
9975 token = strtok (NULL, ",");
9978 if (num_attrs != num_commas + 1)
9980 error ("malformed target %s list %qs",
9981 pragma_or_attr, TREE_STRING_POINTER (args));
9982 return false;
9985 return true;
9988 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9989 process attribute ((target ("..."))). */
9991 static bool
9992 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9994 struct cl_target_option cur_target;
9995 bool ret;
9996 tree old_optimize;
9997 tree new_target, new_optimize;
9998 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10000 /* If what we're processing is the current pragma string then the
10001 target option node is already stored in target_option_current_node
10002 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
10003 having to re-parse the string. This is especially useful to keep
10004 arm_neon.h compile times down since that header contains a lot
10005 of intrinsics enclosed in pragmas. */
10006 if (!existing_target && args == current_target_pragma)
10008 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10009 return true;
10011 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10013 old_optimize = build_optimization_node (&global_options);
10014 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10016 /* If the function changed the optimization levels as well as setting
10017 target options, start with the optimizations specified. */
10018 if (func_optimize && func_optimize != old_optimize)
10019 cl_optimization_restore (&global_options,
10020 TREE_OPTIMIZATION (func_optimize));
10022 /* Save the current target options to restore at the end. */
10023 cl_target_option_save (&cur_target, &global_options);
10025 /* If fndecl already has some target attributes applied to it, unpack
10026 them so that we add this attribute on top of them, rather than
10027 overwriting them. */
10028 if (existing_target)
10030 struct cl_target_option *existing_options
10031 = TREE_TARGET_OPTION (existing_target);
10033 if (existing_options)
10034 cl_target_option_restore (&global_options, existing_options);
10036 else
10037 cl_target_option_restore (&global_options,
10038 TREE_TARGET_OPTION (target_option_current_node));
10041 ret = aarch64_process_target_attr (args, "attribute");
10043 /* Set up any additional state. */
10044 if (ret)
10046 aarch64_override_options_internal (&global_options);
10047 /* Initialize SIMD builtins if we haven't already.
10048 Set current_target_pragma to NULL for the duration so that
10049 the builtin initialization code doesn't try to tag the functions
10050 being built with the attributes specified by any current pragma, thus
10051 going into an infinite recursion. */
10052 if (TARGET_SIMD)
10054 tree saved_current_target_pragma = current_target_pragma;
10055 current_target_pragma = NULL;
10056 aarch64_init_simd_builtins ();
10057 current_target_pragma = saved_current_target_pragma;
10059 new_target = build_target_option_node (&global_options);
10061 else
10062 new_target = NULL;
10064 new_optimize = build_optimization_node (&global_options);
10066 if (fndecl && ret)
10068 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10070 if (old_optimize != new_optimize)
10071 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10074 cl_target_option_restore (&global_options, &cur_target);
10076 if (old_optimize != new_optimize)
10077 cl_optimization_restore (&global_options,
10078 TREE_OPTIMIZATION (old_optimize));
10079 return ret;
10082 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10083 tri-bool options (yes, no, don't care) and the default value is
10084 DEF, determine whether to reject inlining. */
10086 static bool
10087 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10088 int dont_care, int def)
10090 /* If the callee doesn't care, always allow inlining. */
10091 if (callee == dont_care)
10092 return true;
10094 /* If the caller doesn't care, always allow inlining. */
10095 if (caller == dont_care)
10096 return true;
10098 /* Otherwise, allow inlining if either the callee and caller values
10099 agree, or if the callee is using the default value. */
10100 return (callee == caller || callee == def);
10103 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10104 to inline CALLEE into CALLER based on target-specific info.
10105 Make sure that the caller and callee have compatible architectural
10106 features. Then go through the other possible target attributes
10107 and see if they can block inlining. Try not to reject always_inline
10108 callees unless they are incompatible architecturally. */
10110 static bool
10111 aarch64_can_inline_p (tree caller, tree callee)
10113 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10114 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10116 /* If callee has no option attributes, then it is ok to inline. */
10117 if (!callee_tree)
10118 return true;
10120 struct cl_target_option *caller_opts
10121 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10122 : target_option_default_node);
10124 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10127 /* Callee's ISA flags should be a subset of the caller's. */
10128 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10129 != callee_opts->x_aarch64_isa_flags)
10130 return false;
10132 /* Allow non-strict aligned functions inlining into strict
10133 aligned ones. */
10134 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10135 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10136 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10137 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10138 return false;
10140 bool always_inline = lookup_attribute ("always_inline",
10141 DECL_ATTRIBUTES (callee));
10143 /* If the architectural features match up and the callee is always_inline
10144 then the other attributes don't matter. */
10145 if (always_inline)
10146 return true;
10148 if (caller_opts->x_aarch64_cmodel_var
10149 != callee_opts->x_aarch64_cmodel_var)
10150 return false;
10152 if (caller_opts->x_aarch64_tls_dialect
10153 != callee_opts->x_aarch64_tls_dialect)
10154 return false;
10156 /* Honour explicit requests to workaround errata. */
10157 if (!aarch64_tribools_ok_for_inlining_p (
10158 caller_opts->x_aarch64_fix_a53_err835769,
10159 callee_opts->x_aarch64_fix_a53_err835769,
10160 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10161 return false;
10163 if (!aarch64_tribools_ok_for_inlining_p (
10164 caller_opts->x_aarch64_fix_a53_err843419,
10165 callee_opts->x_aarch64_fix_a53_err843419,
10166 2, TARGET_FIX_ERR_A53_843419))
10167 return false;
10169 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10170 caller and calle and they don't match up, reject inlining. */
10171 if (!aarch64_tribools_ok_for_inlining_p (
10172 caller_opts->x_flag_omit_leaf_frame_pointer,
10173 callee_opts->x_flag_omit_leaf_frame_pointer,
10174 2, 1))
10175 return false;
10177 /* If the callee has specific tuning overrides, respect them. */
10178 if (callee_opts->x_aarch64_override_tune_string != NULL
10179 && caller_opts->x_aarch64_override_tune_string == NULL)
10180 return false;
10182 /* If the user specified tuning override strings for the
10183 caller and callee and they don't match up, reject inlining.
10184 We just do a string compare here, we don't analyze the meaning
10185 of the string, as it would be too costly for little gain. */
10186 if (callee_opts->x_aarch64_override_tune_string
10187 && caller_opts->x_aarch64_override_tune_string
10188 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10189 caller_opts->x_aarch64_override_tune_string) != 0))
10190 return false;
10192 return true;
10195 /* Return true if SYMBOL_REF X binds locally. */
10197 static bool
10198 aarch64_symbol_binds_local_p (const_rtx x)
10200 return (SYMBOL_REF_DECL (x)
10201 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10202 : SYMBOL_REF_LOCAL_P (x));
10205 /* Return true if SYMBOL_REF X is thread local */
10206 static bool
10207 aarch64_tls_symbol_p (rtx x)
10209 if (! TARGET_HAVE_TLS)
10210 return false;
10212 if (GET_CODE (x) != SYMBOL_REF)
10213 return false;
10215 return SYMBOL_REF_TLS_MODEL (x) != 0;
10218 /* Classify a TLS symbol into one of the TLS kinds. */
10219 enum aarch64_symbol_type
10220 aarch64_classify_tls_symbol (rtx x)
10222 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10224 switch (tls_kind)
10226 case TLS_MODEL_GLOBAL_DYNAMIC:
10227 case TLS_MODEL_LOCAL_DYNAMIC:
10228 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10230 case TLS_MODEL_INITIAL_EXEC:
10231 switch (aarch64_cmodel)
10233 case AARCH64_CMODEL_TINY:
10234 case AARCH64_CMODEL_TINY_PIC:
10235 return SYMBOL_TINY_TLSIE;
10236 default:
10237 return SYMBOL_SMALL_TLSIE;
10240 case TLS_MODEL_LOCAL_EXEC:
10241 if (aarch64_tls_size == 12)
10242 return SYMBOL_TLSLE12;
10243 else if (aarch64_tls_size == 24)
10244 return SYMBOL_TLSLE24;
10245 else if (aarch64_tls_size == 32)
10246 return SYMBOL_TLSLE32;
10247 else if (aarch64_tls_size == 48)
10248 return SYMBOL_TLSLE48;
10249 else
10250 gcc_unreachable ();
10252 case TLS_MODEL_EMULATED:
10253 case TLS_MODEL_NONE:
10254 return SYMBOL_FORCE_TO_MEM;
10256 default:
10257 gcc_unreachable ();
10261 /* Return the method that should be used to access SYMBOL_REF or
10262 LABEL_REF X. */
10264 enum aarch64_symbol_type
10265 aarch64_classify_symbol (rtx x, rtx offset)
10267 if (GET_CODE (x) == LABEL_REF)
10269 switch (aarch64_cmodel)
10271 case AARCH64_CMODEL_LARGE:
10272 return SYMBOL_FORCE_TO_MEM;
10274 case AARCH64_CMODEL_TINY_PIC:
10275 case AARCH64_CMODEL_TINY:
10276 return SYMBOL_TINY_ABSOLUTE;
10278 case AARCH64_CMODEL_SMALL_SPIC:
10279 case AARCH64_CMODEL_SMALL_PIC:
10280 case AARCH64_CMODEL_SMALL:
10281 return SYMBOL_SMALL_ABSOLUTE;
10283 default:
10284 gcc_unreachable ();
10288 if (GET_CODE (x) == SYMBOL_REF)
10290 if (aarch64_tls_symbol_p (x))
10291 return aarch64_classify_tls_symbol (x);
10293 switch (aarch64_cmodel)
10295 case AARCH64_CMODEL_TINY:
10296 /* When we retrieve symbol + offset address, we have to make sure
10297 the offset does not cause overflow of the final address. But
10298 we have no way of knowing the address of symbol at compile time
10299 so we can't accurately say if the distance between the PC and
10300 symbol + offset is outside the addressible range of +/-1M in the
10301 TINY code model. So we rely on images not being greater than
10302 1M and cap the offset at 1M and anything beyond 1M will have to
10303 be loaded using an alternative mechanism. Furthermore if the
10304 symbol is a weak reference to something that isn't known to
10305 resolve to a symbol in this module, then force to memory. */
10306 if ((SYMBOL_REF_WEAK (x)
10307 && !aarch64_symbol_binds_local_p (x))
10308 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10309 return SYMBOL_FORCE_TO_MEM;
10310 return SYMBOL_TINY_ABSOLUTE;
10312 case AARCH64_CMODEL_SMALL:
10313 /* Same reasoning as the tiny code model, but the offset cap here is
10314 4G. */
10315 if ((SYMBOL_REF_WEAK (x)
10316 && !aarch64_symbol_binds_local_p (x))
10317 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10318 HOST_WIDE_INT_C (4294967264)))
10319 return SYMBOL_FORCE_TO_MEM;
10320 return SYMBOL_SMALL_ABSOLUTE;
10322 case AARCH64_CMODEL_TINY_PIC:
10323 if (!aarch64_symbol_binds_local_p (x))
10324 return SYMBOL_TINY_GOT;
10325 return SYMBOL_TINY_ABSOLUTE;
10327 case AARCH64_CMODEL_SMALL_SPIC:
10328 case AARCH64_CMODEL_SMALL_PIC:
10329 if (!aarch64_symbol_binds_local_p (x))
10330 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10331 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10332 return SYMBOL_SMALL_ABSOLUTE;
10334 case AARCH64_CMODEL_LARGE:
10335 /* This is alright even in PIC code as the constant
10336 pool reference is always PC relative and within
10337 the same translation unit. */
10338 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10339 return SYMBOL_SMALL_ABSOLUTE;
10340 else
10341 return SYMBOL_FORCE_TO_MEM;
10343 default:
10344 gcc_unreachable ();
10348 /* By default push everything into the constant pool. */
10349 return SYMBOL_FORCE_TO_MEM;
10352 bool
10353 aarch64_constant_address_p (rtx x)
10355 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10358 bool
10359 aarch64_legitimate_pic_operand_p (rtx x)
10361 if (GET_CODE (x) == SYMBOL_REF
10362 || (GET_CODE (x) == CONST
10363 && GET_CODE (XEXP (x, 0)) == PLUS
10364 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10365 return false;
10367 return true;
10370 /* Return true if X holds either a quarter-precision or
10371 floating-point +0.0 constant. */
10372 static bool
10373 aarch64_valid_floating_const (rtx x)
10375 if (!CONST_DOUBLE_P (x))
10376 return false;
10378 /* This call determines which constants can be used in mov<mode>
10379 as integer moves instead of constant loads. */
10380 if (aarch64_float_const_rtx_p (x))
10381 return true;
10383 return aarch64_float_const_representable_p (x);
10386 static bool
10387 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10389 /* Do not allow vector struct mode constants. We could support
10390 0 and -1 easily, but they need support in aarch64-simd.md. */
10391 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10392 return false;
10394 /* For these cases we never want to use a literal load.
10395 As such we have to prevent the compiler from forcing these
10396 to memory. */
10397 if ((GET_CODE (x) == CONST_VECTOR
10398 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10399 || CONST_INT_P (x)
10400 || aarch64_valid_floating_const (x)
10401 || aarch64_can_const_movi_rtx_p (x, mode)
10402 || aarch64_float_const_rtx_p (x))
10403 return !targetm.cannot_force_const_mem (mode, x);
10405 if (GET_CODE (x) == HIGH
10406 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10407 return true;
10409 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10410 so spilling them is better than rematerialization. */
10411 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10412 return true;
10414 return aarch64_constant_address_p (x);
10418 aarch64_load_tp (rtx target)
10420 if (!target
10421 || GET_MODE (target) != Pmode
10422 || !register_operand (target, Pmode))
10423 target = gen_reg_rtx (Pmode);
10425 /* Can return in any reg. */
10426 emit_insn (gen_aarch64_load_tp_hard (target));
10427 return target;
10430 /* On AAPCS systems, this is the "struct __va_list". */
10431 static GTY(()) tree va_list_type;
10433 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10434 Return the type to use as __builtin_va_list.
10436 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10438 struct __va_list
10440 void *__stack;
10441 void *__gr_top;
10442 void *__vr_top;
10443 int __gr_offs;
10444 int __vr_offs;
10445 }; */
10447 static tree
10448 aarch64_build_builtin_va_list (void)
10450 tree va_list_name;
10451 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10453 /* Create the type. */
10454 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10455 /* Give it the required name. */
10456 va_list_name = build_decl (BUILTINS_LOCATION,
10457 TYPE_DECL,
10458 get_identifier ("__va_list"),
10459 va_list_type);
10460 DECL_ARTIFICIAL (va_list_name) = 1;
10461 TYPE_NAME (va_list_type) = va_list_name;
10462 TYPE_STUB_DECL (va_list_type) = va_list_name;
10464 /* Create the fields. */
10465 f_stack = build_decl (BUILTINS_LOCATION,
10466 FIELD_DECL, get_identifier ("__stack"),
10467 ptr_type_node);
10468 f_grtop = build_decl (BUILTINS_LOCATION,
10469 FIELD_DECL, get_identifier ("__gr_top"),
10470 ptr_type_node);
10471 f_vrtop = build_decl (BUILTINS_LOCATION,
10472 FIELD_DECL, get_identifier ("__vr_top"),
10473 ptr_type_node);
10474 f_groff = build_decl (BUILTINS_LOCATION,
10475 FIELD_DECL, get_identifier ("__gr_offs"),
10476 integer_type_node);
10477 f_vroff = build_decl (BUILTINS_LOCATION,
10478 FIELD_DECL, get_identifier ("__vr_offs"),
10479 integer_type_node);
10481 /* Tell tree-stdarg pass about our internal offset fields.
10482 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10483 purpose to identify whether the code is updating va_list internal
10484 offset fields through irregular way. */
10485 va_list_gpr_counter_field = f_groff;
10486 va_list_fpr_counter_field = f_vroff;
10488 DECL_ARTIFICIAL (f_stack) = 1;
10489 DECL_ARTIFICIAL (f_grtop) = 1;
10490 DECL_ARTIFICIAL (f_vrtop) = 1;
10491 DECL_ARTIFICIAL (f_groff) = 1;
10492 DECL_ARTIFICIAL (f_vroff) = 1;
10494 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10495 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10496 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10497 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10498 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10500 TYPE_FIELDS (va_list_type) = f_stack;
10501 DECL_CHAIN (f_stack) = f_grtop;
10502 DECL_CHAIN (f_grtop) = f_vrtop;
10503 DECL_CHAIN (f_vrtop) = f_groff;
10504 DECL_CHAIN (f_groff) = f_vroff;
10506 /* Compute its layout. */
10507 layout_type (va_list_type);
10509 return va_list_type;
10512 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10513 static void
10514 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10516 const CUMULATIVE_ARGS *cum;
10517 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10518 tree stack, grtop, vrtop, groff, vroff;
10519 tree t;
10520 int gr_save_area_size = cfun->va_list_gpr_size;
10521 int vr_save_area_size = cfun->va_list_fpr_size;
10522 int vr_offset;
10524 cum = &crtl->args.info;
10525 if (cfun->va_list_gpr_size)
10526 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10527 cfun->va_list_gpr_size);
10528 if (cfun->va_list_fpr_size)
10529 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10530 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10532 if (!TARGET_FLOAT)
10534 gcc_assert (cum->aapcs_nvrn == 0);
10535 vr_save_area_size = 0;
10538 f_stack = TYPE_FIELDS (va_list_type_node);
10539 f_grtop = DECL_CHAIN (f_stack);
10540 f_vrtop = DECL_CHAIN (f_grtop);
10541 f_groff = DECL_CHAIN (f_vrtop);
10542 f_vroff = DECL_CHAIN (f_groff);
10544 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10545 NULL_TREE);
10546 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10547 NULL_TREE);
10548 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10549 NULL_TREE);
10550 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10551 NULL_TREE);
10552 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10553 NULL_TREE);
10555 /* Emit code to initialize STACK, which points to the next varargs stack
10556 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10557 by named arguments. STACK is 8-byte aligned. */
10558 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10559 if (cum->aapcs_stack_size > 0)
10560 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10561 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10562 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10564 /* Emit code to initialize GRTOP, the top of the GR save area.
10565 virtual_incoming_args_rtx should have been 16 byte aligned. */
10566 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10567 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10568 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10570 /* Emit code to initialize VRTOP, the top of the VR save area.
10571 This address is gr_save_area_bytes below GRTOP, rounded
10572 down to the next 16-byte boundary. */
10573 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10574 vr_offset = ROUND_UP (gr_save_area_size,
10575 STACK_BOUNDARY / BITS_PER_UNIT);
10577 if (vr_offset)
10578 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10579 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10580 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10582 /* Emit code to initialize GROFF, the offset from GRTOP of the
10583 next GPR argument. */
10584 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10585 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10586 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10588 /* Likewise emit code to initialize VROFF, the offset from FTOP
10589 of the next VR argument. */
10590 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10591 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10592 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10595 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10597 static tree
10598 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10599 gimple_seq *post_p ATTRIBUTE_UNUSED)
10601 tree addr;
10602 bool indirect_p;
10603 bool is_ha; /* is HFA or HVA. */
10604 bool dw_align; /* double-word align. */
10605 machine_mode ag_mode = VOIDmode;
10606 int nregs;
10607 machine_mode mode;
10609 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10610 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10611 HOST_WIDE_INT size, rsize, adjust, align;
10612 tree t, u, cond1, cond2;
10614 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10615 if (indirect_p)
10616 type = build_pointer_type (type);
10618 mode = TYPE_MODE (type);
10620 f_stack = TYPE_FIELDS (va_list_type_node);
10621 f_grtop = DECL_CHAIN (f_stack);
10622 f_vrtop = DECL_CHAIN (f_grtop);
10623 f_groff = DECL_CHAIN (f_vrtop);
10624 f_vroff = DECL_CHAIN (f_groff);
10626 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10627 f_stack, NULL_TREE);
10628 size = int_size_in_bytes (type);
10629 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10631 dw_align = false;
10632 adjust = 0;
10633 if (aarch64_vfp_is_call_or_return_candidate (mode,
10634 type,
10635 &ag_mode,
10636 &nregs,
10637 &is_ha))
10639 /* TYPE passed in fp/simd registers. */
10640 if (!TARGET_FLOAT)
10641 aarch64_err_no_fpadvsimd (mode, "varargs");
10643 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10644 unshare_expr (valist), f_vrtop, NULL_TREE);
10645 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10646 unshare_expr (valist), f_vroff, NULL_TREE);
10648 rsize = nregs * UNITS_PER_VREG;
10650 if (is_ha)
10652 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10653 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10655 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10656 && size < UNITS_PER_VREG)
10658 adjust = UNITS_PER_VREG - size;
10661 else
10663 /* TYPE passed in general registers. */
10664 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10665 unshare_expr (valist), f_grtop, NULL_TREE);
10666 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10667 unshare_expr (valist), f_groff, NULL_TREE);
10668 rsize = ROUND_UP (size, UNITS_PER_WORD);
10669 nregs = rsize / UNITS_PER_WORD;
10671 if (align > 8)
10672 dw_align = true;
10674 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10675 && size < UNITS_PER_WORD)
10677 adjust = UNITS_PER_WORD - size;
10681 /* Get a local temporary for the field value. */
10682 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10684 /* Emit code to branch if off >= 0. */
10685 t = build2 (GE_EXPR, boolean_type_node, off,
10686 build_int_cst (TREE_TYPE (off), 0));
10687 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10689 if (dw_align)
10691 /* Emit: offs = (offs + 15) & -16. */
10692 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10693 build_int_cst (TREE_TYPE (off), 15));
10694 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10695 build_int_cst (TREE_TYPE (off), -16));
10696 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10698 else
10699 roundup = NULL;
10701 /* Update ap.__[g|v]r_offs */
10702 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10703 build_int_cst (TREE_TYPE (off), rsize));
10704 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10706 /* String up. */
10707 if (roundup)
10708 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10710 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10711 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10712 build_int_cst (TREE_TYPE (f_off), 0));
10713 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10715 /* String up: make sure the assignment happens before the use. */
10716 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10717 COND_EXPR_ELSE (cond1) = t;
10719 /* Prepare the trees handling the argument that is passed on the stack;
10720 the top level node will store in ON_STACK. */
10721 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10722 if (align > 8)
10724 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10725 t = fold_convert (intDI_type_node, arg);
10726 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10727 build_int_cst (TREE_TYPE (t), 15));
10728 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10729 build_int_cst (TREE_TYPE (t), -16));
10730 t = fold_convert (TREE_TYPE (arg), t);
10731 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10733 else
10734 roundup = NULL;
10735 /* Advance ap.__stack */
10736 t = fold_convert (intDI_type_node, arg);
10737 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10738 build_int_cst (TREE_TYPE (t), size + 7));
10739 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10740 build_int_cst (TREE_TYPE (t), -8));
10741 t = fold_convert (TREE_TYPE (arg), t);
10742 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10743 /* String up roundup and advance. */
10744 if (roundup)
10745 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10746 /* String up with arg */
10747 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10748 /* Big-endianness related address adjustment. */
10749 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10750 && size < UNITS_PER_WORD)
10752 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10753 size_int (UNITS_PER_WORD - size));
10754 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10757 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10758 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10760 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10761 t = off;
10762 if (adjust)
10763 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10764 build_int_cst (TREE_TYPE (off), adjust));
10766 t = fold_convert (sizetype, t);
10767 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10769 if (is_ha)
10771 /* type ha; // treat as "struct {ftype field[n];}"
10772 ... [computing offs]
10773 for (i = 0; i <nregs; ++i, offs += 16)
10774 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10775 return ha; */
10776 int i;
10777 tree tmp_ha, field_t, field_ptr_t;
10779 /* Declare a local variable. */
10780 tmp_ha = create_tmp_var_raw (type, "ha");
10781 gimple_add_tmp_var (tmp_ha);
10783 /* Establish the base type. */
10784 switch (ag_mode)
10786 case E_SFmode:
10787 field_t = float_type_node;
10788 field_ptr_t = float_ptr_type_node;
10789 break;
10790 case E_DFmode:
10791 field_t = double_type_node;
10792 field_ptr_t = double_ptr_type_node;
10793 break;
10794 case E_TFmode:
10795 field_t = long_double_type_node;
10796 field_ptr_t = long_double_ptr_type_node;
10797 break;
10798 case E_HFmode:
10799 field_t = aarch64_fp16_type_node;
10800 field_ptr_t = aarch64_fp16_ptr_type_node;
10801 break;
10802 case E_V2SImode:
10803 case E_V4SImode:
10805 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10806 field_t = build_vector_type_for_mode (innertype, ag_mode);
10807 field_ptr_t = build_pointer_type (field_t);
10809 break;
10810 default:
10811 gcc_assert (0);
10814 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10815 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10816 addr = t;
10817 t = fold_convert (field_ptr_t, addr);
10818 t = build2 (MODIFY_EXPR, field_t,
10819 build1 (INDIRECT_REF, field_t, tmp_ha),
10820 build1 (INDIRECT_REF, field_t, t));
10822 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10823 for (i = 1; i < nregs; ++i)
10825 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10826 u = fold_convert (field_ptr_t, addr);
10827 u = build2 (MODIFY_EXPR, field_t,
10828 build2 (MEM_REF, field_t, tmp_ha,
10829 build_int_cst (field_ptr_t,
10830 (i *
10831 int_size_in_bytes (field_t)))),
10832 build1 (INDIRECT_REF, field_t, u));
10833 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10836 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10837 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10840 COND_EXPR_ELSE (cond2) = t;
10841 addr = fold_convert (build_pointer_type (type), cond1);
10842 addr = build_va_arg_indirect_ref (addr);
10844 if (indirect_p)
10845 addr = build_va_arg_indirect_ref (addr);
10847 return addr;
10850 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10852 static void
10853 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10854 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10855 int no_rtl)
10857 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10858 CUMULATIVE_ARGS local_cum;
10859 int gr_saved = cfun->va_list_gpr_size;
10860 int vr_saved = cfun->va_list_fpr_size;
10862 /* The caller has advanced CUM up to, but not beyond, the last named
10863 argument. Advance a local copy of CUM past the last "real" named
10864 argument, to find out how many registers are left over. */
10865 local_cum = *cum;
10866 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10868 /* Found out how many registers we need to save.
10869 Honor tree-stdvar analysis results. */
10870 if (cfun->va_list_gpr_size)
10871 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10872 cfun->va_list_gpr_size / UNITS_PER_WORD);
10873 if (cfun->va_list_fpr_size)
10874 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10875 cfun->va_list_fpr_size / UNITS_PER_VREG);
10877 if (!TARGET_FLOAT)
10879 gcc_assert (local_cum.aapcs_nvrn == 0);
10880 vr_saved = 0;
10883 if (!no_rtl)
10885 if (gr_saved > 0)
10887 rtx ptr, mem;
10889 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10890 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10891 - gr_saved * UNITS_PER_WORD);
10892 mem = gen_frame_mem (BLKmode, ptr);
10893 set_mem_alias_set (mem, get_varargs_alias_set ());
10895 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10896 mem, gr_saved);
10898 if (vr_saved > 0)
10900 /* We can't use move_block_from_reg, because it will use
10901 the wrong mode, storing D regs only. */
10902 machine_mode mode = TImode;
10903 int off, i, vr_start;
10905 /* Set OFF to the offset from virtual_incoming_args_rtx of
10906 the first vector register. The VR save area lies below
10907 the GR one, and is aligned to 16 bytes. */
10908 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10909 STACK_BOUNDARY / BITS_PER_UNIT);
10910 off -= vr_saved * UNITS_PER_VREG;
10912 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10913 for (i = 0; i < vr_saved; ++i)
10915 rtx ptr, mem;
10917 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10918 mem = gen_frame_mem (mode, ptr);
10919 set_mem_alias_set (mem, get_varargs_alias_set ());
10920 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10921 off += UNITS_PER_VREG;
10926 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10927 any complication of having crtl->args.pretend_args_size changed. */
10928 cfun->machine->frame.saved_varargs_size
10929 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10930 STACK_BOUNDARY / BITS_PER_UNIT)
10931 + vr_saved * UNITS_PER_VREG);
10934 static void
10935 aarch64_conditional_register_usage (void)
10937 int i;
10938 if (!TARGET_FLOAT)
10940 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10942 fixed_regs[i] = 1;
10943 call_used_regs[i] = 1;
10948 /* Walk down the type tree of TYPE counting consecutive base elements.
10949 If *MODEP is VOIDmode, then set it to the first valid floating point
10950 type. If a non-floating point type is found, or if a floating point
10951 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10952 otherwise return the count in the sub-tree. */
10953 static int
10954 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10956 machine_mode mode;
10957 HOST_WIDE_INT size;
10959 switch (TREE_CODE (type))
10961 case REAL_TYPE:
10962 mode = TYPE_MODE (type);
10963 if (mode != DFmode && mode != SFmode
10964 && mode != TFmode && mode != HFmode)
10965 return -1;
10967 if (*modep == VOIDmode)
10968 *modep = mode;
10970 if (*modep == mode)
10971 return 1;
10973 break;
10975 case COMPLEX_TYPE:
10976 mode = TYPE_MODE (TREE_TYPE (type));
10977 if (mode != DFmode && mode != SFmode
10978 && mode != TFmode && mode != HFmode)
10979 return -1;
10981 if (*modep == VOIDmode)
10982 *modep = mode;
10984 if (*modep == mode)
10985 return 2;
10987 break;
10989 case VECTOR_TYPE:
10990 /* Use V2SImode and V4SImode as representatives of all 64-bit
10991 and 128-bit vector types. */
10992 size = int_size_in_bytes (type);
10993 switch (size)
10995 case 8:
10996 mode = V2SImode;
10997 break;
10998 case 16:
10999 mode = V4SImode;
11000 break;
11001 default:
11002 return -1;
11005 if (*modep == VOIDmode)
11006 *modep = mode;
11008 /* Vector modes are considered to be opaque: two vectors are
11009 equivalent for the purposes of being homogeneous aggregates
11010 if they are the same size. */
11011 if (*modep == mode)
11012 return 1;
11014 break;
11016 case ARRAY_TYPE:
11018 int count;
11019 tree index = TYPE_DOMAIN (type);
11021 /* Can't handle incomplete types nor sizes that are not
11022 fixed. */
11023 if (!COMPLETE_TYPE_P (type)
11024 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11025 return -1;
11027 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11028 if (count == -1
11029 || !index
11030 || !TYPE_MAX_VALUE (index)
11031 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11032 || !TYPE_MIN_VALUE (index)
11033 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11034 || count < 0)
11035 return -1;
11037 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11038 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11040 /* There must be no padding. */
11041 if (wi::to_wide (TYPE_SIZE (type))
11042 != count * GET_MODE_BITSIZE (*modep))
11043 return -1;
11045 return count;
11048 case RECORD_TYPE:
11050 int count = 0;
11051 int sub_count;
11052 tree field;
11054 /* Can't handle incomplete types nor sizes that are not
11055 fixed. */
11056 if (!COMPLETE_TYPE_P (type)
11057 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11058 return -1;
11060 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11062 if (TREE_CODE (field) != FIELD_DECL)
11063 continue;
11065 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11066 if (sub_count < 0)
11067 return -1;
11068 count += sub_count;
11071 /* There must be no padding. */
11072 if (wi::to_wide (TYPE_SIZE (type))
11073 != count * GET_MODE_BITSIZE (*modep))
11074 return -1;
11076 return count;
11079 case UNION_TYPE:
11080 case QUAL_UNION_TYPE:
11082 /* These aren't very interesting except in a degenerate case. */
11083 int count = 0;
11084 int sub_count;
11085 tree field;
11087 /* Can't handle incomplete types nor sizes that are not
11088 fixed. */
11089 if (!COMPLETE_TYPE_P (type)
11090 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11091 return -1;
11093 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11095 if (TREE_CODE (field) != FIELD_DECL)
11096 continue;
11098 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11099 if (sub_count < 0)
11100 return -1;
11101 count = count > sub_count ? count : sub_count;
11104 /* There must be no padding. */
11105 if (wi::to_wide (TYPE_SIZE (type))
11106 != count * GET_MODE_BITSIZE (*modep))
11107 return -1;
11109 return count;
11112 default:
11113 break;
11116 return -1;
11119 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11120 type as described in AAPCS64 \S 4.1.2.
11122 See the comment above aarch64_composite_type_p for the notes on MODE. */
11124 static bool
11125 aarch64_short_vector_p (const_tree type,
11126 machine_mode mode)
11128 HOST_WIDE_INT size = -1;
11130 if (type && TREE_CODE (type) == VECTOR_TYPE)
11131 size = int_size_in_bytes (type);
11132 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11133 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11134 size = GET_MODE_SIZE (mode);
11136 return (size == 8 || size == 16);
11139 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11140 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11141 array types. The C99 floating-point complex types are also considered
11142 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11143 types, which are GCC extensions and out of the scope of AAPCS64, are
11144 treated as composite types here as well.
11146 Note that MODE itself is not sufficient in determining whether a type
11147 is such a composite type or not. This is because
11148 stor-layout.c:compute_record_mode may have already changed the MODE
11149 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11150 structure with only one field may have its MODE set to the mode of the
11151 field. Also an integer mode whose size matches the size of the
11152 RECORD_TYPE type may be used to substitute the original mode
11153 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11154 solely relied on. */
11156 static bool
11157 aarch64_composite_type_p (const_tree type,
11158 machine_mode mode)
11160 if (aarch64_short_vector_p (type, mode))
11161 return false;
11163 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11164 return true;
11166 if (mode == BLKmode
11167 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11168 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11169 return true;
11171 return false;
11174 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11175 shall be passed or returned in simd/fp register(s) (providing these
11176 parameter passing registers are available).
11178 Upon successful return, *COUNT returns the number of needed registers,
11179 *BASE_MODE returns the mode of the individual register and when IS_HAF
11180 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11181 floating-point aggregate or a homogeneous short-vector aggregate. */
11183 static bool
11184 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11185 const_tree type,
11186 machine_mode *base_mode,
11187 int *count,
11188 bool *is_ha)
11190 machine_mode new_mode = VOIDmode;
11191 bool composite_p = aarch64_composite_type_p (type, mode);
11193 if (is_ha != NULL) *is_ha = false;
11195 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11196 || aarch64_short_vector_p (type, mode))
11198 *count = 1;
11199 new_mode = mode;
11201 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11203 if (is_ha != NULL) *is_ha = true;
11204 *count = 2;
11205 new_mode = GET_MODE_INNER (mode);
11207 else if (type && composite_p)
11209 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11211 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11213 if (is_ha != NULL) *is_ha = true;
11214 *count = ag_count;
11216 else
11217 return false;
11219 else
11220 return false;
11222 *base_mode = new_mode;
11223 return true;
11226 /* Implement TARGET_STRUCT_VALUE_RTX. */
11228 static rtx
11229 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11230 int incoming ATTRIBUTE_UNUSED)
11232 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11235 /* Implements target hook vector_mode_supported_p. */
11236 static bool
11237 aarch64_vector_mode_supported_p (machine_mode mode)
11239 if (TARGET_SIMD
11240 && (mode == V4SImode || mode == V8HImode
11241 || mode == V16QImode || mode == V2DImode
11242 || mode == V2SImode || mode == V4HImode
11243 || mode == V8QImode || mode == V2SFmode
11244 || mode == V4SFmode || mode == V2DFmode
11245 || mode == V4HFmode || mode == V8HFmode
11246 || mode == V1DFmode))
11247 return true;
11249 return false;
11252 /* Return appropriate SIMD container
11253 for MODE within a vector of WIDTH bits. */
11254 static machine_mode
11255 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11257 gcc_assert (width == 64 || width == 128);
11258 if (TARGET_SIMD)
11260 if (width == 128)
11261 switch (mode)
11263 case E_DFmode:
11264 return V2DFmode;
11265 case E_SFmode:
11266 return V4SFmode;
11267 case E_HFmode:
11268 return V8HFmode;
11269 case E_SImode:
11270 return V4SImode;
11271 case E_HImode:
11272 return V8HImode;
11273 case E_QImode:
11274 return V16QImode;
11275 case E_DImode:
11276 return V2DImode;
11277 default:
11278 break;
11280 else
11281 switch (mode)
11283 case E_SFmode:
11284 return V2SFmode;
11285 case E_HFmode:
11286 return V4HFmode;
11287 case E_SImode:
11288 return V2SImode;
11289 case E_HImode:
11290 return V4HImode;
11291 case E_QImode:
11292 return V8QImode;
11293 default:
11294 break;
11297 return word_mode;
11300 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11301 static machine_mode
11302 aarch64_preferred_simd_mode (scalar_mode mode)
11304 return aarch64_simd_container_mode (mode, 128);
11307 /* Return the bitmask of possible vector sizes for the vectorizer
11308 to iterate over. */
11309 static unsigned int
11310 aarch64_autovectorize_vector_sizes (void)
11312 return (16 | 8);
11315 /* Implement TARGET_MANGLE_TYPE. */
11317 static const char *
11318 aarch64_mangle_type (const_tree type)
11320 /* The AArch64 ABI documents say that "__va_list" has to be
11321 managled as if it is in the "std" namespace. */
11322 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11323 return "St9__va_list";
11325 /* Half-precision float. */
11326 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11327 return "Dh";
11329 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11330 builtin types. */
11331 if (TYPE_NAME (type) != NULL)
11332 return aarch64_mangle_builtin_type (type);
11334 /* Use the default mangling. */
11335 return NULL;
11338 /* Find the first rtx_insn before insn that will generate an assembly
11339 instruction. */
11341 static rtx_insn *
11342 aarch64_prev_real_insn (rtx_insn *insn)
11344 if (!insn)
11345 return NULL;
11349 insn = prev_real_insn (insn);
11351 while (insn && recog_memoized (insn) < 0);
11353 return insn;
11356 static bool
11357 is_madd_op (enum attr_type t1)
11359 unsigned int i;
11360 /* A number of these may be AArch32 only. */
11361 enum attr_type mlatypes[] = {
11362 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11363 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11364 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11367 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11369 if (t1 == mlatypes[i])
11370 return true;
11373 return false;
11376 /* Check if there is a register dependency between a load and the insn
11377 for which we hold recog_data. */
11379 static bool
11380 dep_between_memop_and_curr (rtx memop)
11382 rtx load_reg;
11383 int opno;
11385 gcc_assert (GET_CODE (memop) == SET);
11387 if (!REG_P (SET_DEST (memop)))
11388 return false;
11390 load_reg = SET_DEST (memop);
11391 for (opno = 1; opno < recog_data.n_operands; opno++)
11393 rtx operand = recog_data.operand[opno];
11394 if (REG_P (operand)
11395 && reg_overlap_mentioned_p (load_reg, operand))
11396 return true;
11399 return false;
11403 /* When working around the Cortex-A53 erratum 835769,
11404 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11405 instruction and has a preceding memory instruction such that a NOP
11406 should be inserted between them. */
11408 bool
11409 aarch64_madd_needs_nop (rtx_insn* insn)
11411 enum attr_type attr_type;
11412 rtx_insn *prev;
11413 rtx body;
11415 if (!TARGET_FIX_ERR_A53_835769)
11416 return false;
11418 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11419 return false;
11421 attr_type = get_attr_type (insn);
11422 if (!is_madd_op (attr_type))
11423 return false;
11425 prev = aarch64_prev_real_insn (insn);
11426 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11427 Restore recog state to INSN to avoid state corruption. */
11428 extract_constrain_insn_cached (insn);
11430 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11431 return false;
11433 body = single_set (prev);
11435 /* If the previous insn is a memory op and there is no dependency between
11436 it and the DImode madd, emit a NOP between them. If body is NULL then we
11437 have a complex memory operation, probably a load/store pair.
11438 Be conservative for now and emit a NOP. */
11439 if (GET_MODE (recog_data.operand[0]) == DImode
11440 && (!body || !dep_between_memop_and_curr (body)))
11441 return true;
11443 return false;
11448 /* Implement FINAL_PRESCAN_INSN. */
11450 void
11451 aarch64_final_prescan_insn (rtx_insn *insn)
11453 if (aarch64_madd_needs_nop (insn))
11454 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11458 /* Return the equivalent letter for size. */
11459 static char
11460 sizetochar (int size)
11462 switch (size)
11464 case 64: return 'd';
11465 case 32: return 's';
11466 case 16: return 'h';
11467 case 8 : return 'b';
11468 default: gcc_unreachable ();
11472 /* Return true iff x is a uniform vector of floating-point
11473 constants, and the constant can be represented in
11474 quarter-precision form. Note, as aarch64_float_const_representable
11475 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11476 static bool
11477 aarch64_vect_float_const_representable_p (rtx x)
11479 rtx elt;
11480 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11481 && const_vec_duplicate_p (x, &elt)
11482 && aarch64_float_const_representable_p (elt));
11485 /* Return true for valid and false for invalid. */
11486 bool
11487 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11488 struct simd_immediate_info *info,
11489 enum simd_immediate_check which)
11491 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11492 matches = 1; \
11493 for (i = 0; i < idx; i += (STRIDE)) \
11494 if (!(TEST)) \
11495 matches = 0; \
11496 if (matches) \
11498 immtype = (CLASS); \
11499 elsize = (ELSIZE); \
11500 eshift = (SHIFT); \
11501 emvn = (NEG); \
11502 break; \
11505 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11506 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11507 unsigned char bytes[16];
11508 int immtype = -1, matches;
11509 unsigned int invmask = inverse ? 0xff : 0;
11510 int eshift, emvn;
11512 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11514 if (! (aarch64_simd_imm_zero_p (op, mode)
11515 || aarch64_vect_float_const_representable_p (op)))
11516 return false;
11518 if (info)
11520 rtx elt = CONST_VECTOR_ELT (op, 0);
11521 scalar_float_mode elt_mode
11522 = as_a <scalar_float_mode> (GET_MODE (elt));
11524 info->value = elt;
11525 info->element_width = GET_MODE_BITSIZE (elt_mode);
11526 info->mvn = false;
11527 info->shift = 0;
11530 return true;
11533 /* Splat vector constant out into a byte vector. */
11534 for (i = 0; i < n_elts; i++)
11536 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11537 it must be laid out in the vector register in reverse order. */
11538 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11539 unsigned HOST_WIDE_INT elpart;
11541 gcc_assert (CONST_INT_P (el));
11542 elpart = INTVAL (el);
11544 for (unsigned int byte = 0; byte < innersize; byte++)
11546 bytes[idx++] = (elpart & 0xff) ^ invmask;
11547 elpart >>= BITS_PER_UNIT;
11552 /* Sanity check. */
11553 gcc_assert (idx == GET_MODE_SIZE (mode));
11557 if (which & AARCH64_CHECK_ORR)
11559 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11560 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11562 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11563 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11565 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11566 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11568 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11569 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11571 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11573 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11576 if (which & AARCH64_CHECK_BIC)
11578 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11579 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11581 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11582 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11584 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11585 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11587 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11588 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11590 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11592 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11595 /* Shifting ones / 8-bit / 64-bit variants only checked
11596 for 'ALL' (MOVI/MVNI). */
11597 if (which == AARCH64_CHECK_MOV)
11599 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11600 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11602 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11603 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11605 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11606 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11608 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11609 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11611 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11613 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11614 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11617 while (0);
11619 if (immtype == -1)
11620 return false;
11622 if (info)
11624 info->element_width = elsize;
11625 info->mvn = emvn != 0;
11626 info->shift = eshift;
11628 unsigned HOST_WIDE_INT imm = 0;
11630 if (immtype >= 12 && immtype <= 15)
11631 info->msl = true;
11633 /* Un-invert bytes of recognized vector, if necessary. */
11634 if (invmask != 0)
11635 for (i = 0; i < idx; i++)
11636 bytes[i] ^= invmask;
11638 if (immtype == 17)
11640 /* FIXME: Broken on 32-bit H_W_I hosts. */
11641 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11643 for (i = 0; i < 8; i++)
11644 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11645 << (i * BITS_PER_UNIT);
11648 info->value = GEN_INT (imm);
11650 else
11652 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11653 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11655 /* Construct 'abcdefgh' because the assembler cannot handle
11656 generic constants. */
11657 if (info->mvn)
11658 imm = ~imm;
11659 imm = (imm >> info->shift) & 0xff;
11660 info->value = GEN_INT (imm);
11664 return true;
11665 #undef CHECK
11668 /* Check of immediate shift constants are within range. */
11669 bool
11670 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11672 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11673 if (left)
11674 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11675 else
11676 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11679 /* Return true if X is a uniform vector where all elements
11680 are either the floating-point constant 0.0 or the
11681 integer constant 0. */
11682 bool
11683 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11685 return x == CONST0_RTX (mode);
11689 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11690 operation of width WIDTH at bit position POS. */
11693 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11695 gcc_assert (CONST_INT_P (width));
11696 gcc_assert (CONST_INT_P (pos));
11698 unsigned HOST_WIDE_INT mask
11699 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11700 return GEN_INT (mask << UINTVAL (pos));
11703 bool
11704 aarch64_mov_operand_p (rtx x, machine_mode mode)
11706 if (GET_CODE (x) == HIGH
11707 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11708 return true;
11710 if (CONST_INT_P (x))
11711 return true;
11713 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11714 return true;
11716 return aarch64_classify_symbolic_expression (x)
11717 == SYMBOL_TINY_ABSOLUTE;
11720 /* Return a const_int vector of VAL. */
11722 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11724 int nunits = GET_MODE_NUNITS (mode);
11725 rtvec v = rtvec_alloc (nunits);
11726 int i;
11728 rtx cache = GEN_INT (val);
11730 for (i=0; i < nunits; i++)
11731 RTVEC_ELT (v, i) = cache;
11733 return gen_rtx_CONST_VECTOR (mode, v);
11736 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11738 bool
11739 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11741 machine_mode vmode;
11743 vmode = aarch64_preferred_simd_mode (mode);
11744 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11745 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11748 /* Construct and return a PARALLEL RTX vector with elements numbering the
11749 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11750 the vector - from the perspective of the architecture. This does not
11751 line up with GCC's perspective on lane numbers, so we end up with
11752 different masks depending on our target endian-ness. The diagram
11753 below may help. We must draw the distinction when building masks
11754 which select one half of the vector. An instruction selecting
11755 architectural low-lanes for a big-endian target, must be described using
11756 a mask selecting GCC high-lanes.
11758 Big-Endian Little-Endian
11760 GCC 0 1 2 3 3 2 1 0
11761 | x | x | x | x | | x | x | x | x |
11762 Architecture 3 2 1 0 3 2 1 0
11764 Low Mask: { 2, 3 } { 0, 1 }
11765 High Mask: { 0, 1 } { 2, 3 }
11769 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11771 int nunits = GET_MODE_NUNITS (mode);
11772 rtvec v = rtvec_alloc (nunits / 2);
11773 int high_base = nunits / 2;
11774 int low_base = 0;
11775 int base;
11776 rtx t1;
11777 int i;
11779 if (BYTES_BIG_ENDIAN)
11780 base = high ? low_base : high_base;
11781 else
11782 base = high ? high_base : low_base;
11784 for (i = 0; i < nunits / 2; i++)
11785 RTVEC_ELT (v, i) = GEN_INT (base + i);
11787 t1 = gen_rtx_PARALLEL (mode, v);
11788 return t1;
11791 /* Check OP for validity as a PARALLEL RTX vector with elements
11792 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11793 from the perspective of the architecture. See the diagram above
11794 aarch64_simd_vect_par_cnst_half for more details. */
11796 bool
11797 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11798 bool high)
11800 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11801 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11802 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11803 int i = 0;
11805 if (!VECTOR_MODE_P (mode))
11806 return false;
11808 if (count_op != count_ideal)
11809 return false;
11811 for (i = 0; i < count_ideal; i++)
11813 rtx elt_op = XVECEXP (op, 0, i);
11814 rtx elt_ideal = XVECEXP (ideal, 0, i);
11816 if (!CONST_INT_P (elt_op)
11817 || INTVAL (elt_ideal) != INTVAL (elt_op))
11818 return false;
11820 return true;
11823 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11824 HIGH (exclusive). */
11825 void
11826 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11827 const_tree exp)
11829 HOST_WIDE_INT lane;
11830 gcc_assert (CONST_INT_P (operand));
11831 lane = INTVAL (operand);
11833 if (lane < low || lane >= high)
11835 if (exp)
11836 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11837 else
11838 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11842 /* Return TRUE if OP is a valid vector addressing mode. */
11843 bool
11844 aarch64_simd_mem_operand_p (rtx op)
11846 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11847 || REG_P (XEXP (op, 0)));
11850 /* Emit a register copy from operand to operand, taking care not to
11851 early-clobber source registers in the process.
11853 COUNT is the number of components into which the copy needs to be
11854 decomposed. */
11855 void
11856 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11857 unsigned int count)
11859 unsigned int i;
11860 int rdest = REGNO (operands[0]);
11861 int rsrc = REGNO (operands[1]);
11863 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11864 || rdest < rsrc)
11865 for (i = 0; i < count; i++)
11866 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11867 gen_rtx_REG (mode, rsrc + i));
11868 else
11869 for (i = 0; i < count; i++)
11870 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11871 gen_rtx_REG (mode, rsrc + count - i - 1));
11874 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11875 one of VSTRUCT modes: OI, CI, or XI. */
11877 aarch64_simd_attr_length_rglist (machine_mode mode)
11879 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11882 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11883 alignment of a vector to 128 bits. */
11884 static HOST_WIDE_INT
11885 aarch64_simd_vector_alignment (const_tree type)
11887 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11888 return MIN (align, 128);
11891 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11892 static bool
11893 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11895 if (is_packed)
11896 return false;
11898 /* We guarantee alignment for vectors up to 128-bits. */
11899 if (tree_int_cst_compare (TYPE_SIZE (type),
11900 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11901 return false;
11903 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11904 return true;
11907 /* Return true if the vector misalignment factor is supported by the
11908 target. */
11909 static bool
11910 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11911 const_tree type, int misalignment,
11912 bool is_packed)
11914 if (TARGET_SIMD && STRICT_ALIGNMENT)
11916 /* Return if movmisalign pattern is not supported for this mode. */
11917 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11918 return false;
11920 /* Misalignment factor is unknown at compile time. */
11921 if (misalignment == -1)
11922 return false;
11924 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11925 is_packed);
11928 /* If VALS is a vector constant that can be loaded into a register
11929 using DUP, generate instructions to do so and return an RTX to
11930 assign to the register. Otherwise return NULL_RTX. */
11931 static rtx
11932 aarch64_simd_dup_constant (rtx vals)
11934 machine_mode mode = GET_MODE (vals);
11935 machine_mode inner_mode = GET_MODE_INNER (mode);
11936 rtx x;
11938 if (!const_vec_duplicate_p (vals, &x))
11939 return NULL_RTX;
11941 /* We can load this constant by using DUP and a constant in a
11942 single ARM register. This will be cheaper than a vector
11943 load. */
11944 x = copy_to_mode_reg (inner_mode, x);
11945 return gen_rtx_VEC_DUPLICATE (mode, x);
11949 /* Generate code to load VALS, which is a PARALLEL containing only
11950 constants (for vec_init) or CONST_VECTOR, efficiently into a
11951 register. Returns an RTX to copy into the register, or NULL_RTX
11952 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11953 static rtx
11954 aarch64_simd_make_constant (rtx vals)
11956 machine_mode mode = GET_MODE (vals);
11957 rtx const_dup;
11958 rtx const_vec = NULL_RTX;
11959 int n_elts = GET_MODE_NUNITS (mode);
11960 int n_const = 0;
11961 int i;
11963 if (GET_CODE (vals) == CONST_VECTOR)
11964 const_vec = vals;
11965 else if (GET_CODE (vals) == PARALLEL)
11967 /* A CONST_VECTOR must contain only CONST_INTs and
11968 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11969 Only store valid constants in a CONST_VECTOR. */
11970 for (i = 0; i < n_elts; ++i)
11972 rtx x = XVECEXP (vals, 0, i);
11973 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11974 n_const++;
11976 if (n_const == n_elts)
11977 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11979 else
11980 gcc_unreachable ();
11982 if (const_vec != NULL_RTX
11983 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11984 /* Load using MOVI/MVNI. */
11985 return const_vec;
11986 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11987 /* Loaded using DUP. */
11988 return const_dup;
11989 else if (const_vec != NULL_RTX)
11990 /* Load from constant pool. We can not take advantage of single-cycle
11991 LD1 because we need a PC-relative addressing mode. */
11992 return const_vec;
11993 else
11994 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11995 We can not construct an initializer. */
11996 return NULL_RTX;
11999 /* Expand a vector initialisation sequence, such that TARGET is
12000 initialised to contain VALS. */
12002 void
12003 aarch64_expand_vector_init (rtx target, rtx vals)
12005 machine_mode mode = GET_MODE (target);
12006 scalar_mode inner_mode = GET_MODE_INNER (mode);
12007 /* The number of vector elements. */
12008 int n_elts = GET_MODE_NUNITS (mode);
12009 /* The number of vector elements which are not constant. */
12010 int n_var = 0;
12011 rtx any_const = NULL_RTX;
12012 /* The first element of vals. */
12013 rtx v0 = XVECEXP (vals, 0, 0);
12014 bool all_same = true;
12016 /* Count the number of variable elements to initialise. */
12017 for (int i = 0; i < n_elts; ++i)
12019 rtx x = XVECEXP (vals, 0, i);
12020 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12021 ++n_var;
12022 else
12023 any_const = x;
12025 all_same &= rtx_equal_p (x, v0);
12028 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12029 how best to handle this. */
12030 if (n_var == 0)
12032 rtx constant = aarch64_simd_make_constant (vals);
12033 if (constant != NULL_RTX)
12035 emit_move_insn (target, constant);
12036 return;
12040 /* Splat a single non-constant element if we can. */
12041 if (all_same)
12043 rtx x = copy_to_mode_reg (inner_mode, v0);
12044 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12045 return;
12048 enum insn_code icode = optab_handler (vec_set_optab, mode);
12049 gcc_assert (icode != CODE_FOR_nothing);
12051 /* If there are only variable elements, try to optimize
12052 the insertion using dup for the most common element
12053 followed by insertions. */
12055 /* The algorithm will fill matches[*][0] with the earliest matching element,
12056 and matches[X][1] with the count of duplicate elements (if X is the
12057 earliest element which has duplicates). */
12059 if (n_var == n_elts && n_elts <= 16)
12061 int matches[16][2] = {0};
12062 for (int i = 0; i < n_elts; i++)
12064 for (int j = 0; j <= i; j++)
12066 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12068 matches[i][0] = j;
12069 matches[j][1]++;
12070 break;
12074 int maxelement = 0;
12075 int maxv = 0;
12076 for (int i = 0; i < n_elts; i++)
12077 if (matches[i][1] > maxv)
12079 maxelement = i;
12080 maxv = matches[i][1];
12083 /* Create a duplicate of the most common element. */
12084 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12085 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12087 /* Insert the rest. */
12088 for (int i = 0; i < n_elts; i++)
12090 rtx x = XVECEXP (vals, 0, i);
12091 if (matches[i][0] == maxelement)
12092 continue;
12093 x = copy_to_mode_reg (inner_mode, x);
12094 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12096 return;
12099 /* Initialise a vector which is part-variable. We want to first try
12100 to build those lanes which are constant in the most efficient way we
12101 can. */
12102 if (n_var != n_elts)
12104 rtx copy = copy_rtx (vals);
12106 /* Load constant part of vector. We really don't care what goes into the
12107 parts we will overwrite, but we're more likely to be able to load the
12108 constant efficiently if it has fewer, larger, repeating parts
12109 (see aarch64_simd_valid_immediate). */
12110 for (int i = 0; i < n_elts; i++)
12112 rtx x = XVECEXP (vals, 0, i);
12113 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12114 continue;
12115 rtx subst = any_const;
12116 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12118 /* Look in the copied vector, as more elements are const. */
12119 rtx test = XVECEXP (copy, 0, i ^ bit);
12120 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12122 subst = test;
12123 break;
12126 XVECEXP (copy, 0, i) = subst;
12128 aarch64_expand_vector_init (target, copy);
12131 /* Insert the variable lanes directly. */
12132 for (int i = 0; i < n_elts; i++)
12134 rtx x = XVECEXP (vals, 0, i);
12135 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12136 continue;
12137 x = copy_to_mode_reg (inner_mode, x);
12138 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12142 static unsigned HOST_WIDE_INT
12143 aarch64_shift_truncation_mask (machine_mode mode)
12145 return
12146 (!SHIFT_COUNT_TRUNCATED
12147 || aarch64_vector_mode_supported_p (mode)
12148 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12151 /* Select a format to encode pointers in exception handling data. */
12153 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12155 int type;
12156 switch (aarch64_cmodel)
12158 case AARCH64_CMODEL_TINY:
12159 case AARCH64_CMODEL_TINY_PIC:
12160 case AARCH64_CMODEL_SMALL:
12161 case AARCH64_CMODEL_SMALL_PIC:
12162 case AARCH64_CMODEL_SMALL_SPIC:
12163 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12164 for everything. */
12165 type = DW_EH_PE_sdata4;
12166 break;
12167 default:
12168 /* No assumptions here. 8-byte relocs required. */
12169 type = DW_EH_PE_sdata8;
12170 break;
12172 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12175 /* The last .arch and .tune assembly strings that we printed. */
12176 static std::string aarch64_last_printed_arch_string;
12177 static std::string aarch64_last_printed_tune_string;
12179 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12180 by the function fndecl. */
12182 void
12183 aarch64_declare_function_name (FILE *stream, const char* name,
12184 tree fndecl)
12186 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12188 struct cl_target_option *targ_options;
12189 if (target_parts)
12190 targ_options = TREE_TARGET_OPTION (target_parts);
12191 else
12192 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12193 gcc_assert (targ_options);
12195 const struct processor *this_arch
12196 = aarch64_get_arch (targ_options->x_explicit_arch);
12198 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12199 std::string extension
12200 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12201 this_arch->flags);
12202 /* Only update the assembler .arch string if it is distinct from the last
12203 such string we printed. */
12204 std::string to_print = this_arch->name + extension;
12205 if (to_print != aarch64_last_printed_arch_string)
12207 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12208 aarch64_last_printed_arch_string = to_print;
12211 /* Print the cpu name we're tuning for in the comments, might be
12212 useful to readers of the generated asm. Do it only when it changes
12213 from function to function and verbose assembly is requested. */
12214 const struct processor *this_tune
12215 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12217 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12219 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12220 this_tune->name);
12221 aarch64_last_printed_tune_string = this_tune->name;
12224 /* Don't forget the type directive for ELF. */
12225 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12226 ASM_OUTPUT_LABEL (stream, name);
12229 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12231 static void
12232 aarch64_start_file (void)
12234 struct cl_target_option *default_options
12235 = TREE_TARGET_OPTION (target_option_default_node);
12237 const struct processor *default_arch
12238 = aarch64_get_arch (default_options->x_explicit_arch);
12239 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12240 std::string extension
12241 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12242 default_arch->flags);
12244 aarch64_last_printed_arch_string = default_arch->name + extension;
12245 aarch64_last_printed_tune_string = "";
12246 asm_fprintf (asm_out_file, "\t.arch %s\n",
12247 aarch64_last_printed_arch_string.c_str ());
12249 default_file_start ();
12252 /* Emit load exclusive. */
12254 static void
12255 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12256 rtx mem, rtx model_rtx)
12258 rtx (*gen) (rtx, rtx, rtx);
12260 switch (mode)
12262 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12263 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12264 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12265 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12266 default:
12267 gcc_unreachable ();
12270 emit_insn (gen (rval, mem, model_rtx));
12273 /* Emit store exclusive. */
12275 static void
12276 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12277 rtx rval, rtx mem, rtx model_rtx)
12279 rtx (*gen) (rtx, rtx, rtx, rtx);
12281 switch (mode)
12283 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12284 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12285 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12286 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12287 default:
12288 gcc_unreachable ();
12291 emit_insn (gen (bval, rval, mem, model_rtx));
12294 /* Mark the previous jump instruction as unlikely. */
12296 static void
12297 aarch64_emit_unlikely_jump (rtx insn)
12299 rtx_insn *jump = emit_jump_insn (insn);
12300 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12303 /* Expand a compare and swap pattern. */
12305 void
12306 aarch64_expand_compare_and_swap (rtx operands[])
12308 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12309 machine_mode mode, cmp_mode;
12310 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12311 int idx;
12312 gen_cas_fn gen;
12313 const gen_cas_fn split_cas[] =
12315 gen_aarch64_compare_and_swapqi,
12316 gen_aarch64_compare_and_swaphi,
12317 gen_aarch64_compare_and_swapsi,
12318 gen_aarch64_compare_and_swapdi
12320 const gen_cas_fn atomic_cas[] =
12322 gen_aarch64_compare_and_swapqi_lse,
12323 gen_aarch64_compare_and_swaphi_lse,
12324 gen_aarch64_compare_and_swapsi_lse,
12325 gen_aarch64_compare_and_swapdi_lse
12328 bval = operands[0];
12329 rval = operands[1];
12330 mem = operands[2];
12331 oldval = operands[3];
12332 newval = operands[4];
12333 is_weak = operands[5];
12334 mod_s = operands[6];
12335 mod_f = operands[7];
12336 mode = GET_MODE (mem);
12337 cmp_mode = mode;
12339 /* Normally the succ memory model must be stronger than fail, but in the
12340 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12341 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12343 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12344 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12345 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12347 switch (mode)
12349 case E_QImode:
12350 case E_HImode:
12351 /* For short modes, we're going to perform the comparison in SImode,
12352 so do the zero-extension now. */
12353 cmp_mode = SImode;
12354 rval = gen_reg_rtx (SImode);
12355 oldval = convert_modes (SImode, mode, oldval, true);
12356 /* Fall through. */
12358 case E_SImode:
12359 case E_DImode:
12360 /* Force the value into a register if needed. */
12361 if (!aarch64_plus_operand (oldval, mode))
12362 oldval = force_reg (cmp_mode, oldval);
12363 break;
12365 default:
12366 gcc_unreachable ();
12369 switch (mode)
12371 case E_QImode: idx = 0; break;
12372 case E_HImode: idx = 1; break;
12373 case E_SImode: idx = 2; break;
12374 case E_DImode: idx = 3; break;
12375 default:
12376 gcc_unreachable ();
12378 if (TARGET_LSE)
12379 gen = atomic_cas[idx];
12380 else
12381 gen = split_cas[idx];
12383 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12385 if (mode == QImode || mode == HImode)
12386 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12388 x = gen_rtx_REG (CCmode, CC_REGNUM);
12389 x = gen_rtx_EQ (SImode, x, const0_rtx);
12390 emit_insn (gen_rtx_SET (bval, x));
12393 /* Test whether the target supports using a atomic load-operate instruction.
12394 CODE is the operation and AFTER is TRUE if the data in memory after the
12395 operation should be returned and FALSE if the data before the operation
12396 should be returned. Returns FALSE if the operation isn't supported by the
12397 architecture. */
12399 bool
12400 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12402 if (!TARGET_LSE)
12403 return false;
12405 switch (code)
12407 case SET:
12408 case AND:
12409 case IOR:
12410 case XOR:
12411 case MINUS:
12412 case PLUS:
12413 return true;
12414 default:
12415 return false;
12419 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12420 sequence implementing an atomic operation. */
12422 static void
12423 aarch64_emit_post_barrier (enum memmodel model)
12425 const enum memmodel base_model = memmodel_base (model);
12427 if (is_mm_sync (model)
12428 && (base_model == MEMMODEL_ACQUIRE
12429 || base_model == MEMMODEL_ACQ_REL
12430 || base_model == MEMMODEL_SEQ_CST))
12432 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12436 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12437 for the data in memory. EXPECTED is the value expected to be in memory.
12438 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12439 is the memory ordering to use. */
12441 void
12442 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12443 rtx expected, rtx desired,
12444 rtx model)
12446 rtx (*gen) (rtx, rtx, rtx, rtx);
12447 machine_mode mode;
12449 mode = GET_MODE (mem);
12451 switch (mode)
12453 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12454 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12455 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12456 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12457 default:
12458 gcc_unreachable ();
12461 /* Move the expected value into the CAS destination register. */
12462 emit_insn (gen_rtx_SET (rval, expected));
12464 /* Emit the CAS. */
12465 emit_insn (gen (rval, mem, desired, model));
12467 /* Compare the expected value with the value loaded by the CAS, to establish
12468 whether the swap was made. */
12469 aarch64_gen_compare_reg (EQ, rval, expected);
12472 /* Split a compare and swap pattern. */
12474 void
12475 aarch64_split_compare_and_swap (rtx operands[])
12477 rtx rval, mem, oldval, newval, scratch;
12478 machine_mode mode;
12479 bool is_weak;
12480 rtx_code_label *label1, *label2;
12481 rtx x, cond;
12482 enum memmodel model;
12483 rtx model_rtx;
12485 rval = operands[0];
12486 mem = operands[1];
12487 oldval = operands[2];
12488 newval = operands[3];
12489 is_weak = (operands[4] != const0_rtx);
12490 model_rtx = operands[5];
12491 scratch = operands[7];
12492 mode = GET_MODE (mem);
12493 model = memmodel_from_int (INTVAL (model_rtx));
12495 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12496 loop:
12497 .label1:
12498 LD[A]XR rval, [mem]
12499 CBNZ rval, .label2
12500 ST[L]XR scratch, newval, [mem]
12501 CBNZ scratch, .label1
12502 .label2:
12503 CMP rval, 0. */
12504 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12506 label1 = NULL;
12507 if (!is_weak)
12509 label1 = gen_label_rtx ();
12510 emit_label (label1);
12512 label2 = gen_label_rtx ();
12514 /* The initial load can be relaxed for a __sync operation since a final
12515 barrier will be emitted to stop code hoisting. */
12516 if (is_mm_sync (model))
12517 aarch64_emit_load_exclusive (mode, rval, mem,
12518 GEN_INT (MEMMODEL_RELAXED));
12519 else
12520 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12522 if (strong_zero_p)
12524 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12525 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12526 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12527 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12529 else
12531 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12532 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12533 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12534 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12535 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12538 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12540 if (!is_weak)
12542 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12543 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12544 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12545 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12547 else
12549 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12550 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12551 emit_insn (gen_rtx_SET (cond, x));
12554 emit_label (label2);
12555 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12556 to set the condition flags. If this is not used it will be removed by
12557 later passes. */
12558 if (strong_zero_p)
12560 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12561 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12562 emit_insn (gen_rtx_SET (cond, x));
12564 /* Emit any final barrier needed for a __sync operation. */
12565 if (is_mm_sync (model))
12566 aarch64_emit_post_barrier (model);
12569 /* Emit a BIC instruction. */
12571 static void
12572 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12574 rtx shift_rtx = GEN_INT (shift);
12575 rtx (*gen) (rtx, rtx, rtx, rtx);
12577 switch (mode)
12579 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12580 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12581 default:
12582 gcc_unreachable ();
12585 emit_insn (gen (dst, s2, shift_rtx, s1));
12588 /* Emit an atomic swap. */
12590 static void
12591 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12592 rtx mem, rtx model)
12594 rtx (*gen) (rtx, rtx, rtx, rtx);
12596 switch (mode)
12598 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12599 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12600 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12601 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12602 default:
12603 gcc_unreachable ();
12606 emit_insn (gen (dst, mem, value, model));
12609 /* Operations supported by aarch64_emit_atomic_load_op. */
12611 enum aarch64_atomic_load_op_code
12613 AARCH64_LDOP_PLUS, /* A + B */
12614 AARCH64_LDOP_XOR, /* A ^ B */
12615 AARCH64_LDOP_OR, /* A | B */
12616 AARCH64_LDOP_BIC /* A & ~B */
12619 /* Emit an atomic load-operate. */
12621 static void
12622 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12623 machine_mode mode, rtx dst, rtx src,
12624 rtx mem, rtx model)
12626 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12627 const aarch64_atomic_load_op_fn plus[] =
12629 gen_aarch64_atomic_loadaddqi,
12630 gen_aarch64_atomic_loadaddhi,
12631 gen_aarch64_atomic_loadaddsi,
12632 gen_aarch64_atomic_loadadddi
12634 const aarch64_atomic_load_op_fn eor[] =
12636 gen_aarch64_atomic_loadeorqi,
12637 gen_aarch64_atomic_loadeorhi,
12638 gen_aarch64_atomic_loadeorsi,
12639 gen_aarch64_atomic_loadeordi
12641 const aarch64_atomic_load_op_fn ior[] =
12643 gen_aarch64_atomic_loadsetqi,
12644 gen_aarch64_atomic_loadsethi,
12645 gen_aarch64_atomic_loadsetsi,
12646 gen_aarch64_atomic_loadsetdi
12648 const aarch64_atomic_load_op_fn bic[] =
12650 gen_aarch64_atomic_loadclrqi,
12651 gen_aarch64_atomic_loadclrhi,
12652 gen_aarch64_atomic_loadclrsi,
12653 gen_aarch64_atomic_loadclrdi
12655 aarch64_atomic_load_op_fn gen;
12656 int idx = 0;
12658 switch (mode)
12660 case E_QImode: idx = 0; break;
12661 case E_HImode: idx = 1; break;
12662 case E_SImode: idx = 2; break;
12663 case E_DImode: idx = 3; break;
12664 default:
12665 gcc_unreachable ();
12668 switch (code)
12670 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12671 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12672 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12673 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12674 default:
12675 gcc_unreachable ();
12678 emit_insn (gen (dst, mem, src, model));
12681 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12682 location to store the data read from memory. OUT_RESULT is the location to
12683 store the result of the operation. MEM is the memory location to read and
12684 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12685 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12686 be NULL. */
12688 void
12689 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12690 rtx mem, rtx value, rtx model_rtx)
12692 machine_mode mode = GET_MODE (mem);
12693 machine_mode wmode = (mode == DImode ? DImode : SImode);
12694 const bool short_mode = (mode < SImode);
12695 aarch64_atomic_load_op_code ldop_code;
12696 rtx src;
12697 rtx x;
12699 if (out_data)
12700 out_data = gen_lowpart (mode, out_data);
12702 if (out_result)
12703 out_result = gen_lowpart (mode, out_result);
12705 /* Make sure the value is in a register, putting it into a destination
12706 register if it needs to be manipulated. */
12707 if (!register_operand (value, mode)
12708 || code == AND || code == MINUS)
12710 src = out_result ? out_result : out_data;
12711 emit_move_insn (src, gen_lowpart (mode, value));
12713 else
12714 src = value;
12715 gcc_assert (register_operand (src, mode));
12717 /* Preprocess the data for the operation as necessary. If the operation is
12718 a SET then emit a swap instruction and finish. */
12719 switch (code)
12721 case SET:
12722 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12723 return;
12725 case MINUS:
12726 /* Negate the value and treat it as a PLUS. */
12728 rtx neg_src;
12730 /* Resize the value if necessary. */
12731 if (short_mode)
12732 src = gen_lowpart (wmode, src);
12734 neg_src = gen_rtx_NEG (wmode, src);
12735 emit_insn (gen_rtx_SET (src, neg_src));
12737 if (short_mode)
12738 src = gen_lowpart (mode, src);
12740 /* Fall-through. */
12741 case PLUS:
12742 ldop_code = AARCH64_LDOP_PLUS;
12743 break;
12745 case IOR:
12746 ldop_code = AARCH64_LDOP_OR;
12747 break;
12749 case XOR:
12750 ldop_code = AARCH64_LDOP_XOR;
12751 break;
12753 case AND:
12755 rtx not_src;
12757 /* Resize the value if necessary. */
12758 if (short_mode)
12759 src = gen_lowpart (wmode, src);
12761 not_src = gen_rtx_NOT (wmode, src);
12762 emit_insn (gen_rtx_SET (src, not_src));
12764 if (short_mode)
12765 src = gen_lowpart (mode, src);
12767 ldop_code = AARCH64_LDOP_BIC;
12768 break;
12770 default:
12771 /* The operation can't be done with atomic instructions. */
12772 gcc_unreachable ();
12775 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12777 /* If necessary, calculate the data in memory after the update by redoing the
12778 operation from values in registers. */
12779 if (!out_result)
12780 return;
12782 if (short_mode)
12784 src = gen_lowpart (wmode, src);
12785 out_data = gen_lowpart (wmode, out_data);
12786 out_result = gen_lowpart (wmode, out_result);
12789 x = NULL_RTX;
12791 switch (code)
12793 case MINUS:
12794 case PLUS:
12795 x = gen_rtx_PLUS (wmode, out_data, src);
12796 break;
12797 case IOR:
12798 x = gen_rtx_IOR (wmode, out_data, src);
12799 break;
12800 case XOR:
12801 x = gen_rtx_XOR (wmode, out_data, src);
12802 break;
12803 case AND:
12804 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12805 return;
12806 default:
12807 gcc_unreachable ();
12810 emit_set_insn (out_result, x);
12812 return;
12815 /* Split an atomic operation. */
12817 void
12818 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12819 rtx value, rtx model_rtx, rtx cond)
12821 machine_mode mode = GET_MODE (mem);
12822 machine_mode wmode = (mode == DImode ? DImode : SImode);
12823 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12824 const bool is_sync = is_mm_sync (model);
12825 rtx_code_label *label;
12826 rtx x;
12828 /* Split the atomic operation into a sequence. */
12829 label = gen_label_rtx ();
12830 emit_label (label);
12832 if (new_out)
12833 new_out = gen_lowpart (wmode, new_out);
12834 if (old_out)
12835 old_out = gen_lowpart (wmode, old_out);
12836 else
12837 old_out = new_out;
12838 value = simplify_gen_subreg (wmode, value, mode, 0);
12840 /* The initial load can be relaxed for a __sync operation since a final
12841 barrier will be emitted to stop code hoisting. */
12842 if (is_sync)
12843 aarch64_emit_load_exclusive (mode, old_out, mem,
12844 GEN_INT (MEMMODEL_RELAXED));
12845 else
12846 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12848 switch (code)
12850 case SET:
12851 new_out = value;
12852 break;
12854 case NOT:
12855 x = gen_rtx_AND (wmode, old_out, value);
12856 emit_insn (gen_rtx_SET (new_out, x));
12857 x = gen_rtx_NOT (wmode, new_out);
12858 emit_insn (gen_rtx_SET (new_out, x));
12859 break;
12861 case MINUS:
12862 if (CONST_INT_P (value))
12864 value = GEN_INT (-INTVAL (value));
12865 code = PLUS;
12867 /* Fall through. */
12869 default:
12870 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12871 emit_insn (gen_rtx_SET (new_out, x));
12872 break;
12875 aarch64_emit_store_exclusive (mode, cond, mem,
12876 gen_lowpart (mode, new_out), model_rtx);
12878 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12879 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12880 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12881 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12883 /* Emit any final barrier needed for a __sync operation. */
12884 if (is_sync)
12885 aarch64_emit_post_barrier (model);
12888 static void
12889 aarch64_init_libfuncs (void)
12891 /* Half-precision float operations. The compiler handles all operations
12892 with NULL libfuncs by converting to SFmode. */
12894 /* Conversions. */
12895 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12896 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12898 /* Arithmetic. */
12899 set_optab_libfunc (add_optab, HFmode, NULL);
12900 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12901 set_optab_libfunc (smul_optab, HFmode, NULL);
12902 set_optab_libfunc (neg_optab, HFmode, NULL);
12903 set_optab_libfunc (sub_optab, HFmode, NULL);
12905 /* Comparisons. */
12906 set_optab_libfunc (eq_optab, HFmode, NULL);
12907 set_optab_libfunc (ne_optab, HFmode, NULL);
12908 set_optab_libfunc (lt_optab, HFmode, NULL);
12909 set_optab_libfunc (le_optab, HFmode, NULL);
12910 set_optab_libfunc (ge_optab, HFmode, NULL);
12911 set_optab_libfunc (gt_optab, HFmode, NULL);
12912 set_optab_libfunc (unord_optab, HFmode, NULL);
12915 /* Target hook for c_mode_for_suffix. */
12916 static machine_mode
12917 aarch64_c_mode_for_suffix (char suffix)
12919 if (suffix == 'q')
12920 return TFmode;
12922 return VOIDmode;
12925 /* We can only represent floating point constants which will fit in
12926 "quarter-precision" values. These values are characterised by
12927 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12930 (-1)^s * (n/16) * 2^r
12932 Where:
12933 's' is the sign bit.
12934 'n' is an integer in the range 16 <= n <= 31.
12935 'r' is an integer in the range -3 <= r <= 4. */
12937 /* Return true iff X can be represented by a quarter-precision
12938 floating point immediate operand X. Note, we cannot represent 0.0. */
12939 bool
12940 aarch64_float_const_representable_p (rtx x)
12942 /* This represents our current view of how many bits
12943 make up the mantissa. */
12944 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12945 int exponent;
12946 unsigned HOST_WIDE_INT mantissa, mask;
12947 REAL_VALUE_TYPE r, m;
12948 bool fail;
12950 if (!CONST_DOUBLE_P (x))
12951 return false;
12953 /* We don't support HFmode constants yet. */
12954 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12955 return false;
12957 r = *CONST_DOUBLE_REAL_VALUE (x);
12959 /* We cannot represent infinities, NaNs or +/-zero. We won't
12960 know if we have +zero until we analyse the mantissa, but we
12961 can reject the other invalid values. */
12962 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12963 || REAL_VALUE_MINUS_ZERO (r))
12964 return false;
12966 /* Extract exponent. */
12967 r = real_value_abs (&r);
12968 exponent = REAL_EXP (&r);
12970 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12971 highest (sign) bit, with a fixed binary point at bit point_pos.
12972 m1 holds the low part of the mantissa, m2 the high part.
12973 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12974 bits for the mantissa, this can fail (low bits will be lost). */
12975 real_ldexp (&m, &r, point_pos - exponent);
12976 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12978 /* If the low part of the mantissa has bits set we cannot represent
12979 the value. */
12980 if (w.ulow () != 0)
12981 return false;
12982 /* We have rejected the lower HOST_WIDE_INT, so update our
12983 understanding of how many bits lie in the mantissa and
12984 look only at the high HOST_WIDE_INT. */
12985 mantissa = w.elt (1);
12986 point_pos -= HOST_BITS_PER_WIDE_INT;
12988 /* We can only represent values with a mantissa of the form 1.xxxx. */
12989 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12990 if ((mantissa & mask) != 0)
12991 return false;
12993 /* Having filtered unrepresentable values, we may now remove all
12994 but the highest 5 bits. */
12995 mantissa >>= point_pos - 5;
12997 /* We cannot represent the value 0.0, so reject it. This is handled
12998 elsewhere. */
12999 if (mantissa == 0)
13000 return false;
13002 /* Then, as bit 4 is always set, we can mask it off, leaving
13003 the mantissa in the range [0, 15]. */
13004 mantissa &= ~(1 << 4);
13005 gcc_assert (mantissa <= 15);
13007 /* GCC internally does not use IEEE754-like encoding (where normalized
13008 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
13009 Our mantissa values are shifted 4 places to the left relative to
13010 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
13011 by 5 places to correct for GCC's representation. */
13012 exponent = 5 - exponent;
13014 return (exponent >= 0 && exponent <= 7);
13017 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
13018 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
13019 output MOVI/MVNI, ORR or BIC immediate. */
13020 char*
13021 aarch64_output_simd_mov_immediate (rtx const_vector,
13022 machine_mode mode,
13023 unsigned width,
13024 enum simd_immediate_check which)
13026 bool is_valid;
13027 static char templ[40];
13028 const char *mnemonic;
13029 const char *shift_op;
13030 unsigned int lane_count = 0;
13031 char element_char;
13033 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13035 /* This will return true to show const_vector is legal for use as either
13036 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
13037 It will also update INFO to show how the immediate should be generated.
13038 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
13039 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false,
13040 &info, which);
13041 gcc_assert (is_valid);
13043 element_char = sizetochar (info.element_width);
13044 lane_count = width / info.element_width;
13046 mode = GET_MODE_INNER (mode);
13047 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13049 gcc_assert (info.shift == 0 && ! info.mvn);
13050 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13051 move immediate path. */
13052 if (aarch64_float_const_zero_rtx_p (info.value))
13053 info.value = GEN_INT (0);
13054 else
13056 const unsigned int buf_size = 20;
13057 char float_buf[buf_size] = {'\0'};
13058 real_to_decimal_for_mode (float_buf,
13059 CONST_DOUBLE_REAL_VALUE (info.value),
13060 buf_size, buf_size, 1, mode);
13062 if (lane_count == 1)
13063 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13064 else
13065 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13066 lane_count, element_char, float_buf);
13067 return templ;
13071 gcc_assert (CONST_INT_P (info.value));
13073 if (which == AARCH64_CHECK_MOV)
13075 mnemonic = info.mvn ? "mvni" : "movi";
13076 shift_op = info.msl ? "msl" : "lsl";
13077 if (lane_count == 1)
13078 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13079 mnemonic, UINTVAL (info.value));
13080 else if (info.shift)
13081 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13082 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
13083 element_char, UINTVAL (info.value), shift_op, info.shift);
13084 else
13085 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13086 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
13087 element_char, UINTVAL (info.value));
13089 else
13091 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
13092 mnemonic = info.mvn ? "bic" : "orr";
13093 if (info.shift)
13094 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13095 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
13096 element_char, UINTVAL (info.value), "lsl", info.shift);
13097 else
13098 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13099 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
13100 element_char, UINTVAL (info.value));
13102 return templ;
13105 char*
13106 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13109 /* If a floating point number was passed and we desire to use it in an
13110 integer mode do the conversion to integer. */
13111 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13113 unsigned HOST_WIDE_INT ival;
13114 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13115 gcc_unreachable ();
13116 immediate = gen_int_mode (ival, mode);
13119 machine_mode vmode;
13120 /* use a 64 bit mode for everything except for DI/DF mode, where we use
13121 a 128 bit vector mode. */
13122 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13124 vmode = aarch64_simd_container_mode (mode, width);
13125 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13126 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13129 /* Split operands into moves from op[1] + op[2] into op[0]. */
13131 void
13132 aarch64_split_combinev16qi (rtx operands[3])
13134 unsigned int dest = REGNO (operands[0]);
13135 unsigned int src1 = REGNO (operands[1]);
13136 unsigned int src2 = REGNO (operands[2]);
13137 machine_mode halfmode = GET_MODE (operands[1]);
13138 unsigned int halfregs = REG_NREGS (operands[1]);
13139 rtx destlo, desthi;
13141 gcc_assert (halfmode == V16QImode);
13143 if (src1 == dest && src2 == dest + halfregs)
13145 /* No-op move. Can't split to nothing; emit something. */
13146 emit_note (NOTE_INSN_DELETED);
13147 return;
13150 /* Preserve register attributes for variable tracking. */
13151 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13152 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13153 GET_MODE_SIZE (halfmode));
13155 /* Special case of reversed high/low parts. */
13156 if (reg_overlap_mentioned_p (operands[2], destlo)
13157 && reg_overlap_mentioned_p (operands[1], desthi))
13159 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13160 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13161 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13163 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13165 /* Try to avoid unnecessary moves if part of the result
13166 is in the right place already. */
13167 if (src1 != dest)
13168 emit_move_insn (destlo, operands[1]);
13169 if (src2 != dest + halfregs)
13170 emit_move_insn (desthi, operands[2]);
13172 else
13174 if (src2 != dest + halfregs)
13175 emit_move_insn (desthi, operands[2]);
13176 if (src1 != dest)
13177 emit_move_insn (destlo, operands[1]);
13181 /* vec_perm support. */
13183 #define MAX_VECT_LEN 16
13185 struct expand_vec_perm_d
13187 rtx target, op0, op1;
13188 auto_vec_perm_indices perm;
13189 machine_mode vmode;
13190 bool one_vector_p;
13191 bool testing_p;
13194 /* Generate a variable permutation. */
13196 static void
13197 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13199 machine_mode vmode = GET_MODE (target);
13200 bool one_vector_p = rtx_equal_p (op0, op1);
13202 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13203 gcc_checking_assert (GET_MODE (op0) == vmode);
13204 gcc_checking_assert (GET_MODE (op1) == vmode);
13205 gcc_checking_assert (GET_MODE (sel) == vmode);
13206 gcc_checking_assert (TARGET_SIMD);
13208 if (one_vector_p)
13210 if (vmode == V8QImode)
13212 /* Expand the argument to a V16QI mode by duplicating it. */
13213 rtx pair = gen_reg_rtx (V16QImode);
13214 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13215 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13217 else
13219 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13222 else
13224 rtx pair;
13226 if (vmode == V8QImode)
13228 pair = gen_reg_rtx (V16QImode);
13229 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13230 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13232 else
13234 pair = gen_reg_rtx (OImode);
13235 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13236 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13241 void
13242 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13244 machine_mode vmode = GET_MODE (target);
13245 unsigned int nelt = GET_MODE_NUNITS (vmode);
13246 bool one_vector_p = rtx_equal_p (op0, op1);
13247 rtx mask;
13249 /* The TBL instruction does not use a modulo index, so we must take care
13250 of that ourselves. */
13251 mask = aarch64_simd_gen_const_vector_dup (vmode,
13252 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13253 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13255 /* For big-endian, we also need to reverse the index within the vector
13256 (but not which vector). */
13257 if (BYTES_BIG_ENDIAN)
13259 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13260 if (!one_vector_p)
13261 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13262 sel = expand_simple_binop (vmode, XOR, sel, mask,
13263 NULL, 0, OPTAB_LIB_WIDEN);
13265 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13268 /* Recognize patterns suitable for the TRN instructions. */
13269 static bool
13270 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13272 unsigned int i, odd, mask, nelt = d->perm.length ();
13273 rtx out, in0, in1, x;
13274 rtx (*gen) (rtx, rtx, rtx);
13275 machine_mode vmode = d->vmode;
13277 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13278 return false;
13280 /* Note that these are little-endian tests.
13281 We correct for big-endian later. */
13282 if (d->perm[0] == 0)
13283 odd = 0;
13284 else if (d->perm[0] == 1)
13285 odd = 1;
13286 else
13287 return false;
13288 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13290 for (i = 0; i < nelt; i += 2)
13292 if (d->perm[i] != i + odd)
13293 return false;
13294 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13295 return false;
13298 /* Success! */
13299 if (d->testing_p)
13300 return true;
13302 in0 = d->op0;
13303 in1 = d->op1;
13304 if (BYTES_BIG_ENDIAN)
13306 x = in0, in0 = in1, in1 = x;
13307 odd = !odd;
13309 out = d->target;
13311 if (odd)
13313 switch (vmode)
13315 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13316 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13317 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13318 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13319 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13320 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13321 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13322 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13323 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13324 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13325 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13326 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13327 default:
13328 return false;
13331 else
13333 switch (vmode)
13335 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13336 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13337 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13338 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13339 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13340 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13341 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13342 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13343 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13344 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13345 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13346 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13347 default:
13348 return false;
13352 emit_insn (gen (out, in0, in1));
13353 return true;
13356 /* Recognize patterns suitable for the UZP instructions. */
13357 static bool
13358 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13360 unsigned int i, odd, mask, nelt = d->perm.length ();
13361 rtx out, in0, in1, x;
13362 rtx (*gen) (rtx, rtx, rtx);
13363 machine_mode vmode = d->vmode;
13365 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13366 return false;
13368 /* Note that these are little-endian tests.
13369 We correct for big-endian later. */
13370 if (d->perm[0] == 0)
13371 odd = 0;
13372 else if (d->perm[0] == 1)
13373 odd = 1;
13374 else
13375 return false;
13376 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13378 for (i = 0; i < nelt; i++)
13380 unsigned elt = (i * 2 + odd) & mask;
13381 if (d->perm[i] != elt)
13382 return false;
13385 /* Success! */
13386 if (d->testing_p)
13387 return true;
13389 in0 = d->op0;
13390 in1 = d->op1;
13391 if (BYTES_BIG_ENDIAN)
13393 x = in0, in0 = in1, in1 = x;
13394 odd = !odd;
13396 out = d->target;
13398 if (odd)
13400 switch (vmode)
13402 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13403 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13404 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13405 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13406 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13407 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13408 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13409 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13410 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13411 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13412 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13413 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13414 default:
13415 return false;
13418 else
13420 switch (vmode)
13422 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13423 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13424 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13425 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13426 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13427 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13428 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13429 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13430 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13431 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13432 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13433 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13434 default:
13435 return false;
13439 emit_insn (gen (out, in0, in1));
13440 return true;
13443 /* Recognize patterns suitable for the ZIP instructions. */
13444 static bool
13445 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13447 unsigned int i, high, mask, nelt = d->perm.length ();
13448 rtx out, in0, in1, x;
13449 rtx (*gen) (rtx, rtx, rtx);
13450 machine_mode vmode = d->vmode;
13452 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13453 return false;
13455 /* Note that these are little-endian tests.
13456 We correct for big-endian later. */
13457 high = nelt / 2;
13458 if (d->perm[0] == high)
13459 /* Do Nothing. */
13461 else if (d->perm[0] == 0)
13462 high = 0;
13463 else
13464 return false;
13465 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13467 for (i = 0; i < nelt / 2; i++)
13469 unsigned elt = (i + high) & mask;
13470 if (d->perm[i * 2] != elt)
13471 return false;
13472 elt = (elt + nelt) & mask;
13473 if (d->perm[i * 2 + 1] != elt)
13474 return false;
13477 /* Success! */
13478 if (d->testing_p)
13479 return true;
13481 in0 = d->op0;
13482 in1 = d->op1;
13483 if (BYTES_BIG_ENDIAN)
13485 x = in0, in0 = in1, in1 = x;
13486 high = !high;
13488 out = d->target;
13490 if (high)
13492 switch (vmode)
13494 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13495 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13496 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13497 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13498 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13499 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13500 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13501 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13502 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13503 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13504 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13505 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13506 default:
13507 return false;
13510 else
13512 switch (vmode)
13514 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13515 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13516 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13517 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13518 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13519 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13520 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13521 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13522 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13523 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13524 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13525 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13526 default:
13527 return false;
13531 emit_insn (gen (out, in0, in1));
13532 return true;
13535 /* Recognize patterns for the EXT insn. */
13537 static bool
13538 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13540 unsigned int i, nelt = d->perm.length ();
13541 rtx (*gen) (rtx, rtx, rtx, rtx);
13542 rtx offset;
13544 unsigned int location = d->perm[0]; /* Always < nelt. */
13546 /* Check if the extracted indices are increasing by one. */
13547 for (i = 1; i < nelt; i++)
13549 unsigned int required = location + i;
13550 if (d->one_vector_p)
13552 /* We'll pass the same vector in twice, so allow indices to wrap. */
13553 required &= (nelt - 1);
13555 if (d->perm[i] != required)
13556 return false;
13559 switch (d->vmode)
13561 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13562 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13563 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13564 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13565 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13566 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13567 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13568 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13569 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13570 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13571 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13572 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13573 default:
13574 return false;
13577 /* Success! */
13578 if (d->testing_p)
13579 return true;
13581 /* The case where (location == 0) is a no-op for both big- and little-endian,
13582 and is removed by the mid-end at optimization levels -O1 and higher. */
13584 if (BYTES_BIG_ENDIAN && (location != 0))
13586 /* After setup, we want the high elements of the first vector (stored
13587 at the LSB end of the register), and the low elements of the second
13588 vector (stored at the MSB end of the register). So swap. */
13589 std::swap (d->op0, d->op1);
13590 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13591 location = nelt - location;
13594 offset = GEN_INT (location);
13595 emit_insn (gen (d->target, d->op0, d->op1, offset));
13596 return true;
13599 /* Recognize patterns for the REV insns. */
13601 static bool
13602 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13604 unsigned int i, j, diff, nelt = d->perm.length ();
13605 rtx (*gen) (rtx, rtx);
13607 if (!d->one_vector_p)
13608 return false;
13610 diff = d->perm[0];
13611 switch (diff)
13613 case 7:
13614 switch (d->vmode)
13616 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13617 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13618 default:
13619 return false;
13621 break;
13622 case 3:
13623 switch (d->vmode)
13625 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13626 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13627 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13628 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13629 default:
13630 return false;
13632 break;
13633 case 1:
13634 switch (d->vmode)
13636 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13637 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13638 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13639 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13640 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13641 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13642 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13643 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13644 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13645 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13646 default:
13647 return false;
13649 break;
13650 default:
13651 return false;
13654 for (i = 0; i < nelt ; i += diff + 1)
13655 for (j = 0; j <= diff; j += 1)
13657 /* This is guaranteed to be true as the value of diff
13658 is 7, 3, 1 and we should have enough elements in the
13659 queue to generate this. Getting a vector mask with a
13660 value of diff other than these values implies that
13661 something is wrong by the time we get here. */
13662 gcc_assert (i + j < nelt);
13663 if (d->perm[i + j] != i + diff - j)
13664 return false;
13667 /* Success! */
13668 if (d->testing_p)
13669 return true;
13671 emit_insn (gen (d->target, d->op0));
13672 return true;
13675 static bool
13676 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13678 rtx (*gen) (rtx, rtx, rtx);
13679 rtx out = d->target;
13680 rtx in0;
13681 machine_mode vmode = d->vmode;
13682 unsigned int i, elt, nelt = d->perm.length ();
13683 rtx lane;
13685 elt = d->perm[0];
13686 for (i = 1; i < nelt; i++)
13688 if (elt != d->perm[i])
13689 return false;
13692 /* The generic preparation in aarch64_expand_vec_perm_const_1
13693 swaps the operand order and the permute indices if it finds
13694 d->perm[0] to be in the second operand. Thus, we can always
13695 use d->op0 and need not do any extra arithmetic to get the
13696 correct lane number. */
13697 in0 = d->op0;
13698 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13700 switch (vmode)
13702 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13703 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13704 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13705 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13706 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13707 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13708 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13709 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13710 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13711 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13712 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13713 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13714 default:
13715 return false;
13718 emit_insn (gen (out, in0, lane));
13719 return true;
13722 static bool
13723 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13725 rtx rperm[MAX_VECT_LEN], sel;
13726 machine_mode vmode = d->vmode;
13727 unsigned int i, nelt = d->perm.length ();
13729 if (d->testing_p)
13730 return true;
13732 /* Generic code will try constant permutation twice. Once with the
13733 original mode and again with the elements lowered to QImode.
13734 So wait and don't do the selector expansion ourselves. */
13735 if (vmode != V8QImode && vmode != V16QImode)
13736 return false;
13738 for (i = 0; i < nelt; ++i)
13740 int nunits = GET_MODE_NUNITS (vmode);
13742 /* If big-endian and two vectors we end up with a weird mixed-endian
13743 mode on NEON. Reverse the index within each word but not the word
13744 itself. */
13745 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13746 : d->perm[i]);
13748 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13749 sel = force_reg (vmode, sel);
13751 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13752 return true;
13755 static bool
13756 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13758 /* The pattern matching functions above are written to look for a small
13759 number to begin the sequence (0, 1, N/2). If we begin with an index
13760 from the second operand, we can swap the operands. */
13761 unsigned int nelt = d->perm.length ();
13762 if (d->perm[0] >= nelt)
13764 gcc_assert (nelt == (nelt & -nelt));
13765 for (unsigned int i = 0; i < nelt; ++i)
13766 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13768 std::swap (d->op0, d->op1);
13771 if (TARGET_SIMD)
13773 if (aarch64_evpc_rev (d))
13774 return true;
13775 else if (aarch64_evpc_ext (d))
13776 return true;
13777 else if (aarch64_evpc_dup (d))
13778 return true;
13779 else if (aarch64_evpc_zip (d))
13780 return true;
13781 else if (aarch64_evpc_uzp (d))
13782 return true;
13783 else if (aarch64_evpc_trn (d))
13784 return true;
13785 return aarch64_evpc_tbl (d);
13787 return false;
13790 /* Expand a vec_perm_const pattern. */
13792 bool
13793 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13795 struct expand_vec_perm_d d;
13796 int i, nelt, which;
13798 d.target = target;
13799 d.op0 = op0;
13800 d.op1 = op1;
13802 d.vmode = GET_MODE (target);
13803 gcc_assert (VECTOR_MODE_P (d.vmode));
13804 d.testing_p = false;
13806 nelt = GET_MODE_NUNITS (d.vmode);
13807 d.perm.reserve (nelt);
13808 for (i = which = 0; i < nelt; ++i)
13810 rtx e = XVECEXP (sel, 0, i);
13811 int ei = INTVAL (e) & (2 * nelt - 1);
13812 which |= (ei < nelt ? 1 : 2);
13813 d.perm.quick_push (ei);
13816 switch (which)
13818 default:
13819 gcc_unreachable ();
13821 case 3:
13822 d.one_vector_p = false;
13823 if (!rtx_equal_p (op0, op1))
13824 break;
13826 /* The elements of PERM do not suggest that only the first operand
13827 is used, but both operands are identical. Allow easier matching
13828 of the permutation by folding the permutation into the single
13829 input vector. */
13830 /* Fall Through. */
13831 case 2:
13832 for (i = 0; i < nelt; ++i)
13833 d.perm[i] &= nelt - 1;
13834 d.op0 = op1;
13835 d.one_vector_p = true;
13836 break;
13838 case 1:
13839 d.op1 = op0;
13840 d.one_vector_p = true;
13841 break;
13844 return aarch64_expand_vec_perm_const_1 (&d);
13847 static bool
13848 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
13850 struct expand_vec_perm_d d;
13851 unsigned int i, nelt, which;
13852 bool ret;
13854 d.vmode = vmode;
13855 d.testing_p = true;
13856 d.perm.safe_splice (sel);
13858 /* Calculate whether all elements are in one vector. */
13859 nelt = sel.length ();
13860 for (i = which = 0; i < nelt; ++i)
13862 unsigned int e = d.perm[i];
13863 gcc_assert (e < 2 * nelt);
13864 which |= (e < nelt ? 1 : 2);
13867 /* If all elements are from the second vector, reindex as if from the
13868 first vector. */
13869 if (which == 2)
13870 for (i = 0; i < nelt; ++i)
13871 d.perm[i] -= nelt;
13873 /* Check whether the mask can be applied to a single vector. */
13874 d.one_vector_p = (which != 3);
13876 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13877 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13878 if (!d.one_vector_p)
13879 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13881 start_sequence ();
13882 ret = aarch64_expand_vec_perm_const_1 (&d);
13883 end_sequence ();
13885 return ret;
13889 aarch64_reverse_mask (machine_mode mode)
13891 /* We have to reverse each vector because we dont have
13892 a permuted load that can reverse-load according to ABI rules. */
13893 rtx mask;
13894 rtvec v = rtvec_alloc (16);
13895 int i, j;
13896 int nunits = GET_MODE_NUNITS (mode);
13897 int usize = GET_MODE_UNIT_SIZE (mode);
13899 gcc_assert (BYTES_BIG_ENDIAN);
13900 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13902 for (i = 0; i < nunits; i++)
13903 for (j = 0; j < usize; j++)
13904 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13905 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13906 return force_reg (V16QImode, mask);
13909 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13910 true. However due to issues with register allocation it is preferable
13911 to avoid tieing integer scalar and FP scalar modes. Executing integer
13912 operations in general registers is better than treating them as scalar
13913 vector operations. This reduces latency and avoids redundant int<->FP
13914 moves. So tie modes if they are either the same class, or vector modes
13915 with other vector modes, vector structs or any scalar mode. */
13917 static bool
13918 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13920 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13921 return true;
13923 /* We specifically want to allow elements of "structure" modes to
13924 be tieable to the structure. This more general condition allows
13925 other rarer situations too. */
13926 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13927 return true;
13929 /* Also allow any scalar modes with vectors. */
13930 if (aarch64_vector_mode_supported_p (mode1)
13931 || aarch64_vector_mode_supported_p (mode2))
13932 return true;
13934 return false;
13937 /* Return a new RTX holding the result of moving POINTER forward by
13938 AMOUNT bytes. */
13940 static rtx
13941 aarch64_move_pointer (rtx pointer, int amount)
13943 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13945 return adjust_automodify_address (pointer, GET_MODE (pointer),
13946 next, amount);
13949 /* Return a new RTX holding the result of moving POINTER forward by the
13950 size of the mode it points to. */
13952 static rtx
13953 aarch64_progress_pointer (rtx pointer)
13955 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13957 return aarch64_move_pointer (pointer, amount);
13960 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13961 MODE bytes. */
13963 static void
13964 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13965 machine_mode mode)
13967 rtx reg = gen_reg_rtx (mode);
13969 /* "Cast" the pointers to the correct mode. */
13970 *src = adjust_address (*src, mode, 0);
13971 *dst = adjust_address (*dst, mode, 0);
13972 /* Emit the memcpy. */
13973 emit_move_insn (reg, *src);
13974 emit_move_insn (*dst, reg);
13975 /* Move the pointers forward. */
13976 *src = aarch64_progress_pointer (*src);
13977 *dst = aarch64_progress_pointer (*dst);
13980 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13981 we succeed, otherwise return false. */
13983 bool
13984 aarch64_expand_movmem (rtx *operands)
13986 unsigned int n;
13987 rtx dst = operands[0];
13988 rtx src = operands[1];
13989 rtx base;
13990 bool speed_p = !optimize_function_for_size_p (cfun);
13992 /* When optimizing for size, give a better estimate of the length of a
13993 memcpy call, but use the default otherwise. */
13994 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13996 /* We can't do anything smart if the amount to copy is not constant. */
13997 if (!CONST_INT_P (operands[2]))
13998 return false;
14000 n = UINTVAL (operands[2]);
14002 /* Try to keep the number of instructions low. For cases below 16 bytes we
14003 need to make at most two moves. For cases above 16 bytes it will be one
14004 move for each 16 byte chunk, then at most two additional moves. */
14005 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
14006 return false;
14008 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
14009 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
14011 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
14012 src = adjust_automodify_address (src, VOIDmode, base, 0);
14014 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
14015 1-byte chunk. */
14016 if (n < 4)
14018 if (n >= 2)
14020 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14021 n -= 2;
14024 if (n == 1)
14025 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14027 return true;
14030 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
14031 4-byte chunk, partially overlapping with the previously copied chunk. */
14032 if (n < 8)
14034 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14035 n -= 4;
14036 if (n > 0)
14038 int move = n - 4;
14040 src = aarch64_move_pointer (src, move);
14041 dst = aarch64_move_pointer (dst, move);
14042 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14044 return true;
14047 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14048 them, then (if applicable) an 8-byte chunk. */
14049 while (n >= 8)
14051 if (n / 16)
14053 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14054 n -= 16;
14056 else
14058 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14059 n -= 8;
14063 /* Finish the final bytes of the copy. We can always do this in one
14064 instruction. We either copy the exact amount we need, or partially
14065 overlap with the previous chunk we copied and copy 8-bytes. */
14066 if (n == 0)
14067 return true;
14068 else if (n == 1)
14069 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14070 else if (n == 2)
14071 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14072 else if (n == 4)
14073 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14074 else
14076 if (n == 3)
14078 src = aarch64_move_pointer (src, -1);
14079 dst = aarch64_move_pointer (dst, -1);
14080 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14082 else
14084 int move = n - 8;
14086 src = aarch64_move_pointer (src, move);
14087 dst = aarch64_move_pointer (dst, move);
14088 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14092 return true;
14095 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14096 SImode stores. Handle the case when the constant has identical
14097 bottom and top halves. This is beneficial when the two stores can be
14098 merged into an STP and we avoid synthesising potentially expensive
14099 immediates twice. Return true if such a split is possible. */
14101 bool
14102 aarch64_split_dimode_const_store (rtx dst, rtx src)
14104 rtx lo = gen_lowpart (SImode, src);
14105 rtx hi = gen_highpart_mode (SImode, DImode, src);
14107 bool size_p = optimize_function_for_size_p (cfun);
14109 if (!rtx_equal_p (lo, hi))
14110 return false;
14112 unsigned int orig_cost
14113 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14114 unsigned int lo_cost
14115 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14117 /* We want to transform:
14118 MOV x1, 49370
14119 MOVK x1, 0x140, lsl 16
14120 MOVK x1, 0xc0da, lsl 32
14121 MOVK x1, 0x140, lsl 48
14122 STR x1, [x0]
14123 into:
14124 MOV w1, 49370
14125 MOVK w1, 0x140, lsl 16
14126 STP w1, w1, [x0]
14127 So we want to perform this only when we save two instructions
14128 or more. When optimizing for size, however, accept any code size
14129 savings we can. */
14130 if (size_p && orig_cost <= lo_cost)
14131 return false;
14133 if (!size_p
14134 && (orig_cost <= lo_cost + 1))
14135 return false;
14137 rtx mem_lo = adjust_address (dst, SImode, 0);
14138 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14139 return false;
14141 rtx tmp_reg = gen_reg_rtx (SImode);
14142 aarch64_expand_mov_immediate (tmp_reg, lo);
14143 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14144 /* Don't emit an explicit store pair as this may not be always profitable.
14145 Let the sched-fusion logic decide whether to merge them. */
14146 emit_move_insn (mem_lo, tmp_reg);
14147 emit_move_insn (mem_hi, tmp_reg);
14149 return true;
14152 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14154 static unsigned HOST_WIDE_INT
14155 aarch64_asan_shadow_offset (void)
14157 return (HOST_WIDE_INT_1 << 36);
14160 static bool
14161 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14162 unsigned int align,
14163 enum by_pieces_operation op,
14164 bool speed_p)
14166 /* STORE_BY_PIECES can be used when copying a constant string, but
14167 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14168 For now we always fail this and let the move_by_pieces code copy
14169 the string from read-only memory. */
14170 if (op == STORE_BY_PIECES)
14171 return false;
14173 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14176 static rtx
14177 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14178 int code, tree treeop0, tree treeop1)
14180 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14181 rtx op0, op1;
14182 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14183 insn_code icode;
14184 struct expand_operand ops[4];
14186 start_sequence ();
14187 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14189 op_mode = GET_MODE (op0);
14190 if (op_mode == VOIDmode)
14191 op_mode = GET_MODE (op1);
14193 switch (op_mode)
14195 case E_QImode:
14196 case E_HImode:
14197 case E_SImode:
14198 cmp_mode = SImode;
14199 icode = CODE_FOR_cmpsi;
14200 break;
14202 case E_DImode:
14203 cmp_mode = DImode;
14204 icode = CODE_FOR_cmpdi;
14205 break;
14207 case E_SFmode:
14208 cmp_mode = SFmode;
14209 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14210 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14211 break;
14213 case E_DFmode:
14214 cmp_mode = DFmode;
14215 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14216 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14217 break;
14219 default:
14220 end_sequence ();
14221 return NULL_RTX;
14224 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14225 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14226 if (!op0 || !op1)
14228 end_sequence ();
14229 return NULL_RTX;
14231 *prep_seq = get_insns ();
14232 end_sequence ();
14234 create_fixed_operand (&ops[0], op0);
14235 create_fixed_operand (&ops[1], op1);
14237 start_sequence ();
14238 if (!maybe_expand_insn (icode, 2, ops))
14240 end_sequence ();
14241 return NULL_RTX;
14243 *gen_seq = get_insns ();
14244 end_sequence ();
14246 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14247 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14250 static rtx
14251 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14252 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14254 rtx op0, op1, target;
14255 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14256 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14257 insn_code icode;
14258 struct expand_operand ops[6];
14259 int aarch64_cond;
14261 push_to_sequence (*prep_seq);
14262 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14264 op_mode = GET_MODE (op0);
14265 if (op_mode == VOIDmode)
14266 op_mode = GET_MODE (op1);
14268 switch (op_mode)
14270 case E_QImode:
14271 case E_HImode:
14272 case E_SImode:
14273 cmp_mode = SImode;
14274 icode = CODE_FOR_ccmpsi;
14275 break;
14277 case E_DImode:
14278 cmp_mode = DImode;
14279 icode = CODE_FOR_ccmpdi;
14280 break;
14282 case E_SFmode:
14283 cmp_mode = SFmode;
14284 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14285 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14286 break;
14288 case E_DFmode:
14289 cmp_mode = DFmode;
14290 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14291 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14292 break;
14294 default:
14295 end_sequence ();
14296 return NULL_RTX;
14299 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14300 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14301 if (!op0 || !op1)
14303 end_sequence ();
14304 return NULL_RTX;
14306 *prep_seq = get_insns ();
14307 end_sequence ();
14309 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14310 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14312 if (bit_code != AND)
14314 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14315 GET_MODE (XEXP (prev, 0))),
14316 VOIDmode, XEXP (prev, 0), const0_rtx);
14317 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14320 create_fixed_operand (&ops[0], XEXP (prev, 0));
14321 create_fixed_operand (&ops[1], target);
14322 create_fixed_operand (&ops[2], op0);
14323 create_fixed_operand (&ops[3], op1);
14324 create_fixed_operand (&ops[4], prev);
14325 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14327 push_to_sequence (*gen_seq);
14328 if (!maybe_expand_insn (icode, 6, ops))
14330 end_sequence ();
14331 return NULL_RTX;
14334 *gen_seq = get_insns ();
14335 end_sequence ();
14337 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14340 #undef TARGET_GEN_CCMP_FIRST
14341 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14343 #undef TARGET_GEN_CCMP_NEXT
14344 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14346 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14347 instruction fusion of some sort. */
14349 static bool
14350 aarch64_macro_fusion_p (void)
14352 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14356 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14357 should be kept together during scheduling. */
14359 static bool
14360 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14362 rtx set_dest;
14363 rtx prev_set = single_set (prev);
14364 rtx curr_set = single_set (curr);
14365 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14366 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14368 if (!aarch64_macro_fusion_p ())
14369 return false;
14371 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14373 /* We are trying to match:
14374 prev (mov) == (set (reg r0) (const_int imm16))
14375 curr (movk) == (set (zero_extract (reg r0)
14376 (const_int 16)
14377 (const_int 16))
14378 (const_int imm16_1)) */
14380 set_dest = SET_DEST (curr_set);
14382 if (GET_CODE (set_dest) == ZERO_EXTRACT
14383 && CONST_INT_P (SET_SRC (curr_set))
14384 && CONST_INT_P (SET_SRC (prev_set))
14385 && CONST_INT_P (XEXP (set_dest, 2))
14386 && INTVAL (XEXP (set_dest, 2)) == 16
14387 && REG_P (XEXP (set_dest, 0))
14388 && REG_P (SET_DEST (prev_set))
14389 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14391 return true;
14395 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14398 /* We're trying to match:
14399 prev (adrp) == (set (reg r1)
14400 (high (symbol_ref ("SYM"))))
14401 curr (add) == (set (reg r0)
14402 (lo_sum (reg r1)
14403 (symbol_ref ("SYM"))))
14404 Note that r0 need not necessarily be the same as r1, especially
14405 during pre-regalloc scheduling. */
14407 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14408 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14410 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14411 && REG_P (XEXP (SET_SRC (curr_set), 0))
14412 && REGNO (XEXP (SET_SRC (curr_set), 0))
14413 == REGNO (SET_DEST (prev_set))
14414 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14415 XEXP (SET_SRC (curr_set), 1)))
14416 return true;
14420 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14423 /* We're trying to match:
14424 prev (movk) == (set (zero_extract (reg r0)
14425 (const_int 16)
14426 (const_int 32))
14427 (const_int imm16_1))
14428 curr (movk) == (set (zero_extract (reg r0)
14429 (const_int 16)
14430 (const_int 48))
14431 (const_int imm16_2)) */
14433 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14434 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14435 && REG_P (XEXP (SET_DEST (prev_set), 0))
14436 && REG_P (XEXP (SET_DEST (curr_set), 0))
14437 && REGNO (XEXP (SET_DEST (prev_set), 0))
14438 == REGNO (XEXP (SET_DEST (curr_set), 0))
14439 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14440 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14441 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14442 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14443 && CONST_INT_P (SET_SRC (prev_set))
14444 && CONST_INT_P (SET_SRC (curr_set)))
14445 return true;
14448 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14450 /* We're trying to match:
14451 prev (adrp) == (set (reg r0)
14452 (high (symbol_ref ("SYM"))))
14453 curr (ldr) == (set (reg r1)
14454 (mem (lo_sum (reg r0)
14455 (symbol_ref ("SYM")))))
14457 curr (ldr) == (set (reg r1)
14458 (zero_extend (mem
14459 (lo_sum (reg r0)
14460 (symbol_ref ("SYM")))))) */
14461 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14462 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14464 rtx curr_src = SET_SRC (curr_set);
14466 if (GET_CODE (curr_src) == ZERO_EXTEND)
14467 curr_src = XEXP (curr_src, 0);
14469 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14470 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14471 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14472 == REGNO (SET_DEST (prev_set))
14473 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14474 XEXP (SET_SRC (prev_set), 0)))
14475 return true;
14479 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14480 && aarch_crypto_can_dual_issue (prev, curr))
14481 return true;
14483 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14484 && any_condjump_p (curr))
14486 enum attr_type prev_type = get_attr_type (prev);
14488 unsigned int condreg1, condreg2;
14489 rtx cc_reg_1;
14490 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14491 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14493 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14494 && prev
14495 && modified_in_p (cc_reg_1, prev))
14497 /* FIXME: this misses some which is considered simple arthematic
14498 instructions for ThunderX. Simple shifts are missed here. */
14499 if (prev_type == TYPE_ALUS_SREG
14500 || prev_type == TYPE_ALUS_IMM
14501 || prev_type == TYPE_LOGICS_REG
14502 || prev_type == TYPE_LOGICS_IMM)
14503 return true;
14507 if (prev_set
14508 && curr_set
14509 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14510 && any_condjump_p (curr))
14512 /* We're trying to match:
14513 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14514 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14515 (const_int 0))
14516 (label_ref ("SYM"))
14517 (pc)) */
14518 if (SET_DEST (curr_set) == (pc_rtx)
14519 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14520 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14521 && REG_P (SET_DEST (prev_set))
14522 && REGNO (SET_DEST (prev_set))
14523 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14525 /* Fuse ALU operations followed by conditional branch instruction. */
14526 switch (get_attr_type (prev))
14528 case TYPE_ALU_IMM:
14529 case TYPE_ALU_SREG:
14530 case TYPE_ADC_REG:
14531 case TYPE_ADC_IMM:
14532 case TYPE_ADCS_REG:
14533 case TYPE_ADCS_IMM:
14534 case TYPE_LOGIC_REG:
14535 case TYPE_LOGIC_IMM:
14536 case TYPE_CSEL:
14537 case TYPE_ADR:
14538 case TYPE_MOV_IMM:
14539 case TYPE_SHIFT_REG:
14540 case TYPE_SHIFT_IMM:
14541 case TYPE_BFM:
14542 case TYPE_RBIT:
14543 case TYPE_REV:
14544 case TYPE_EXTEND:
14545 return true;
14547 default:;
14552 return false;
14555 /* Return true iff the instruction fusion described by OP is enabled. */
14557 bool
14558 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14560 return (aarch64_tune_params.fusible_ops & op) != 0;
14563 /* If MEM is in the form of [base+offset], extract the two parts
14564 of address and set to BASE and OFFSET, otherwise return false
14565 after clearing BASE and OFFSET. */
14567 bool
14568 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14570 rtx addr;
14572 gcc_assert (MEM_P (mem));
14574 addr = XEXP (mem, 0);
14576 if (REG_P (addr))
14578 *base = addr;
14579 *offset = const0_rtx;
14580 return true;
14583 if (GET_CODE (addr) == PLUS
14584 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14586 *base = XEXP (addr, 0);
14587 *offset = XEXP (addr, 1);
14588 return true;
14591 *base = NULL_RTX;
14592 *offset = NULL_RTX;
14594 return false;
14597 /* Types for scheduling fusion. */
14598 enum sched_fusion_type
14600 SCHED_FUSION_NONE = 0,
14601 SCHED_FUSION_LD_SIGN_EXTEND,
14602 SCHED_FUSION_LD_ZERO_EXTEND,
14603 SCHED_FUSION_LD,
14604 SCHED_FUSION_ST,
14605 SCHED_FUSION_NUM
14608 /* If INSN is a load or store of address in the form of [base+offset],
14609 extract the two parts and set to BASE and OFFSET. Return scheduling
14610 fusion type this INSN is. */
14612 static enum sched_fusion_type
14613 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14615 rtx x, dest, src;
14616 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14618 gcc_assert (INSN_P (insn));
14619 x = PATTERN (insn);
14620 if (GET_CODE (x) != SET)
14621 return SCHED_FUSION_NONE;
14623 src = SET_SRC (x);
14624 dest = SET_DEST (x);
14626 machine_mode dest_mode = GET_MODE (dest);
14628 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14629 return SCHED_FUSION_NONE;
14631 if (GET_CODE (src) == SIGN_EXTEND)
14633 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14634 src = XEXP (src, 0);
14635 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14636 return SCHED_FUSION_NONE;
14638 else if (GET_CODE (src) == ZERO_EXTEND)
14640 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14641 src = XEXP (src, 0);
14642 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14643 return SCHED_FUSION_NONE;
14646 if (GET_CODE (src) == MEM && REG_P (dest))
14647 extract_base_offset_in_addr (src, base, offset);
14648 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14650 fusion = SCHED_FUSION_ST;
14651 extract_base_offset_in_addr (dest, base, offset);
14653 else
14654 return SCHED_FUSION_NONE;
14656 if (*base == NULL_RTX || *offset == NULL_RTX)
14657 fusion = SCHED_FUSION_NONE;
14659 return fusion;
14662 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14664 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14665 and PRI are only calculated for these instructions. For other instruction,
14666 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14667 type instruction fusion can be added by returning different priorities.
14669 It's important that irrelevant instructions get the largest FUSION_PRI. */
14671 static void
14672 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14673 int *fusion_pri, int *pri)
14675 int tmp, off_val;
14676 rtx base, offset;
14677 enum sched_fusion_type fusion;
14679 gcc_assert (INSN_P (insn));
14681 tmp = max_pri - 1;
14682 fusion = fusion_load_store (insn, &base, &offset);
14683 if (fusion == SCHED_FUSION_NONE)
14685 *pri = tmp;
14686 *fusion_pri = tmp;
14687 return;
14690 /* Set FUSION_PRI according to fusion type and base register. */
14691 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14693 /* Calculate PRI. */
14694 tmp /= 2;
14696 /* INSN with smaller offset goes first. */
14697 off_val = (int)(INTVAL (offset));
14698 if (off_val >= 0)
14699 tmp -= (off_val & 0xfffff);
14700 else
14701 tmp += ((- off_val) & 0xfffff);
14703 *pri = tmp;
14704 return;
14707 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14708 Adjust priority of sha1h instructions so they are scheduled before
14709 other SHA1 instructions. */
14711 static int
14712 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14714 rtx x = PATTERN (insn);
14716 if (GET_CODE (x) == SET)
14718 x = SET_SRC (x);
14720 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14721 return priority + 10;
14724 return priority;
14727 /* Given OPERANDS of consecutive load/store, check if we can merge
14728 them into ldp/stp. LOAD is true if they are load instructions.
14729 MODE is the mode of memory operands. */
14731 bool
14732 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14733 machine_mode mode)
14735 HOST_WIDE_INT offval_1, offval_2, msize;
14736 enum reg_class rclass_1, rclass_2;
14737 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14739 if (load)
14741 mem_1 = operands[1];
14742 mem_2 = operands[3];
14743 reg_1 = operands[0];
14744 reg_2 = operands[2];
14745 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14746 if (REGNO (reg_1) == REGNO (reg_2))
14747 return false;
14749 else
14751 mem_1 = operands[0];
14752 mem_2 = operands[2];
14753 reg_1 = operands[1];
14754 reg_2 = operands[3];
14757 /* The mems cannot be volatile. */
14758 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14759 return false;
14761 /* If we have SImode and slow unaligned ldp,
14762 check the alignment to be at least 8 byte. */
14763 if (mode == SImode
14764 && (aarch64_tune_params.extra_tuning_flags
14765 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14766 && !optimize_size
14767 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14768 return false;
14770 /* Check if the addresses are in the form of [base+offset]. */
14771 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14772 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14773 return false;
14774 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14775 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14776 return false;
14778 /* Check if the bases are same. */
14779 if (!rtx_equal_p (base_1, base_2))
14780 return false;
14782 offval_1 = INTVAL (offset_1);
14783 offval_2 = INTVAL (offset_2);
14784 msize = GET_MODE_SIZE (mode);
14785 /* Check if the offsets are consecutive. */
14786 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14787 return false;
14789 /* Check if the addresses are clobbered by load. */
14790 if (load)
14792 if (reg_mentioned_p (reg_1, mem_1))
14793 return false;
14795 /* In increasing order, the last load can clobber the address. */
14796 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14797 return false;
14800 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14801 rclass_1 = FP_REGS;
14802 else
14803 rclass_1 = GENERAL_REGS;
14805 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14806 rclass_2 = FP_REGS;
14807 else
14808 rclass_2 = GENERAL_REGS;
14810 /* Check if the registers are of same class. */
14811 if (rclass_1 != rclass_2)
14812 return false;
14814 return true;
14817 /* Given OPERANDS of consecutive load/store, check if we can merge
14818 them into ldp/stp by adjusting the offset. LOAD is true if they
14819 are load instructions. MODE is the mode of memory operands.
14821 Given below consecutive stores:
14823 str w1, [xb, 0x100]
14824 str w1, [xb, 0x104]
14825 str w1, [xb, 0x108]
14826 str w1, [xb, 0x10c]
14828 Though the offsets are out of the range supported by stp, we can
14829 still pair them after adjusting the offset, like:
14831 add scratch, xb, 0x100
14832 stp w1, w1, [scratch]
14833 stp w1, w1, [scratch, 0x8]
14835 The peephole patterns detecting this opportunity should guarantee
14836 the scratch register is avaliable. */
14838 bool
14839 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14840 scalar_mode mode)
14842 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14843 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14844 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14845 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14847 if (load)
14849 reg_1 = operands[0];
14850 mem_1 = operands[1];
14851 reg_2 = operands[2];
14852 mem_2 = operands[3];
14853 reg_3 = operands[4];
14854 mem_3 = operands[5];
14855 reg_4 = operands[6];
14856 mem_4 = operands[7];
14857 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14858 && REG_P (reg_3) && REG_P (reg_4));
14859 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14860 return false;
14862 else
14864 mem_1 = operands[0];
14865 reg_1 = operands[1];
14866 mem_2 = operands[2];
14867 reg_2 = operands[3];
14868 mem_3 = operands[4];
14869 reg_3 = operands[5];
14870 mem_4 = operands[6];
14871 reg_4 = operands[7];
14873 /* Skip if memory operand is by itslef valid for ldp/stp. */
14874 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14875 return false;
14877 /* The mems cannot be volatile. */
14878 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14879 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14880 return false;
14882 /* Check if the addresses are in the form of [base+offset]. */
14883 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14884 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14885 return false;
14886 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14887 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14888 return false;
14889 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14890 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14891 return false;
14892 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14893 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14894 return false;
14896 /* Check if the bases are same. */
14897 if (!rtx_equal_p (base_1, base_2)
14898 || !rtx_equal_p (base_2, base_3)
14899 || !rtx_equal_p (base_3, base_4))
14900 return false;
14902 offval_1 = INTVAL (offset_1);
14903 offval_2 = INTVAL (offset_2);
14904 offval_3 = INTVAL (offset_3);
14905 offval_4 = INTVAL (offset_4);
14906 msize = GET_MODE_SIZE (mode);
14907 /* Check if the offsets are consecutive. */
14908 if ((offval_1 != (offval_2 + msize)
14909 || offval_1 != (offval_3 + msize * 2)
14910 || offval_1 != (offval_4 + msize * 3))
14911 && (offval_4 != (offval_3 + msize)
14912 || offval_4 != (offval_2 + msize * 2)
14913 || offval_4 != (offval_1 + msize * 3)))
14914 return false;
14916 /* Check if the addresses are clobbered by load. */
14917 if (load)
14919 if (reg_mentioned_p (reg_1, mem_1)
14920 || reg_mentioned_p (reg_2, mem_2)
14921 || reg_mentioned_p (reg_3, mem_3))
14922 return false;
14924 /* In increasing order, the last load can clobber the address. */
14925 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14926 return false;
14929 /* If we have SImode and slow unaligned ldp,
14930 check the alignment to be at least 8 byte. */
14931 if (mode == SImode
14932 && (aarch64_tune_params.extra_tuning_flags
14933 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14934 && !optimize_size
14935 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14936 return false;
14938 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14939 rclass_1 = FP_REGS;
14940 else
14941 rclass_1 = GENERAL_REGS;
14943 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14944 rclass_2 = FP_REGS;
14945 else
14946 rclass_2 = GENERAL_REGS;
14948 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14949 rclass_3 = FP_REGS;
14950 else
14951 rclass_3 = GENERAL_REGS;
14953 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14954 rclass_4 = FP_REGS;
14955 else
14956 rclass_4 = GENERAL_REGS;
14958 /* Check if the registers are of same class. */
14959 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14960 return false;
14962 return true;
14965 /* Given OPERANDS of consecutive load/store, this function pairs them
14966 into ldp/stp after adjusting the offset. It depends on the fact
14967 that addresses of load/store instructions are in increasing order.
14968 MODE is the mode of memory operands. CODE is the rtl operator
14969 which should be applied to all memory operands, it's SIGN_EXTEND,
14970 ZERO_EXTEND or UNKNOWN. */
14972 bool
14973 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14974 scalar_mode mode, RTX_CODE code)
14976 rtx base, offset, t1, t2;
14977 rtx mem_1, mem_2, mem_3, mem_4;
14978 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14980 if (load)
14982 mem_1 = operands[1];
14983 mem_2 = operands[3];
14984 mem_3 = operands[5];
14985 mem_4 = operands[7];
14987 else
14989 mem_1 = operands[0];
14990 mem_2 = operands[2];
14991 mem_3 = operands[4];
14992 mem_4 = operands[6];
14993 gcc_assert (code == UNKNOWN);
14996 extract_base_offset_in_addr (mem_1, &base, &offset);
14997 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14999 /* Adjust offset thus it can fit in ldp/stp instruction. */
15000 msize = GET_MODE_SIZE (mode);
15001 stp_off_limit = msize * 0x40;
15002 off_val = INTVAL (offset);
15003 abs_off = (off_val < 0) ? -off_val : off_val;
15004 new_off = abs_off % stp_off_limit;
15005 adj_off = abs_off - new_off;
15007 /* Further adjust to make sure all offsets are OK. */
15008 if ((new_off + msize * 2) >= stp_off_limit)
15010 adj_off += stp_off_limit;
15011 new_off -= stp_off_limit;
15014 /* Make sure the adjustment can be done with ADD/SUB instructions. */
15015 if (adj_off >= 0x1000)
15016 return false;
15018 if (off_val < 0)
15020 adj_off = -adj_off;
15021 new_off = -new_off;
15024 /* Create new memory references. */
15025 mem_1 = change_address (mem_1, VOIDmode,
15026 plus_constant (DImode, operands[8], new_off));
15028 /* Check if the adjusted address is OK for ldp/stp. */
15029 if (!aarch64_mem_pair_operand (mem_1, mode))
15030 return false;
15032 msize = GET_MODE_SIZE (mode);
15033 mem_2 = change_address (mem_2, VOIDmode,
15034 plus_constant (DImode,
15035 operands[8],
15036 new_off + msize));
15037 mem_3 = change_address (mem_3, VOIDmode,
15038 plus_constant (DImode,
15039 operands[8],
15040 new_off + msize * 2));
15041 mem_4 = change_address (mem_4, VOIDmode,
15042 plus_constant (DImode,
15043 operands[8],
15044 new_off + msize * 3));
15046 if (code == ZERO_EXTEND)
15048 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15049 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15050 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15051 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15053 else if (code == SIGN_EXTEND)
15055 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15056 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15057 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15058 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15061 if (load)
15063 operands[1] = mem_1;
15064 operands[3] = mem_2;
15065 operands[5] = mem_3;
15066 operands[7] = mem_4;
15068 else
15070 operands[0] = mem_1;
15071 operands[2] = mem_2;
15072 operands[4] = mem_3;
15073 operands[6] = mem_4;
15076 /* Emit adjusting instruction. */
15077 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15078 /* Emit ldp/stp instructions. */
15079 t1 = gen_rtx_SET (operands[0], operands[1]);
15080 t2 = gen_rtx_SET (operands[2], operands[3]);
15081 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15082 t1 = gen_rtx_SET (operands[4], operands[5]);
15083 t2 = gen_rtx_SET (operands[6], operands[7]);
15084 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15085 return true;
15088 /* Return 1 if pseudo register should be created and used to hold
15089 GOT address for PIC code. */
15091 bool
15092 aarch64_use_pseudo_pic_reg (void)
15094 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15097 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15099 static int
15100 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15102 switch (XINT (x, 1))
15104 case UNSPEC_GOTSMALLPIC:
15105 case UNSPEC_GOTSMALLPIC28K:
15106 case UNSPEC_GOTTINYPIC:
15107 return 0;
15108 default:
15109 break;
15112 return default_unspec_may_trap_p (x, flags);
15116 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15117 return the log2 of that value. Otherwise return -1. */
15120 aarch64_fpconst_pow_of_2 (rtx x)
15122 const REAL_VALUE_TYPE *r;
15124 if (!CONST_DOUBLE_P (x))
15125 return -1;
15127 r = CONST_DOUBLE_REAL_VALUE (x);
15129 if (REAL_VALUE_NEGATIVE (*r)
15130 || REAL_VALUE_ISNAN (*r)
15131 || REAL_VALUE_ISINF (*r)
15132 || !real_isinteger (r, DFmode))
15133 return -1;
15135 return exact_log2 (real_to_integer (r));
15138 /* If X is a vector of equal CONST_DOUBLE values and that value is
15139 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15142 aarch64_vec_fpconst_pow_of_2 (rtx x)
15144 if (GET_CODE (x) != CONST_VECTOR)
15145 return -1;
15147 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15148 return -1;
15150 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15151 if (firstval <= 0)
15152 return -1;
15154 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15155 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15156 return -1;
15158 return firstval;
15161 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15162 to float.
15164 __fp16 always promotes through this hook.
15165 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15166 through the generic excess precision logic rather than here. */
15168 static tree
15169 aarch64_promoted_type (const_tree t)
15171 if (SCALAR_FLOAT_TYPE_P (t)
15172 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15173 return float_type_node;
15175 return NULL_TREE;
15178 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15180 static bool
15181 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15182 optimization_type opt_type)
15184 switch (op)
15186 case rsqrt_optab:
15187 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15189 default:
15190 return true;
15194 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15195 if MODE is HFmode, and punt to the generic implementation otherwise. */
15197 static bool
15198 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15200 return (mode == HFmode
15201 ? true
15202 : default_libgcc_floating_mode_supported_p (mode));
15205 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15206 if MODE is HFmode, and punt to the generic implementation otherwise. */
15208 static bool
15209 aarch64_scalar_mode_supported_p (scalar_mode mode)
15211 return (mode == HFmode
15212 ? true
15213 : default_scalar_mode_supported_p (mode));
15216 /* Set the value of FLT_EVAL_METHOD.
15217 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15219 0: evaluate all operations and constants, whose semantic type has at
15220 most the range and precision of type float, to the range and
15221 precision of float; evaluate all other operations and constants to
15222 the range and precision of the semantic type;
15224 N, where _FloatN is a supported interchange floating type
15225 evaluate all operations and constants, whose semantic type has at
15226 most the range and precision of _FloatN type, to the range and
15227 precision of the _FloatN type; evaluate all other operations and
15228 constants to the range and precision of the semantic type;
15230 If we have the ARMv8.2-A extensions then we support _Float16 in native
15231 precision, so we should set this to 16. Otherwise, we support the type,
15232 but want to evaluate expressions in float precision, so set this to
15233 0. */
15235 static enum flt_eval_method
15236 aarch64_excess_precision (enum excess_precision_type type)
15238 switch (type)
15240 case EXCESS_PRECISION_TYPE_FAST:
15241 case EXCESS_PRECISION_TYPE_STANDARD:
15242 /* We can calculate either in 16-bit range and precision or
15243 32-bit range and precision. Make that decision based on whether
15244 we have native support for the ARMv8.2-A 16-bit floating-point
15245 instructions or not. */
15246 return (TARGET_FP_F16INST
15247 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15248 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15249 case EXCESS_PRECISION_TYPE_IMPLICIT:
15250 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15251 default:
15252 gcc_unreachable ();
15254 return FLT_EVAL_METHOD_UNPREDICTABLE;
15257 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15258 scheduled for speculative execution. Reject the long-running division
15259 and square-root instructions. */
15261 static bool
15262 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15264 switch (get_attr_type (insn))
15266 case TYPE_SDIV:
15267 case TYPE_UDIV:
15268 case TYPE_FDIVS:
15269 case TYPE_FDIVD:
15270 case TYPE_FSQRTS:
15271 case TYPE_FSQRTD:
15272 case TYPE_NEON_FP_SQRT_S:
15273 case TYPE_NEON_FP_SQRT_D:
15274 case TYPE_NEON_FP_SQRT_S_Q:
15275 case TYPE_NEON_FP_SQRT_D_Q:
15276 case TYPE_NEON_FP_DIV_S:
15277 case TYPE_NEON_FP_DIV_D:
15278 case TYPE_NEON_FP_DIV_S_Q:
15279 case TYPE_NEON_FP_DIV_D_Q:
15280 return false;
15281 default:
15282 return true;
15286 /* Target-specific selftests. */
15288 #if CHECKING_P
15290 namespace selftest {
15292 /* Selftest for the RTL loader.
15293 Verify that the RTL loader copes with a dump from
15294 print_rtx_function. This is essentially just a test that class
15295 function_reader can handle a real dump, but it also verifies
15296 that lookup_reg_by_dump_name correctly handles hard regs.
15297 The presence of hard reg names in the dump means that the test is
15298 target-specific, hence it is in this file. */
15300 static void
15301 aarch64_test_loading_full_dump ()
15303 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15305 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15307 rtx_insn *insn_1 = get_insn_by_uid (1);
15308 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15310 rtx_insn *insn_15 = get_insn_by_uid (15);
15311 ASSERT_EQ (INSN, GET_CODE (insn_15));
15312 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15314 /* Verify crtl->return_rtx. */
15315 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15316 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15317 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15320 /* Run all target-specific selftests. */
15322 static void
15323 aarch64_run_selftests (void)
15325 aarch64_test_loading_full_dump ();
15328 } // namespace selftest
15330 #endif /* #if CHECKING_P */
15332 #undef TARGET_ADDRESS_COST
15333 #define TARGET_ADDRESS_COST aarch64_address_cost
15335 /* This hook will determines whether unnamed bitfields affect the alignment
15336 of the containing structure. The hook returns true if the structure
15337 should inherit the alignment requirements of an unnamed bitfield's
15338 type. */
15339 #undef TARGET_ALIGN_ANON_BITFIELD
15340 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15342 #undef TARGET_ASM_ALIGNED_DI_OP
15343 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15345 #undef TARGET_ASM_ALIGNED_HI_OP
15346 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15348 #undef TARGET_ASM_ALIGNED_SI_OP
15349 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15351 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15352 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15353 hook_bool_const_tree_hwi_hwi_const_tree_true
15355 #undef TARGET_ASM_FILE_START
15356 #define TARGET_ASM_FILE_START aarch64_start_file
15358 #undef TARGET_ASM_OUTPUT_MI_THUNK
15359 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15361 #undef TARGET_ASM_SELECT_RTX_SECTION
15362 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15364 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15365 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15367 #undef TARGET_BUILD_BUILTIN_VA_LIST
15368 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15370 #undef TARGET_CALLEE_COPIES
15371 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15373 #undef TARGET_CAN_ELIMINATE
15374 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15376 #undef TARGET_CAN_INLINE_P
15377 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15379 #undef TARGET_CANNOT_FORCE_CONST_MEM
15380 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15382 #undef TARGET_CASE_VALUES_THRESHOLD
15383 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15385 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15386 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15388 /* Only the least significant bit is used for initialization guard
15389 variables. */
15390 #undef TARGET_CXX_GUARD_MASK_BIT
15391 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15393 #undef TARGET_C_MODE_FOR_SUFFIX
15394 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15396 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15397 #undef TARGET_DEFAULT_TARGET_FLAGS
15398 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15399 #endif
15401 #undef TARGET_CLASS_MAX_NREGS
15402 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15404 #undef TARGET_BUILTIN_DECL
15405 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15407 #undef TARGET_BUILTIN_RECIPROCAL
15408 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15410 #undef TARGET_C_EXCESS_PRECISION
15411 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15413 #undef TARGET_EXPAND_BUILTIN
15414 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15416 #undef TARGET_EXPAND_BUILTIN_VA_START
15417 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15419 #undef TARGET_FOLD_BUILTIN
15420 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15422 #undef TARGET_FUNCTION_ARG
15423 #define TARGET_FUNCTION_ARG aarch64_function_arg
15425 #undef TARGET_FUNCTION_ARG_ADVANCE
15426 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15428 #undef TARGET_FUNCTION_ARG_BOUNDARY
15429 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15431 #undef TARGET_FUNCTION_ARG_PADDING
15432 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15434 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15435 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15437 #undef TARGET_FUNCTION_VALUE
15438 #define TARGET_FUNCTION_VALUE aarch64_function_value
15440 #undef TARGET_FUNCTION_VALUE_REGNO_P
15441 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15443 #undef TARGET_FRAME_POINTER_REQUIRED
15444 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15446 #undef TARGET_GIMPLE_FOLD_BUILTIN
15447 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15449 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15450 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15452 #undef TARGET_INIT_BUILTINS
15453 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15455 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15456 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15457 aarch64_ira_change_pseudo_allocno_class
15459 #undef TARGET_LEGITIMATE_ADDRESS_P
15460 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15462 #undef TARGET_LEGITIMATE_CONSTANT_P
15463 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15465 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15466 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15467 aarch64_legitimize_address_displacement
15469 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15470 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15472 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15473 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15474 aarch64_libgcc_floating_mode_supported_p
15476 #undef TARGET_MANGLE_TYPE
15477 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15479 #undef TARGET_MEMORY_MOVE_COST
15480 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15482 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15483 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15485 #undef TARGET_MUST_PASS_IN_STACK
15486 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15488 /* This target hook should return true if accesses to volatile bitfields
15489 should use the narrowest mode possible. It should return false if these
15490 accesses should use the bitfield container type. */
15491 #undef TARGET_NARROW_VOLATILE_BITFIELD
15492 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15494 #undef TARGET_OPTION_OVERRIDE
15495 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15497 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15498 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15499 aarch64_override_options_after_change
15501 #undef TARGET_OPTION_SAVE
15502 #define TARGET_OPTION_SAVE aarch64_option_save
15504 #undef TARGET_OPTION_RESTORE
15505 #define TARGET_OPTION_RESTORE aarch64_option_restore
15507 #undef TARGET_OPTION_PRINT
15508 #define TARGET_OPTION_PRINT aarch64_option_print
15510 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15511 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15513 #undef TARGET_SET_CURRENT_FUNCTION
15514 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15516 #undef TARGET_PASS_BY_REFERENCE
15517 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15519 #undef TARGET_PREFERRED_RELOAD_CLASS
15520 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15522 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15523 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15525 #undef TARGET_PROMOTED_TYPE
15526 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15528 #undef TARGET_SECONDARY_RELOAD
15529 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15531 #undef TARGET_SHIFT_TRUNCATION_MASK
15532 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15534 #undef TARGET_SETUP_INCOMING_VARARGS
15535 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15537 #undef TARGET_STRUCT_VALUE_RTX
15538 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15540 #undef TARGET_REGISTER_MOVE_COST
15541 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15543 #undef TARGET_RETURN_IN_MEMORY
15544 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15546 #undef TARGET_RETURN_IN_MSB
15547 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15549 #undef TARGET_RTX_COSTS
15550 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15552 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15553 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15555 #undef TARGET_SCHED_ISSUE_RATE
15556 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15558 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15559 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15560 aarch64_sched_first_cycle_multipass_dfa_lookahead
15562 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15563 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15564 aarch64_first_cycle_multipass_dfa_lookahead_guard
15566 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15567 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15568 aarch64_get_separate_components
15570 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15571 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15572 aarch64_components_for_bb
15574 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15575 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15576 aarch64_disqualify_components
15578 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15579 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15580 aarch64_emit_prologue_components
15582 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15583 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15584 aarch64_emit_epilogue_components
15586 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15587 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15588 aarch64_set_handled_components
15590 #undef TARGET_TRAMPOLINE_INIT
15591 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15593 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15594 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15596 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15597 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15599 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15600 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15601 aarch64_builtin_support_vector_misalignment
15603 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15604 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15606 #undef TARGET_VECTORIZE_ADD_STMT_COST
15607 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15609 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15610 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15611 aarch64_builtin_vectorization_cost
15613 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15614 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15616 #undef TARGET_VECTORIZE_BUILTINS
15617 #define TARGET_VECTORIZE_BUILTINS
15619 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15620 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15621 aarch64_builtin_vectorized_function
15623 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15624 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15625 aarch64_autovectorize_vector_sizes
15627 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15628 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15629 aarch64_atomic_assign_expand_fenv
15631 /* Section anchor support. */
15633 #undef TARGET_MIN_ANCHOR_OFFSET
15634 #define TARGET_MIN_ANCHOR_OFFSET -256
15636 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15637 byte offset; we can do much more for larger data types, but have no way
15638 to determine the size of the access. We assume accesses are aligned. */
15639 #undef TARGET_MAX_ANCHOR_OFFSET
15640 #define TARGET_MAX_ANCHOR_OFFSET 4095
15642 #undef TARGET_VECTOR_ALIGNMENT
15643 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15645 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15646 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15647 aarch64_simd_vector_alignment_reachable
15649 /* vec_perm support. */
15651 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15652 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15653 aarch64_vectorize_vec_perm_const_ok
15655 #undef TARGET_INIT_LIBFUNCS
15656 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15658 #undef TARGET_FIXED_CONDITION_CODE_REGS
15659 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15661 #undef TARGET_FLAGS_REGNUM
15662 #define TARGET_FLAGS_REGNUM CC_REGNUM
15664 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15665 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15667 #undef TARGET_ASAN_SHADOW_OFFSET
15668 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15670 #undef TARGET_LEGITIMIZE_ADDRESS
15671 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15673 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15674 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15675 aarch64_use_by_pieces_infrastructure_p
15677 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15678 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15680 #undef TARGET_CAN_USE_DOLOOP_P
15681 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15683 #undef TARGET_SCHED_ADJUST_PRIORITY
15684 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15686 #undef TARGET_SCHED_MACRO_FUSION_P
15687 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15689 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15690 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15692 #undef TARGET_SCHED_FUSION_PRIORITY
15693 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15695 #undef TARGET_UNSPEC_MAY_TRAP_P
15696 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15698 #undef TARGET_USE_PSEUDO_PIC_REG
15699 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15701 #undef TARGET_PRINT_OPERAND
15702 #define TARGET_PRINT_OPERAND aarch64_print_operand
15704 #undef TARGET_PRINT_OPERAND_ADDRESS
15705 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15707 #undef TARGET_OPTAB_SUPPORTED_P
15708 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15710 #undef TARGET_OMIT_STRUCT_RETURN_REG
15711 #define TARGET_OMIT_STRUCT_RETURN_REG true
15713 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15714 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15715 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15717 #undef TARGET_HARD_REGNO_NREGS
15718 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15719 #undef TARGET_HARD_REGNO_MODE_OK
15720 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15722 #undef TARGET_MODES_TIEABLE_P
15723 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15725 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15726 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15727 aarch64_hard_regno_call_part_clobbered
15729 #undef TARGET_CONSTANT_ALIGNMENT
15730 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
15732 #if CHECKING_P
15733 #undef TARGET_RUN_TARGET_SELFTESTS
15734 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15735 #endif /* #if CHECKING_P */
15737 struct gcc_target targetm = TARGET_INITIALIZER;
15739 #include "gt-aarch64.h"