Make more use of REG_NREGS
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (the actual costs are 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Cortex-A57 costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* X-Gene 1 costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespective of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
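/* A worked example of the hook above (illustrative only), assuming the
   generic_tunings values defined later in this file (int_reassoc_width 2,
   fp_reassoc_width 4, vec_reassoc_width 1); OPC is unused here:

     aarch64_reassociation_width (PLUS, V4SFmode) -> 1   (vector)
     aarch64_reassociation_width (PLUS, SImode)   -> 2   (integer)
     aarch64_reassociation_width (PLUS, DFmode)   -> 4   (float)

   Vector modes are tested first, so V4SImode uses the vector width even
   though its elements are integral.  */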
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
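/* A few worked examples of the mapping above (illustrative only, assuming
   the usual AArch64 DWARF numbering with AARCH64_DWARF_R0 == 0,
   AARCH64_DWARF_SP == 31 and AARCH64_DWARF_V0 == 64):

     aarch64_dbx_register_number (R0_REGNUM + 5) -> 5     (x5)
     aarch64_dbx_register_number (SP_REGNUM)     -> 31    (sp)
     aarch64_dbx_register_number (V0_REGNUM + 2) -> 66    (v2)
     aarch64_dbx_register_number (CC_REGNUM)     -> DWARF_FRAME_REGISTERS,
                                                    i.e. no DWARF equivalent.  */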
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
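/* An illustrative sketch of the check above (not exhaustive): with
   TARGET_SIMD enabled,

     aarch64_array_mode_supported_p (V4SImode, 2) -> true
     aarch64_array_mode_supported_p (V8QImode, 3) -> true
     aarch64_array_mode_supported_p (V4SImode, 5) -> false  (too many elements)
     aarch64_array_mode_supported_p (TImode, 2)   -> false  (not a SIMD reg mode)

   The supported cases correspond to the register lists used by the
   LD2/LD3/LD4-style structure loads and stores.  */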
1070 /* Implement HARD_REGNO_NREGS. */
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
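/* A worked example of the formula above (illustrative only, assuming
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16, i.e. standard LP64
   AArch64):

     aarch64_hard_regno_nregs (R0_REGNUM, TImode)   -> (16 + 7) / 8   == 2
     aarch64_hard_regno_nregs (V0_REGNUM, V4SImode) -> (16 + 15) / 16 == 1
     aarch64_hard_regno_nregs (V0_REGNUM, OImode)   -> (32 + 15) / 16 == 2

   Callers that already hold a REG rtx can use REG_NREGS (the subject of
   this patch) rather than recomputing the value from the mode.  */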
1086 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return true;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return
1110 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111 else
1112 return true;
1115 return false;
1118 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1119 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1120 clobbers the top 64 bits when restoring the bottom 64 bits. */
1122 static bool
1123 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1125 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1128 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1129 machine_mode
1130 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1131 machine_mode mode)
1133 /* Handle modes that fit within single registers. */
1134 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1136 if (GET_MODE_SIZE (mode) >= 4)
1137 return mode;
1138 else
1139 return SImode;
1141 /* Fall back to generic for multi-reg and very large modes. */
1142 else
1143 return choose_hard_reg_mode (regno, nregs, false);
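/* Illustrative behaviour of the hook above (sketch only):

     aarch64_hard_regno_caller_save_mode (R0_REGNUM, 1, HImode)   -> SImode
     aarch64_hard_regno_caller_save_mode (V0_REGNUM, 1, V4SImode) -> V4SImode
     aarch64_hard_regno_caller_save_mode (R0_REGNUM, 2, TImode)
       -> choose_hard_reg_mode (R0_REGNUM, 2, false)

   i.e. sub-word values are widened to SImode, anything from 4 to 16 bytes
   in a single register keeps its mode, and multi-register values fall back
   to the generic choice.  */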
1146 /* Return true if calls to DECL should be treated as
1147 long-calls (i.e. called via a register). */
1148 static bool
1149 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1151 return false;
1154 /* Return true if calls to symbol-ref SYM should be treated as
1155 long-calls (i.e. called via a register). */
1156 bool
1157 aarch64_is_long_call_p (rtx sym)
1159 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1162 /* Return true if calls to symbol-ref SYM should not go through
1163 plt stubs. */
1165 bool
1166 aarch64_is_noplt_call_p (rtx sym)
1168 const_tree decl = SYMBOL_REF_DECL (sym);
1170 if (flag_pic
1171 && decl
1172 && (!flag_plt
1173 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1174 && !targetm.binds_local_p (decl))
1175 return true;
1177 return false;
1180 /* Return true if the offsets to a zero/sign-extract operation
1181 represent an expression that matches an extend operation. The
1182 operands represent the parameters from
1184 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1185 bool
1186 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1187 rtx extract_imm)
1189 HOST_WIDE_INT mult_val, extract_val;
1191 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1192 return false;
1194 mult_val = INTVAL (mult_imm);
1195 extract_val = INTVAL (extract_imm);
1197 if (extract_val > 8
1198 && extract_val < GET_MODE_BITSIZE (mode)
1199 && exact_log2 (extract_val & ~7) > 0
1200 && (extract_val & 7) <= 4
1201 && mult_val == (1 << (extract_val & 7)))
1202 return true;
1204 return false;
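/* A worked example of the check above (illustrative only): the operands
   taken from

     (zero_extract:DI (mult:DI (reg:DI x) (const_int 4))
		      (const_int 34) (const_int 0))

   are accepted: extract_val == 34, so extract_val & ~7 == 32 (a power of
   two) and extract_val & 7 == 2, and mult_val == 1 << 2.  This is the
   "extend plus left shift by 2" form.  A mult_imm of 3, or an extract_imm
   of 70 (wider than the mode), would be rejected.  */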
1207 /* Emit an insn that's a simple single-set. Both the operands must be
1208 known to be valid. */
1209 inline static rtx_insn *
1210 emit_set_insn (rtx x, rtx y)
1212 return emit_insn (gen_rtx_SET (x, y));
1215 /* X and Y are two things to compare using CODE. Emit the compare insn and
1216 return the CC register rtx in the appropriate mode. */
1218 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1220 machine_mode mode = SELECT_CC_MODE (code, x, y);
1221 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1223 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1224 return cc_reg;
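/* A minimal usage sketch (illustrative only): when expanding a conditional
   operation one would typically write

     rtx cc_reg = aarch64_gen_compare_reg (GE, op0, op1);
     rtx cond = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);

   and then feed COND to a conditional-set or branch pattern; the CC mode
   (CCmode, CC_NZmode, ...) is picked by SELECT_CC_MODE.  */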
1227 /* Build the SYMBOL_REF for __tls_get_addr. */
1229 static GTY(()) rtx tls_get_addr_libfunc;
1232 aarch64_tls_get_addr (void)
1234 if (!tls_get_addr_libfunc)
1235 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1236 return tls_get_addr_libfunc;
1239 /* Return the TLS model to use for ADDR. */
1241 static enum tls_model
1242 tls_symbolic_operand_type (rtx addr)
1244 enum tls_model tls_kind = TLS_MODEL_NONE;
1245 rtx sym, addend;
1247 if (GET_CODE (addr) == CONST)
1249 split_const (addr, &sym, &addend);
1250 if (GET_CODE (sym) == SYMBOL_REF)
1251 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1253 else if (GET_CODE (addr) == SYMBOL_REF)
1254 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1256 return tls_kind;
1259 /* We allow LO_SUM's in our legitimate addresses so that combine can
1260 take care of combining addresses where necessary, but for generation
1261 purposes we generate the address as:
1263 RTL Absolute
1264 tmp = hi (symbol_ref); adrp x1, foo
1265 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1268 PIC TLS
1269 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1270 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1271 bl __tls_get_addr
1274 Load TLS symbol, depending on TLS mechanism and TLS access model.
1276 Global Dynamic - Traditional TLS:
1277 adrp tmp, :tlsgd:imm
1278 add dest, tmp, #:tlsgd_lo12:imm
1279 bl __tls_get_addr
1281 Global Dynamic - TLS Descriptors:
1282 adrp dest, :tlsdesc:imm
1283 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1284 add dest, dest, #:tlsdesc_lo12:imm
1285 blr tmp
1286 mrs tp, tpidr_el0
1287 add dest, dest, tp
1289 Initial Exec:
1290 mrs tp, tpidr_el0
1291 adrp tmp, :gottprel:imm
1292 ldr dest, [tmp, #:gottprel_lo12:imm]
1293 add dest, dest, tp
1295 Local Exec:
1296 mrs tp, tpidr_el0
1297 add t0, tp, #:tprel_hi12:imm, lsl #12
1298 add t0, t0, #:tprel_lo12_nc:imm
1301 static void
1302 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1303 enum aarch64_symbol_type type)
1305 switch (type)
1307 case SYMBOL_SMALL_ABSOLUTE:
1309 /* In ILP32, the mode of dest can be either SImode or DImode. */
1310 rtx tmp_reg = dest;
1311 machine_mode mode = GET_MODE (dest);
1313 gcc_assert (mode == Pmode || mode == ptr_mode);
1315 if (can_create_pseudo_p ())
1316 tmp_reg = gen_reg_rtx (mode);
1318 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1319 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1320 return;
1323 case SYMBOL_TINY_ABSOLUTE:
1324 emit_insn (gen_rtx_SET (dest, imm));
1325 return;
1327 case SYMBOL_SMALL_GOT_28K:
1329 machine_mode mode = GET_MODE (dest);
1330 rtx gp_rtx = pic_offset_table_rtx;
1331 rtx insn;
1332 rtx mem;
1334 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1335 here before RTL expansion. Tree IVOPTS generates RTL patterns to
1336 decide rtx costs, in which case pic_offset_table_rtx is not
1337 initialized. In that case there is no need to generate the first
1338 adrp instruction, as the final cost of a global variable access
1339 is one instruction. */
1340 if (gp_rtx != NULL)
1342 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1343 use the page base as the GOT base, the first page may be wasted;
1344 in the worst case only 28K of space is left for the GOT).
1346 The generated instruction sequence for accessing a global variable is:
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1351 Only one instruction is needed, but we must initialize
1352 pic_offset_table_rtx properly. We generate the initialization insn
1353 for every global access and let CSE remove the redundant copies.
1355 The final instruction sequence will look like the following
1356 for multiple global variable accesses.
1358 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1360 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1361 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1362 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1363 ... */
1365 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1366 crtl->uses_pic_offset_table = 1;
1367 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1369 if (mode != GET_MODE (gp_rtx))
1370 gp_rtx = gen_lowpart (mode, gp_rtx);
1374 if (mode == ptr_mode)
1376 if (mode == DImode)
1377 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1378 else
1379 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1381 mem = XVECEXP (SET_SRC (insn), 0, 0);
1383 else
1385 gcc_assert (mode == Pmode);
1387 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1388 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1391 /* The operand is expected to be a MEM. Whenever the related insn
1392 pattern changes, the code above that computes MEM should be
1393 updated. */
1394 gcc_assert (GET_CODE (mem) == MEM);
1395 MEM_READONLY_P (mem) = 1;
1396 MEM_NOTRAP_P (mem) = 1;
1397 emit_insn (insn);
1398 return;
1401 case SYMBOL_SMALL_GOT_4G:
1403 /* In ILP32, the mode of dest can be either SImode or DImode,
1404 while the got entry is always of SImode size. The mode of
1405 dest depends on how dest is used: if dest is assigned to a
1406 pointer (e.g. stored in memory), it has SImode; it may have
1407 DImode if dest is dereferenced to access memory.
1408 This is why we have to handle three different ldr_got_small
1409 patterns here (two patterns for ILP32). */
1411 rtx insn;
1412 rtx mem;
1413 rtx tmp_reg = dest;
1414 machine_mode mode = GET_MODE (dest);
1416 if (can_create_pseudo_p ())
1417 tmp_reg = gen_reg_rtx (mode);
1419 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1420 if (mode == ptr_mode)
1422 if (mode == DImode)
1423 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1424 else
1425 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1427 mem = XVECEXP (SET_SRC (insn), 0, 0);
1429 else
1431 gcc_assert (mode == Pmode);
1433 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1434 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1437 gcc_assert (GET_CODE (mem) == MEM);
1438 MEM_READONLY_P (mem) = 1;
1439 MEM_NOTRAP_P (mem) = 1;
1440 emit_insn (insn);
1441 return;
1444 case SYMBOL_SMALL_TLSGD:
1446 rtx_insn *insns;
1447 machine_mode mode = GET_MODE (dest);
1448 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1450 start_sequence ();
1451 if (TARGET_ILP32)
1452 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1453 else
1454 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1455 insns = get_insns ();
1456 end_sequence ();
1458 RTL_CONST_CALL_P (insns) = 1;
1459 emit_libcall_block (insns, dest, result, imm);
1460 return;
1463 case SYMBOL_SMALL_TLSDESC:
1465 machine_mode mode = GET_MODE (dest);
1466 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1467 rtx tp;
1469 gcc_assert (mode == Pmode || mode == ptr_mode);
1471 /* In ILP32, the got entry is always of SImode size. Unlike
1472 small GOT, the dest is fixed at reg 0. */
1473 if (TARGET_ILP32)
1474 emit_insn (gen_tlsdesc_small_si (imm));
1475 else
1476 emit_insn (gen_tlsdesc_small_di (imm));
1477 tp = aarch64_load_tp (NULL);
1479 if (mode != Pmode)
1480 tp = gen_lowpart (mode, tp);
1482 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1483 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1484 return;
1487 case SYMBOL_SMALL_TLSIE:
1489 /* In ILP32, the mode of dest can be either SImode or DImode,
1490 while the got entry is always of SImode size. The mode of
1491 dest depends on how dest is used: if dest is assigned to a
1492 pointer (e.g. stored in memory), it has SImode; it may have
1493 DImode if dest is dereferenced to access memory.
1494 This is why we have to handle three different tlsie_small
1495 patterns here (two patterns for ILP32). */
1496 machine_mode mode = GET_MODE (dest);
1497 rtx tmp_reg = gen_reg_rtx (mode);
1498 rtx tp = aarch64_load_tp (NULL);
1500 if (mode == ptr_mode)
1502 if (mode == DImode)
1503 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1504 else
1506 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1507 tp = gen_lowpart (mode, tp);
1510 else
1512 gcc_assert (mode == Pmode);
1513 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1516 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 case SYMBOL_TLSLE12:
1522 case SYMBOL_TLSLE24:
1523 case SYMBOL_TLSLE32:
1524 case SYMBOL_TLSLE48:
1526 machine_mode mode = GET_MODE (dest);
1527 rtx tp = aarch64_load_tp (NULL);
1529 if (mode != Pmode)
1530 tp = gen_lowpart (mode, tp);
1532 switch (type)
1534 case SYMBOL_TLSLE12:
1535 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1536 (dest, tp, imm));
1537 break;
1538 case SYMBOL_TLSLE24:
1539 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1540 (dest, tp, imm));
1541 break;
1542 case SYMBOL_TLSLE32:
1543 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1544 (dest, imm));
1545 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1546 (dest, dest, tp));
1547 break;
1548 case SYMBOL_TLSLE48:
1549 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1550 (dest, imm));
1551 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1552 (dest, dest, tp));
1553 break;
1554 default:
1555 gcc_unreachable ();
1558 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1559 return;
1562 case SYMBOL_TINY_GOT:
1563 emit_insn (gen_ldr_got_tiny (dest, imm));
1564 return;
1566 case SYMBOL_TINY_TLSIE:
1568 machine_mode mode = GET_MODE (dest);
1569 rtx tp = aarch64_load_tp (NULL);
1571 if (mode == ptr_mode)
1573 if (mode == DImode)
1574 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1575 else
1577 tp = gen_lowpart (mode, tp);
1578 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1581 else
1583 gcc_assert (mode == Pmode);
1584 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1587 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1588 return;
1591 default:
1592 gcc_unreachable ();
1596 /* Emit a move from SRC to DEST. Assume that the move expanders can
1597 handle all moves if !can_create_pseudo_p (). The distinction is
1598 important because, unlike emit_move_insn, the move expanders know
1599 how to force Pmode objects into the constant pool even when the
1600 constant pool address is not itself legitimate. */
1601 static rtx
1602 aarch64_emit_move (rtx dest, rtx src)
1604 return (can_create_pseudo_p ()
1605 ? emit_move_insn (dest, src)
1606 : emit_move_insn_1 (dest, src));
1609 /* Split a 128-bit move operation into two 64-bit move operations,
1610 taking care to handle partial overlap of register to register
1611 copies. Special cases are needed when moving between GP regs and
1612 FP regs. SRC can be a register, constant or memory; DST a register
1613 or memory. If either operand is memory it must not have any side
1614 effects. */
1615 void
1616 aarch64_split_128bit_move (rtx dst, rtx src)
1618 rtx dst_lo, dst_hi;
1619 rtx src_lo, src_hi;
1621 machine_mode mode = GET_MODE (dst);
1623 gcc_assert (mode == TImode || mode == TFmode);
1624 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1625 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1627 if (REG_P (dst) && REG_P (src))
1629 int src_regno = REGNO (src);
1630 int dst_regno = REGNO (dst);
1632 /* Handle FP <-> GP regs. */
1633 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1635 src_lo = gen_lowpart (word_mode, src);
1636 src_hi = gen_highpart (word_mode, src);
1638 if (mode == TImode)
1640 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1641 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1643 else
1645 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1646 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1648 return;
1650 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1652 dst_lo = gen_lowpart (word_mode, dst);
1653 dst_hi = gen_highpart (word_mode, dst);
1655 if (mode == TImode)
1657 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1658 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1660 else
1662 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1663 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1665 return;
1669 dst_lo = gen_lowpart (word_mode, dst);
1670 dst_hi = gen_highpart (word_mode, dst);
1671 src_lo = gen_lowpart (word_mode, src);
1672 src_hi = gen_highpart_mode (word_mode, mode, src);
1674 /* At most one pairing may overlap. */
1675 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1677 aarch64_emit_move (dst_hi, src_hi);
1678 aarch64_emit_move (dst_lo, src_lo);
1680 else
1682 aarch64_emit_move (dst_lo, src_lo);
1683 aarch64_emit_move (dst_hi, src_hi);
1687 bool
1688 aarch64_split_128bit_move_p (rtx dst, rtx src)
1690 return (! REG_P (src)
1691 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
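/* An illustrative sketch of how the two routines above interact (not part
   of the original source): a TImode copy between two FP registers answers
   false here and is left as a single 128-bit register move, whereas a
   TImode copy involving GP registers or memory answers true and is split
   by aarch64_split_128bit_move into two word_mode moves, with
   reg_overlap_mentioned_p deciding which half must be moved first.  */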
1694 /* Split a complex SIMD combine. */
1696 void
1697 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1699 machine_mode src_mode = GET_MODE (src1);
1700 machine_mode dst_mode = GET_MODE (dst);
1702 gcc_assert (VECTOR_MODE_P (dst_mode));
1703 gcc_assert (register_operand (dst, dst_mode)
1704 && register_operand (src1, src_mode)
1705 && register_operand (src2, src_mode));
1707 rtx (*gen) (rtx, rtx, rtx);
1709 switch (src_mode)
1711 case E_V8QImode:
1712 gen = gen_aarch64_simd_combinev8qi;
1713 break;
1714 case E_V4HImode:
1715 gen = gen_aarch64_simd_combinev4hi;
1716 break;
1717 case E_V2SImode:
1718 gen = gen_aarch64_simd_combinev2si;
1719 break;
1720 case E_V4HFmode:
1721 gen = gen_aarch64_simd_combinev4hf;
1722 break;
1723 case E_V2SFmode:
1724 gen = gen_aarch64_simd_combinev2sf;
1725 break;
1726 case E_DImode:
1727 gen = gen_aarch64_simd_combinedi;
1728 break;
1729 case E_DFmode:
1730 gen = gen_aarch64_simd_combinedf;
1731 break;
1732 default:
1733 gcc_unreachable ();
1736 emit_insn (gen (dst, src1, src2));
1737 return;
1740 /* Split a complex SIMD move. */
1742 void
1743 aarch64_split_simd_move (rtx dst, rtx src)
1745 machine_mode src_mode = GET_MODE (src);
1746 machine_mode dst_mode = GET_MODE (dst);
1748 gcc_assert (VECTOR_MODE_P (dst_mode));
1750 if (REG_P (dst) && REG_P (src))
1752 rtx (*gen) (rtx, rtx);
1754 gcc_assert (VECTOR_MODE_P (src_mode));
1756 switch (src_mode)
1758 case E_V16QImode:
1759 gen = gen_aarch64_split_simd_movv16qi;
1760 break;
1761 case E_V8HImode:
1762 gen = gen_aarch64_split_simd_movv8hi;
1763 break;
1764 case E_V4SImode:
1765 gen = gen_aarch64_split_simd_movv4si;
1766 break;
1767 case E_V2DImode:
1768 gen = gen_aarch64_split_simd_movv2di;
1769 break;
1770 case E_V8HFmode:
1771 gen = gen_aarch64_split_simd_movv8hf;
1772 break;
1773 case E_V4SFmode:
1774 gen = gen_aarch64_split_simd_movv4sf;
1775 break;
1776 case E_V2DFmode:
1777 gen = gen_aarch64_split_simd_movv2df;
1778 break;
1779 default:
1780 gcc_unreachable ();
1783 emit_insn (gen (dst, src));
1784 return;
1788 bool
1789 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1790 machine_mode ymode, rtx y)
1792 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1793 gcc_assert (r != NULL);
1794 return rtx_equal_p (x, r);
1798 static rtx
1799 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1801 if (can_create_pseudo_p ())
1802 return force_reg (mode, value);
1803 else
1805 x = aarch64_emit_move (x, value);
1806 return x;
1811 static rtx
1812 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1813 HOST_WIDE_INT offset)
1815 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1817 rtx high;
1818 /* Load the full offset into a register. This
1819 might be improvable in the future. */
1820 high = GEN_INT (offset);
1821 offset = 0;
1822 high = aarch64_force_temporary (mode, temp, high);
1823 reg = aarch64_force_temporary (mode, temp,
1824 gen_rtx_PLUS (mode, high, reg));
1826 return plus_constant (mode, reg, offset);
1829 static int
1830 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1831 scalar_int_mode mode)
1833 int i;
1834 unsigned HOST_WIDE_INT val, val2, mask;
1835 int one_match, zero_match;
1836 int num_insns;
1838 val = INTVAL (imm);
1840 if (aarch64_move_imm (val, mode))
1842 if (generate)
1843 emit_insn (gen_rtx_SET (dest, imm));
1844 return 1;
1847 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1848 (with XXXX non-zero). In that case check to see if the move can be done in
1849 a smaller mode. */
1850 val2 = val & 0xffffffff;
1851 if (mode == DImode
1852 && aarch64_move_imm (val2, SImode)
1853 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1855 if (generate)
1856 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1858 /* Check if we have to emit a second instruction by checking to see
1859 if any of the upper 32 bits of the original DI mode value is set. */
1860 if (val == val2)
1861 return 1;
1863 i = (val >> 48) ? 48 : 32;
1865 if (generate)
1866 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1867 GEN_INT ((val >> i) & 0xffff)));
1869 return 2;
1872 if ((val >> 32) == 0 || mode == SImode)
1874 if (generate)
1876 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1877 if (mode == SImode)
1878 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1879 GEN_INT ((val >> 16) & 0xffff)));
1880 else
1881 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1882 GEN_INT ((val >> 16) & 0xffff)));
1884 return 2;
1887 /* Remaining cases are all for DImode. */
1889 mask = 0xffff;
1890 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1891 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1892 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1893 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1895 if (zero_match != 2 && one_match != 2)
1897 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1898 For a 64-bit bitmask try whether changing 16 bits to all ones or
1899 zeroes creates a valid bitmask. To check any repeated bitmask,
1900 try using 16 bits from the other 32-bit half of val. */
1902 for (i = 0; i < 64; i += 16, mask <<= 16)
1904 val2 = val & ~mask;
1905 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1906 break;
1907 val2 = val | mask;
1908 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1909 break;
1910 val2 = val2 & ~mask;
1911 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1912 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1913 break;
1915 if (i != 64)
1917 if (generate)
1919 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1920 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1921 GEN_INT ((val >> i) & 0xffff)));
1923 return 2;
1927 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1928 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1929 otherwise skip zero bits. */
1931 num_insns = 1;
1932 mask = 0xffff;
1933 val2 = one_match > zero_match ? ~val : val;
1934 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1936 if (generate)
1937 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1938 ? (val | ~(mask << i))
1939 : (val & (mask << i)))));
1940 for (i += 16; i < 64; i += 16)
1942 if ((val2 & (mask << i)) == 0)
1943 continue;
1944 if (generate)
1945 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1946 GEN_INT ((val >> i) & 0xffff)));
1947 num_insns ++;
1950 return num_insns;
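/* A worked example of the routine above (illustrative only): for the
   DImode constant 0x0000123400005678 the "smaller mode" path applies and,
   when GENERATE is true, the emitted sequence is

     mov	x0, 22136		// 0x5678
     movk	x0, 0x1234, lsl 32

   with a return value of 2; 0xffffffffffff1234, by contrast, is accepted
   directly by aarch64_move_imm (a single MOVN) and costs 1 instruction.  */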
1954 void
1955 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1957 machine_mode mode = GET_MODE (dest);
1959 gcc_assert (mode == SImode || mode == DImode);
1961 /* Check on what type of symbol it is. */
1962 scalar_int_mode int_mode;
1963 if ((GET_CODE (imm) == SYMBOL_REF
1964 || GET_CODE (imm) == LABEL_REF
1965 || GET_CODE (imm) == CONST)
1966 && is_a <scalar_int_mode> (mode, &int_mode))
1968 rtx mem, base, offset;
1969 enum aarch64_symbol_type sty;
1971 /* If we have (const (plus symbol offset)), separate out the offset
1972 before we start classifying the symbol. */
1973 split_const (imm, &base, &offset);
1975 sty = aarch64_classify_symbol (base, offset);
1976 switch (sty)
1978 case SYMBOL_FORCE_TO_MEM:
1979 if (offset != const0_rtx
1980 && targetm.cannot_force_const_mem (int_mode, imm))
1982 gcc_assert (can_create_pseudo_p ());
1983 base = aarch64_force_temporary (int_mode, dest, base);
1984 base = aarch64_add_offset (int_mode, NULL, base,
1985 INTVAL (offset));
1986 aarch64_emit_move (dest, base);
1987 return;
1990 mem = force_const_mem (ptr_mode, imm);
1991 gcc_assert (mem);
1993 /* If we aren't generating PC relative literals, then
1994 we need to expand the literal pool access carefully.
1995 This is something that needs to be done in a number
1996 of places, so could well live as a separate function. */
1997 if (!aarch64_pcrelative_literal_loads)
1999 gcc_assert (can_create_pseudo_p ());
2000 base = gen_reg_rtx (ptr_mode);
2001 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2002 if (ptr_mode != Pmode)
2003 base = convert_memory_address (Pmode, base);
2004 mem = gen_rtx_MEM (ptr_mode, base);
2007 if (int_mode != ptr_mode)
2008 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2010 emit_insn (gen_rtx_SET (dest, mem));
2012 return;
2014 case SYMBOL_SMALL_TLSGD:
2015 case SYMBOL_SMALL_TLSDESC:
2016 case SYMBOL_SMALL_TLSIE:
2017 case SYMBOL_SMALL_GOT_28K:
2018 case SYMBOL_SMALL_GOT_4G:
2019 case SYMBOL_TINY_GOT:
2020 case SYMBOL_TINY_TLSIE:
2021 if (offset != const0_rtx)
2023 gcc_assert(can_create_pseudo_p ());
2024 base = aarch64_force_temporary (int_mode, dest, base);
2025 base = aarch64_add_offset (int_mode, NULL, base,
2026 INTVAL (offset));
2027 aarch64_emit_move (dest, base);
2028 return;
2030 /* FALLTHRU */
2032 case SYMBOL_SMALL_ABSOLUTE:
2033 case SYMBOL_TINY_ABSOLUTE:
2034 case SYMBOL_TLSLE12:
2035 case SYMBOL_TLSLE24:
2036 case SYMBOL_TLSLE32:
2037 case SYMBOL_TLSLE48:
2038 aarch64_load_symref_appropriately (dest, imm, sty);
2039 return;
2041 default:
2042 gcc_unreachable ();
2046 if (!CONST_INT_P (imm))
2048 if (GET_CODE (imm) == HIGH)
2049 emit_insn (gen_rtx_SET (dest, imm));
2050 else
2052 rtx mem = force_const_mem (mode, imm);
2053 gcc_assert (mem);
2054 emit_insn (gen_rtx_SET (dest, mem));
2057 return;
2060 aarch64_internal_mov_immediate (dest, imm, true,
2061 as_a <scalar_int_mode> (mode));
2064 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2065 temporary value if necessary. FRAME_RELATED_P should be true if
2066 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2067 to the generated instructions. If SCRATCHREG is known to hold
2068 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2069 immediate again.
2071 Since this function may be used to adjust the stack pointer, we must
2072 ensure that it cannot cause transient stack deallocation (for example
2073 by first incrementing SP and then decrementing when adjusting by a
2074 large immediate). */
2076 static void
2077 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2078 int scratchreg, HOST_WIDE_INT delta,
2079 bool frame_related_p, bool emit_move_imm)
2081 HOST_WIDE_INT mdelta = abs_hwi (delta);
2082 rtx this_rtx = gen_rtx_REG (mode, regnum);
2083 rtx_insn *insn;
2085 if (!mdelta)
2086 return;
2088 /* Single instruction adjustment. */
2089 if (aarch64_uimm12_shift (mdelta))
2091 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2092 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2093 return;
2096 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2097 Only do this if mdelta is not a valid move immediate, since adjusting
2098 with a move plus a single add/sub is better in that case. */
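	 /* An illustrative example (values chosen here, not taken from the
	    source): delta == 0x101234 is not a valid move immediate, so it
	    is split into a low 12-bit part and a shifted 12-bit part,
	    roughly:
	      add	sp, sp, #0x234
	      add	sp, sp, #0x101, lsl #12
	    i.e. 0x234 + 0x101000 == 0x101234.  */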
2099 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2101 HOST_WIDE_INT low_off = mdelta & 0xfff;
2103 low_off = delta < 0 ? -low_off : low_off;
2104 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2105 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2106 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2107 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2108 return;
2111 /* Emit a move immediate if required and an addition/subtraction. */
2112 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2113 if (emit_move_imm)
2114 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2115 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2116 : gen_add2_insn (this_rtx, scratch_rtx));
2117 if (frame_related_p)
2119 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2120 rtx adj = plus_constant (mode, this_rtx, delta);
2121 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2125 static inline void
2126 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2127 HOST_WIDE_INT delta)
2129 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2132 static inline void
2133 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2135 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2136 true, emit_move_imm);
2139 static inline void
2140 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2142 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2143 frame_related_p, true);
2146 static bool
2147 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2148 tree exp ATTRIBUTE_UNUSED)
2150 /* Currently, always true. */
2151 return true;
2154 /* Implement TARGET_PASS_BY_REFERENCE. */
2156 static bool
2157 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2158 machine_mode mode,
2159 const_tree type,
2160 bool named ATTRIBUTE_UNUSED)
2162 HOST_WIDE_INT size;
2163 machine_mode dummymode;
2164 int nregs;
2166 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2167 size = (mode == BLKmode && type)
2168 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2170 /* Aggregates are passed by reference based on their size. */
2171 if (type && AGGREGATE_TYPE_P (type))
2173 size = int_size_in_bytes (type);
2176 /* Variable sized arguments are always passed by reference. */
2177 if (size < 0)
2178 return true;
2180 /* Can this be a candidate to be passed in fp/simd register(s)? */
2181 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2182 &dummymode, &nregs,
2183 NULL))
2184 return false;
2186 /* Arguments which are variable sized or larger than 2 registers are
2187 passed by reference unless they are a homogeneous floating-point
2188 aggregate. */
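   /* For example (illustrative): a struct of four doubles is an HFA and is
      passed by value in SIMD/FP registers, whereas a struct of three 64-bit
      integers (24 bytes) is larger than two registers and is therefore
      passed by reference.  */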
2189 return size > 2 * UNITS_PER_WORD;
2192 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2193 static bool
2194 aarch64_return_in_msb (const_tree valtype)
2196 machine_mode dummy_mode;
2197 int dummy_int;
2199 /* Never happens in little-endian mode. */
2200 if (!BYTES_BIG_ENDIAN)
2201 return false;
2203 /* Only composite types smaller than or equal to 16 bytes can
2204 potentially be returned in registers. */
2205 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2206 || int_size_in_bytes (valtype) <= 0
2207 || int_size_in_bytes (valtype) > 16)
2208 return false;
2210 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2211 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2212 is always passed/returned in the least significant bits of fp/simd
2213 register(s). */
2214 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2215 &dummy_mode, &dummy_int, NULL))
2216 return false;
2218 return true;
2221 /* Implement TARGET_FUNCTION_VALUE.
2222 Define how to find the value returned by a function. */
2224 static rtx
2225 aarch64_function_value (const_tree type, const_tree func,
2226 bool outgoing ATTRIBUTE_UNUSED)
2228 machine_mode mode;
2229 int unsignedp;
2230 int count;
2231 machine_mode ag_mode;
2233 mode = TYPE_MODE (type);
2234 if (INTEGRAL_TYPE_P (type))
2235 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2237 if (aarch64_return_in_msb (type))
2239 HOST_WIDE_INT size = int_size_in_bytes (type);
2241 if (size % UNITS_PER_WORD != 0)
2243 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2244 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2248 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2249 &ag_mode, &count, NULL))
2251 if (!aarch64_composite_type_p (type, mode))
2253 gcc_assert (count == 1 && mode == ag_mode);
2254 return gen_rtx_REG (mode, V0_REGNUM);
2256 else
2258 int i;
2259 rtx par;
2261 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2262 for (i = 0; i < count; i++)
2264 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2265 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2266 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2267 XVECEXP (par, 0, i) = tmp;
2269 return par;
2272 else
2273 return gen_rtx_REG (mode, R0_REGNUM);
2276 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2277 Return true if REGNO is the number of a hard register in which the values
2278 of a called function may come back. */
2280 static bool
2281 aarch64_function_value_regno_p (const unsigned int regno)
2283 /* Maximum of 16 bytes can be returned in the general registers. Examples
2284 of 16-byte return values are: 128-bit integers and 16-byte small
2285 structures (excluding homogeneous floating-point aggregates). */
2286 if (regno == R0_REGNUM || regno == R1_REGNUM)
2287 return true;
2289 /* Up to four fp/simd registers can return a function value, e.g. a
2290 homogeneous floating-point aggregate having four members. */
2291 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2292 return TARGET_FLOAT;
2294 return false;
2297 /* Implement TARGET_RETURN_IN_MEMORY.
2299 If the type T of the result of a function is such that
2300 void func (T arg)
2301 would require that arg be passed as a value in a register (or set of
2302 registers) according to the parameter passing rules, then the result
2303 is returned in the same registers as would be used for such an
2304 argument. */
2306 static bool
2307 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2309 HOST_WIDE_INT size;
2310 machine_mode ag_mode;
2311 int count;
2313 if (!AGGREGATE_TYPE_P (type)
2314 && TREE_CODE (type) != COMPLEX_TYPE
2315 && TREE_CODE (type) != VECTOR_TYPE)
2316 /* Simple scalar types are always returned in registers. */
2317 return false;
2319 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2320 type,
2321 &ag_mode,
2322 &count,
2323 NULL))
2324 return false;
2326 /* Types larger than 2 registers are returned in memory. */
2327 size = int_size_in_bytes (type);
2328 return (size < 0 || size > 2 * UNITS_PER_WORD);
2331 static bool
2332 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2333 const_tree type, int *nregs)
2335 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2336 return aarch64_vfp_is_call_or_return_candidate (mode,
2337 type,
2338 &pcum->aapcs_vfp_rmode,
2339 nregs,
2340 NULL);
2343 /* Given MODE and TYPE of a function argument, return the alignment in
2344 bits. The idea is to suppress any stronger alignment requested by
2345 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2346 This is a helper function for local use only. */
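   /* For example (illustrative, typical behaviour): for
	struct s { int x; } __attribute__ ((aligned (16)));
      the only field needs 32-bit alignment, so this function returns 32
      even though TYPE_ALIGN of the struct itself is 128.  */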
2348 static unsigned int
2349 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2351 if (!type)
2352 return GET_MODE_ALIGNMENT (mode);
2354 if (integer_zerop (TYPE_SIZE (type)))
2355 return 0;
2357 gcc_assert (TYPE_MODE (type) == mode);
2359 if (!AGGREGATE_TYPE_P (type))
2360 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2362 if (TREE_CODE (type) == ARRAY_TYPE)
2363 return TYPE_ALIGN (TREE_TYPE (type));
2365 unsigned int alignment = 0;
2366 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2367 if (TREE_CODE (field) == FIELD_DECL)
2368 alignment = std::max (alignment, DECL_ALIGN (field));
2370 return alignment;
2373 /* Layout a function argument according to the AAPCS64 rules. The rule
2374 numbers refer to the rule numbers in the AAPCS64. */
2376 static void
2377 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2378 const_tree type,
2379 bool named ATTRIBUTE_UNUSED)
2381 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2382 int ncrn, nvrn, nregs;
2383 bool allocate_ncrn, allocate_nvrn;
2384 HOST_WIDE_INT size;
2386 /* We need to do this once per argument. */
2387 if (pcum->aapcs_arg_processed)
2388 return;
2390 pcum->aapcs_arg_processed = true;
2392 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2393 size
2394 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2395 UNITS_PER_WORD);
2397 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2398 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2399 mode,
2400 type,
2401 &nregs);
2403 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2404 The following code thus handles passing by SIMD/FP registers first. */
2406 nvrn = pcum->aapcs_nvrn;
2408 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2409 and homogeneous short-vector aggregates (HVA). */
2410 if (allocate_nvrn)
2412 if (!TARGET_FLOAT)
2413 aarch64_err_no_fpadvsimd (mode, "argument");
2415 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2417 pcum->aapcs_nextnvrn = nvrn + nregs;
2418 if (!aarch64_composite_type_p (type, mode))
2420 gcc_assert (nregs == 1);
2421 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2423 else
2425 rtx par;
2426 int i;
2427 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2428 for (i = 0; i < nregs; i++)
2430 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2431 V0_REGNUM + nvrn + i);
2432 tmp = gen_rtx_EXPR_LIST
2433 (VOIDmode, tmp,
2434 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2435 XVECEXP (par, 0, i) = tmp;
2437 pcum->aapcs_reg = par;
2439 return;
2441 else
2443 /* C.3 NSRN is set to 8. */
2444 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2445 goto on_stack;
2449 ncrn = pcum->aapcs_ncrn;
2450 nregs = size / UNITS_PER_WORD;
2452 /* C.6 - C.9, though the sign and zero extension semantics are
2453 handled elsewhere. This is the case where the argument fits
2454 entirely in general registers. */
2455 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2458 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2460 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2461 rounded up to the next even number. */
2462 if (nregs == 2
2463 && ncrn % 2
2464 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2465 comparison is there because for > 16 * BITS_PER_UNIT
2466 alignment nregs should be > 2 and therefore it should be
2467 passed by reference rather than value. */
2468 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2470 ++ncrn;
2471 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2474 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2475 A reg is still generated for it, but the caller should be smart
2476 enough not to use it. */
2477 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2478 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2479 else
2481 rtx par;
2482 int i;
2484 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2485 for (i = 0; i < nregs; i++)
2487 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2488 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2489 GEN_INT (i * UNITS_PER_WORD));
2490 XVECEXP (par, 0, i) = tmp;
2492 pcum->aapcs_reg = par;
2495 pcum->aapcs_nextncrn = ncrn + nregs;
2496 return;
2499 /* C.11 */
2500 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2502 /* The argument is passed on the stack; record the needed number of words for
2503 this argument and align the total size if necessary. */
2504 on_stack:
2505 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2507 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2508 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2509 16 / UNITS_PER_WORD);
2510 return;
2513 /* Implement TARGET_FUNCTION_ARG. */
2515 static rtx
2516 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2517 const_tree type, bool named)
2519 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2520 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2522 if (mode == VOIDmode)
2523 return NULL_RTX;
2525 aarch64_layout_arg (pcum_v, mode, type, named);
2526 return pcum->aapcs_reg;
2529 void
2530 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2531 const_tree fntype ATTRIBUTE_UNUSED,
2532 rtx libname ATTRIBUTE_UNUSED,
2533 const_tree fndecl ATTRIBUTE_UNUSED,
2534 unsigned n_named ATTRIBUTE_UNUSED)
2536 pcum->aapcs_ncrn = 0;
2537 pcum->aapcs_nvrn = 0;
2538 pcum->aapcs_nextncrn = 0;
2539 pcum->aapcs_nextnvrn = 0;
2540 pcum->pcs_variant = ARM_PCS_AAPCS64;
2541 pcum->aapcs_reg = NULL_RTX;
2542 pcum->aapcs_arg_processed = false;
2543 pcum->aapcs_stack_words = 0;
2544 pcum->aapcs_stack_size = 0;
2546 if (!TARGET_FLOAT
2547 && fndecl && TREE_PUBLIC (fndecl)
2548 && fntype && fntype != error_mark_node)
2550 const_tree type = TREE_TYPE (fntype);
2551 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2552 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2553 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2554 &mode, &nregs, NULL))
2555 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2557 return;
2560 static void
2561 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2562 machine_mode mode,
2563 const_tree type,
2564 bool named)
2566 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2567 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2569 aarch64_layout_arg (pcum_v, mode, type, named);
2570 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2571 != (pcum->aapcs_stack_words != 0));
2572 pcum->aapcs_arg_processed = false;
2573 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2574 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2575 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2576 pcum->aapcs_stack_words = 0;
2577 pcum->aapcs_reg = NULL_RTX;
2581 bool
2582 aarch64_function_arg_regno_p (unsigned regno)
2584 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2585 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2588 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2589 PARM_BOUNDARY bits of alignment, but will be given anything up
2590 to STACK_BOUNDARY bits if the type requires it. This makes sure
2591 that both before and after the layout of each argument, the Next
2592 Stacked Argument Address (NSAA) will have a minimum alignment of
2593 8 bytes. */
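   /* For example (illustrative): a char argument has an alignment of 8
      bits and gets MAX (8, 64) == PARM_BOUNDARY == 64, while a struct
      containing an __int128 field has an alignment of 128 bits and gets
      MIN (128, 128) == STACK_BOUNDARY == 128.  */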
2595 static unsigned int
2596 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2598 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2599 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2602 /* Implement TARGET_FUNCTION_ARG_PADDING.
2604 Small aggregate types are placed in the lowest memory address.
2606 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2608 static pad_direction
2609 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2611 /* On little-endian targets, the least significant byte of every stack
2612 argument is passed at the lowest byte address of the stack slot. */
2613 if (!BYTES_BIG_ENDIAN)
2614 return PAD_UPWARD;
2616 /* Otherwise, integral, floating-point and pointer types are padded downward:
2617 the least significant byte of a stack argument is passed at the highest
2618 byte address of the stack slot. */
2619 if (type
2620 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2621 || POINTER_TYPE_P (type))
2622 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2623 return PAD_DOWNWARD;
2625 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2626 return PAD_UPWARD;
2629 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2631 It specifies padding for the last (and possibly the only)
2632 element of a block move between registers and memory. Assuming
2633 the block is in memory, padding upward means that the last
2634 element is padded after its most significant byte, while with
2635 downward padding the last element is padded on its least
2636 significant byte side.
2638 Small aggregates and small complex types are always padded
2639 upwards.
2641 We don't need to worry about homogeneous floating-point or
2642 short-vector aggregates; their move is not affected by the
2643 padding direction determined here. Regardless of endianness,
2644 each element of such an aggregate is put in the least
2645 significant bits of a fp/simd register.
2647 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2648 register has useful data, and return the opposite if the most
2649 significant byte does. */
2651 bool
2652 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2653 bool first ATTRIBUTE_UNUSED)
2656 /* Small composite types are always padded upward. */
2657 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2659 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2660 : GET_MODE_SIZE (mode));
2661 if (size < 2 * UNITS_PER_WORD)
2662 return true;
2665 /* Otherwise, use the default padding. */
2666 return !BYTES_BIG_ENDIAN;
2669 static scalar_int_mode
2670 aarch64_libgcc_cmp_return_mode (void)
2672 return SImode;
2675 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2677 /* We use the 12-bit shifted immediate arithmetic instructions so values
2678 must be a multiple of (1 << 12), i.e. 4096. */
2679 #define ARITH_FACTOR 4096
2681 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2682 #error Cannot use simple address calculation for stack probing
2683 #endif
2685 /* The pair of scratch registers used for stack probing. */
2686 #define PROBE_STACK_FIRST_REG 9
2687 #define PROBE_STACK_SECOND_REG 10
2689 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2690 inclusive. These are offsets from the current stack pointer. */
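   /* For example (illustrative, assuming the default 4 kB probe interval):
      with FIRST == 4096 and SIZE == 2048 a single probe suffices, roughly
	sub	x9, sp, #8192
	str	xzr, [x9, #2048]
      where x9 is PROBE_STACK_FIRST_REG.  */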
2692 static void
2693 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2695 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2697 /* See the same assertion on PROBE_INTERVAL above. */
2698 gcc_assert ((first % ARITH_FACTOR) == 0);
2700 /* See if we have a constant small number of probes to generate. If so,
2701 that's the easy case. */
2702 if (size <= PROBE_INTERVAL)
2704 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2706 emit_set_insn (reg1,
2707 plus_constant (Pmode,
2708 stack_pointer_rtx, -(first + base)));
2709 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2712 /* The run-time loop is made up of 8 insns in the generic case while the
2713 compile-time loop is made up of 4+2*(n-2) insns for n probe intervals. */
2714 else if (size <= 4 * PROBE_INTERVAL)
2716 HOST_WIDE_INT i, rem;
2718 emit_set_insn (reg1,
2719 plus_constant (Pmode,
2720 stack_pointer_rtx,
2721 -(first + PROBE_INTERVAL)));
2722 emit_stack_probe (reg1);
2724 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2725 it exceeds SIZE. If only two probes are needed, this will not
2726 generate any code. Then probe at FIRST + SIZE. */
2727 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2729 emit_set_insn (reg1,
2730 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2731 emit_stack_probe (reg1);
2734 rem = size - (i - PROBE_INTERVAL);
2735 if (rem > 256)
2737 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2739 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2740 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2742 else
2743 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2746 /* Otherwise, do the same as above, but in a loop. Note that we must be
2747 extra careful with variables wrapping around because we might be at
2748 the very top (or the very bottom) of the address space and we have
2749 to be able to handle this case properly; in particular, we use an
2750 equality test for the loop condition. */
2751 else
2753 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2755 /* Step 1: round SIZE to the previous multiple of the interval. */
2757 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2760 /* Step 2: compute initial and final value of the loop counter. */
2762 /* TEST_ADDR = SP + FIRST. */
2763 emit_set_insn (reg1,
2764 plus_constant (Pmode, stack_pointer_rtx, -first));
2766 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2767 HOST_WIDE_INT adjustment = - (first + rounded_size);
2768 if (! aarch64_uimm12_shift (adjustment))
2770 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2771 true, Pmode);
2772 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2774 else
2776 emit_set_insn (reg2,
2777 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2780 /* Step 3: the loop
2784 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2785 probe at TEST_ADDR
2787 while (TEST_ADDR != LAST_ADDR)
2789 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2790 until it is equal to ROUNDED_SIZE. */
2792 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2795 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2796 that SIZE is equal to ROUNDED_SIZE. */
2798 if (size != rounded_size)
2800 HOST_WIDE_INT rem = size - rounded_size;
2802 if (rem > 256)
2804 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2806 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2807 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2809 else
2810 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2814 /* Make sure nothing is scheduled before we are done. */
2815 emit_insn (gen_blockage ());
2818 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2819 absolute addresses. */
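   /* The emitted loop looks roughly like this (illustrative, assuming the
      default 4 kB probe interval):
	.LPSRL0:
	      sub	x9, x9, #4096
	      str	xzr, [x9]
	      cmp	x9, x10
	      b.ne	.LPSRL0  */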
2821 const char *
2822 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2824 static int labelno = 0;
2825 char loop_lab[32];
2826 rtx xops[2];
2828 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2830 /* Loop. */
2831 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2833 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2834 xops[0] = reg1;
2835 xops[1] = GEN_INT (PROBE_INTERVAL);
2836 output_asm_insn ("sub\t%0, %0, %1", xops);
2838 /* Probe at TEST_ADDR. */
2839 output_asm_insn ("str\txzr, [%0]", xops);
2841 /* Test if TEST_ADDR == LAST_ADDR. */
2842 xops[1] = reg2;
2843 output_asm_insn ("cmp\t%0, %1", xops);
2845 /* Branch. */
2846 fputs ("\tb.ne\t", asm_out_file);
2847 assemble_name_raw (asm_out_file, loop_lab);
2848 fputc ('\n', asm_out_file);
2850 return "";
2853 static bool
2854 aarch64_frame_pointer_required (void)
2856 /* In aarch64_override_options_after_change
2857 flag_omit_leaf_frame_pointer turns off the frame pointer by
2858 default. Turn it back on now if we've not got a leaf
2859 function. */
2860 if (flag_omit_leaf_frame_pointer
2861 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2862 return true;
2864 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2865 if (crtl->calls_eh_return)
2866 return true;
2868 return false;
2871 /* Mark the registers that need to be saved by the callee and calculate
2872 the size of the callee-saved registers area and frame record (both FP
2873 and LR may be omitted). */
2874 static void
2875 aarch64_layout_frame (void)
2877 HOST_WIDE_INT offset = 0;
2878 int regno, last_fp_reg = INVALID_REGNUM;
2880 if (reload_completed && cfun->machine->frame.laid_out)
2881 return;
2883 #define SLOT_NOT_REQUIRED (-2)
2884 #define SLOT_REQUIRED (-1)
2886 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2887 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2889 /* First mark all the registers that really need to be saved... */
2890 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2891 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2893 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2894 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2896 /* ... that includes the eh data registers (if needed)... */
2897 if (crtl->calls_eh_return)
2898 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2899 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2900 = SLOT_REQUIRED;
2902 /* ... and any callee saved register that dataflow says is live. */
2903 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2904 if (df_regs_ever_live_p (regno)
2905 && (regno == R30_REGNUM
2906 || !call_used_regs[regno]))
2907 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2909 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2910 if (df_regs_ever_live_p (regno)
2911 && !call_used_regs[regno])
2913 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2914 last_fp_reg = regno;
2917 if (frame_pointer_needed)
2919 /* FP and LR are placed in the linkage record. */
2920 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2921 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2922 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2923 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2924 offset += 2 * UNITS_PER_WORD;
2927 /* Now assign stack slots for them. */
2928 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2929 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2931 cfun->machine->frame.reg_offset[regno] = offset;
2932 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2933 cfun->machine->frame.wb_candidate1 = regno;
2934 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2935 cfun->machine->frame.wb_candidate2 = regno;
2936 offset += UNITS_PER_WORD;
2939 HOST_WIDE_INT max_int_offset = offset;
2940 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2941 bool has_align_gap = offset != max_int_offset;
2943 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2944 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2946 /* If there is an alignment gap between integer and fp callee-saves,
2947 allocate the last fp register to it if possible. */
2948 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2950 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2951 break;
2954 cfun->machine->frame.reg_offset[regno] = offset;
2955 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2956 cfun->machine->frame.wb_candidate1 = regno;
2957 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2958 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2959 cfun->machine->frame.wb_candidate2 = regno;
2960 offset += UNITS_PER_WORD;
2963 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2965 cfun->machine->frame.saved_regs_size = offset;
2967 HOST_WIDE_INT varargs_and_saved_regs_size
2968 = offset + cfun->machine->frame.saved_varargs_size;
2970 cfun->machine->frame.hard_fp_offset
2971 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2972 STACK_BOUNDARY / BITS_PER_UNIT);
2974 cfun->machine->frame.frame_size
2975 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2976 + crtl->outgoing_args_size,
2977 STACK_BOUNDARY / BITS_PER_UNIT);
2979 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2981 cfun->machine->frame.initial_adjust = 0;
2982 cfun->machine->frame.final_adjust = 0;
2983 cfun->machine->frame.callee_adjust = 0;
2984 cfun->machine->frame.callee_offset = 0;
2986 HOST_WIDE_INT max_push_offset = 0;
2987 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2988 max_push_offset = 512;
2989 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2990 max_push_offset = 256;
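   /* These limits reflect the writeback addressing modes used for the
      pushes: a pre-indexed STP of X registers takes a signed 7-bit
      immediate scaled by 8 (multiples of 8 in [-512, 504]), while a
      pre-indexed STR takes a signed 9-bit immediate ([-256, 255]).  */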
2992 if (cfun->machine->frame.frame_size < max_push_offset
2993 && crtl->outgoing_args_size == 0)
2995 /* Simple, small frame with no outgoing arguments:
2996 stp reg1, reg2, [sp, -frame_size]!
2997 stp reg3, reg4, [sp, 16] */
2998 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3000 else if ((crtl->outgoing_args_size
3001 + cfun->machine->frame.saved_regs_size < 512)
3002 && !(cfun->calls_alloca
3003 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3005 /* Frame with small outgoing arguments:
3006 sub sp, sp, frame_size
3007 stp reg1, reg2, [sp, outgoing_args_size]
3008 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3009 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3010 cfun->machine->frame.callee_offset
3011 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3013 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3015 /* Frame with large outgoing arguments but a small local area:
3016 stp reg1, reg2, [sp, -hard_fp_offset]!
3017 stp reg3, reg4, [sp, 16]
3018 sub sp, sp, outgoing_args_size */
3019 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3020 cfun->machine->frame.final_adjust
3021 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3023 else if (!frame_pointer_needed
3024 && varargs_and_saved_regs_size < max_push_offset)
3026 /* Frame with large local area and outgoing arguments (this pushes the
3027 callee-saves first, followed by the locals and outgoing area):
3028 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3029 stp reg3, reg4, [sp, 16]
3030 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3031 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3032 cfun->machine->frame.final_adjust
3033 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3034 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3035 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3037 else
3039 /* Frame with large local area and outgoing arguments using frame pointer:
3040 sub sp, sp, hard_fp_offset
3041 stp x29, x30, [sp, 0]
3042 add x29, sp, 0
3043 stp reg3, reg4, [sp, 16]
3044 sub sp, sp, outgoing_args_size */
3045 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3046 cfun->machine->frame.final_adjust
3047 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3050 cfun->machine->frame.laid_out = true;
3053 /* Return true if the register REGNO is saved on entry to
3054 the current function. */
3056 static bool
3057 aarch64_register_saved_on_entry (int regno)
3059 return cfun->machine->frame.reg_offset[regno] >= 0;
3062 /* Return the next register, from REGNO up to and including LIMIT, that
3063 the callee needs to save. */
3065 static unsigned
3066 aarch64_next_callee_save (unsigned regno, unsigned limit)
3068 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3069 regno ++;
3070 return regno;
3073 /* Push the register number REGNO of mode MODE to the stack with write-back
3074 adjusting the stack by ADJUSTMENT. */
3076 static void
3077 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3078 HOST_WIDE_INT adjustment)
3080 rtx base_rtx = stack_pointer_rtx;
3081 rtx insn, reg, mem;
3083 reg = gen_rtx_REG (mode, regno);
3084 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3085 plus_constant (Pmode, base_rtx, -adjustment));
3086 mem = gen_frame_mem (mode, mem);
3088 insn = emit_move_insn (mem, reg);
3089 RTX_FRAME_RELATED_P (insn) = 1;
3092 /* Generate and return an instruction to store the pair of registers
3093 REG and REG2 of mode MODE to location BASE with write-back adjusting
3094 the stack location BASE by ADJUSTMENT. */
3096 static rtx
3097 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3098 HOST_WIDE_INT adjustment)
3100 switch (mode)
3102 case E_DImode:
3103 return gen_storewb_pairdi_di (base, base, reg, reg2,
3104 GEN_INT (-adjustment),
3105 GEN_INT (UNITS_PER_WORD - adjustment));
3106 case E_DFmode:
3107 return gen_storewb_pairdf_di (base, base, reg, reg2,
3108 GEN_INT (-adjustment),
3109 GEN_INT (UNITS_PER_WORD - adjustment));
3110 default:
3111 gcc_unreachable ();
3115 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3116 stack pointer by ADJUSTMENT. */
3118 static void
3119 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3121 rtx_insn *insn;
3122 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3124 if (regno2 == INVALID_REGNUM)
3125 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3127 rtx reg1 = gen_rtx_REG (mode, regno1);
3128 rtx reg2 = gen_rtx_REG (mode, regno2);
3130 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3131 reg2, adjustment));
3132 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3133 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3134 RTX_FRAME_RELATED_P (insn) = 1;
3137 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
3138 adjusting it by ADJUSTMENT afterwards. */
3140 static rtx
3141 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3142 HOST_WIDE_INT adjustment)
3144 switch (mode)
3146 case E_DImode:
3147 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3148 GEN_INT (UNITS_PER_WORD));
3149 case E_DFmode:
3150 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3151 GEN_INT (UNITS_PER_WORD));
3152 default:
3153 gcc_unreachable ();
3157 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3158 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3159 into CFI_OPS. */
3161 static void
3162 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3163 rtx *cfi_ops)
3165 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3166 rtx reg1 = gen_rtx_REG (mode, regno1);
3168 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3170 if (regno2 == INVALID_REGNUM)
3172 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3173 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3174 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3176 else
3178 rtx reg2 = gen_rtx_REG (mode, regno2);
3179 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3180 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3181 reg2, adjustment));
3185 /* Generate and return a store pair instruction of mode MODE to store
3186 register REG1 to MEM1 and register REG2 to MEM2. */
3188 static rtx
3189 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3190 rtx reg2)
3192 switch (mode)
3194 case E_DImode:
3195 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3197 case E_DFmode:
3198 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3200 default:
3201 gcc_unreachable ();
3205 /* Generate and return a load pair instruction of mode MODE to load register
3206 REG1 from MEM1 and register REG2 from MEM2. */
3208 static rtx
3209 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3210 rtx mem2)
3212 switch (mode)
3214 case E_DImode:
3215 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3217 case E_DFmode:
3218 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3220 default:
3221 gcc_unreachable ();
3225 /* Return TRUE if return address signing should be enabled for the current
3226 function, otherwise return FALSE. */
3228 bool
3229 aarch64_return_address_signing_enabled (void)
3231 /* This function should only be called after the frame is laid out. */
3232 gcc_assert (cfun->machine->frame.laid_out);
3234 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3235 if its LR is pushed onto the stack. */
3236 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3237 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3238 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3241 /* Emit code to save the callee-saved registers from register number START
3242 to LIMIT to the stack at the location starting at offset START_OFFSET,
3243 skipping any write-back candidates if SKIP_WB is true. */
3245 static void
3246 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3247 unsigned start, unsigned limit, bool skip_wb)
3249 rtx_insn *insn;
3250 unsigned regno;
3251 unsigned regno2;
3253 for (regno = aarch64_next_callee_save (start, limit);
3254 regno <= limit;
3255 regno = aarch64_next_callee_save (regno + 1, limit))
3257 rtx reg, mem;
3258 HOST_WIDE_INT offset;
3260 if (skip_wb
3261 && (regno == cfun->machine->frame.wb_candidate1
3262 || regno == cfun->machine->frame.wb_candidate2))
3263 continue;
3265 if (cfun->machine->reg_is_wrapped_separately[regno])
3266 continue;
3268 reg = gen_rtx_REG (mode, regno);
3269 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3270 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3271 offset));
3273 regno2 = aarch64_next_callee_save (regno + 1, limit);
3275 if (regno2 <= limit
3276 && !cfun->machine->reg_is_wrapped_separately[regno2]
3277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3278 == cfun->machine->frame.reg_offset[regno2]))
3281 rtx reg2 = gen_rtx_REG (mode, regno2);
3282 rtx mem2;
3284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3285 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3286 offset));
3287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3288 reg2));
3290 /* The first part of a frame-related parallel insn is
3291 always assumed to be relevant to the frame
3292 calculations; subsequent parts are only
3293 frame-related if explicitly marked. */
3294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3295 regno = regno2;
3297 else
3298 insn = emit_move_insn (mem, reg);
3300 RTX_FRAME_RELATED_P (insn) = 1;
3304 /* Emit code to restore the callee registers of mode MODE from register
3305 number START up to and including LIMIT. Restore from the stack offset
3306 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3307 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3309 static void
3310 aarch64_restore_callee_saves (machine_mode mode,
3311 HOST_WIDE_INT start_offset, unsigned start,
3312 unsigned limit, bool skip_wb, rtx *cfi_ops)
3314 rtx base_rtx = stack_pointer_rtx;
3315 unsigned regno;
3316 unsigned regno2;
3317 HOST_WIDE_INT offset;
3319 for (regno = aarch64_next_callee_save (start, limit);
3320 regno <= limit;
3321 regno = aarch64_next_callee_save (regno + 1, limit))
3323 if (cfun->machine->reg_is_wrapped_separately[regno])
3324 continue;
3326 rtx reg, mem;
3328 if (skip_wb
3329 && (regno == cfun->machine->frame.wb_candidate1
3330 || regno == cfun->machine->frame.wb_candidate2))
3331 continue;
3333 reg = gen_rtx_REG (mode, regno);
3334 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3335 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3337 regno2 = aarch64_next_callee_save (regno + 1, limit);
3339 if (regno2 <= limit
3340 && !cfun->machine->reg_is_wrapped_separately[regno2]
3341 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3342 == cfun->machine->frame.reg_offset[regno2]))
3344 rtx reg2 = gen_rtx_REG (mode, regno2);
3345 rtx mem2;
3347 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3348 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3349 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3351 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3352 regno = regno2;
3354 else
3355 emit_move_insn (reg, mem);
3356 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3360 static inline bool
3361 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3362 HOST_WIDE_INT offset)
3364 return offset >= -256 && offset < 256;
3367 static inline bool
3368 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3370 return (offset >= 0
3371 && offset < 4096 * GET_MODE_SIZE (mode)
3372 && offset % GET_MODE_SIZE (mode) == 0);
3375 bool
3376 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3378 return (offset >= -64 * GET_MODE_SIZE (mode)
3379 && offset < 64 * GET_MODE_SIZE (mode)
3380 && offset % GET_MODE_SIZE (mode) == 0);
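   /* For DImode, for example, these three predicates accept offsets in
      [-256, 255] (unscaled), multiples of 8 in [0, 32760], and multiples
      of 8 in [-512, 504] respectively.  */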
3383 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3385 static sbitmap
3386 aarch64_get_separate_components (void)
3388 aarch64_layout_frame ();
3390 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3391 bitmap_clear (components);
3393 /* The registers we need saved to the frame. */
3394 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3395 if (aarch64_register_saved_on_entry (regno))
3397 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3398 if (!frame_pointer_needed)
3399 offset += cfun->machine->frame.frame_size
3400 - cfun->machine->frame.hard_fp_offset;
3401 /* Check that we can access the stack slot of the register with one
3402 direct load with no adjustments needed. */
3403 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3404 bitmap_set_bit (components, regno);
3407 /* Don't mess with the hard frame pointer. */
3408 if (frame_pointer_needed)
3409 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3411 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3412 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3413 /* If aarch64_layout_frame has chosen registers to store/restore with
3414 writeback, don't interfere with them, to avoid having to output explicit
3415 stack adjustment instructions. */
3416 if (reg2 != INVALID_REGNUM)
3417 bitmap_clear_bit (components, reg2);
3418 if (reg1 != INVALID_REGNUM)
3419 bitmap_clear_bit (components, reg1);
3421 bitmap_clear_bit (components, LR_REGNUM);
3422 bitmap_clear_bit (components, SP_REGNUM);
3424 return components;
3427 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3429 static sbitmap
3430 aarch64_components_for_bb (basic_block bb)
3432 bitmap in = DF_LIVE_IN (bb);
3433 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3434 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3436 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3437 bitmap_clear (components);
3439 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3440 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3441 if ((!call_used_regs[regno])
3442 && (bitmap_bit_p (in, regno)
3443 || bitmap_bit_p (gen, regno)
3444 || bitmap_bit_p (kill, regno)))
3445 bitmap_set_bit (components, regno);
3447 return components;
3450 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3451 Nothing to do for aarch64. */
3453 static void
3454 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3458 /* Return the next set bit in BMP from START onwards. Return the total number
3459 of bits in BMP if no set bit is found at or after START. */
3461 static unsigned int
3462 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3464 unsigned int nbits = SBITMAP_SIZE (bmp);
3465 if (start == nbits)
3466 return start;
3468 gcc_assert (start < nbits);
3469 for (unsigned int i = start; i < nbits; i++)
3470 if (bitmap_bit_p (bmp, i))
3471 return i;
3473 return nbits;
3476 /* Do the work for aarch64_emit_prologue_components and
3477 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3478 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3479 for these components or the epilogue sequence. That is, it determines
3480 whether we should emit stores or loads and what kind of CFA notes to attach
3481 to the insns. Otherwise the logic for the two sequences is very
3482 similar. */
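   /* For example (illustrative): if x19 and x20 are both handled here and
      occupy adjacent stack slots, they are saved or restored with a single
      stp/ldp; otherwise each register gets its own str/ldr.  */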
3484 static void
3485 aarch64_process_components (sbitmap components, bool prologue_p)
3487 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3488 ? HARD_FRAME_POINTER_REGNUM
3489 : STACK_POINTER_REGNUM);
3491 unsigned last_regno = SBITMAP_SIZE (components);
3492 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3493 rtx_insn *insn = NULL;
3495 while (regno != last_regno)
3497 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3498 so DFmode for the vector registers is enough. */
3499 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3500 rtx reg = gen_rtx_REG (mode, regno);
3501 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3502 if (!frame_pointer_needed)
3503 offset += cfun->machine->frame.frame_size
3504 - cfun->machine->frame.hard_fp_offset;
3505 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3506 rtx mem = gen_frame_mem (mode, addr);
3508 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3509 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3510 /* No more registers to handle after REGNO.
3511 Emit a single save/restore and exit. */
3512 if (regno2 == last_regno)
3514 insn = emit_insn (set);
3515 RTX_FRAME_RELATED_P (insn) = 1;
3516 if (prologue_p)
3517 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3518 else
3519 add_reg_note (insn, REG_CFA_RESTORE, reg);
3520 break;
3523 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3524 /* The next register is not of the same class or its offset is not
3525 mergeable with the current one into a pair. */
3526 if (!satisfies_constraint_Ump (mem)
3527 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3528 || (offset2 - cfun->machine->frame.reg_offset[regno])
3529 != GET_MODE_SIZE (mode))
3531 insn = emit_insn (set);
3532 RTX_FRAME_RELATED_P (insn) = 1;
3533 if (prologue_p)
3534 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3535 else
3536 add_reg_note (insn, REG_CFA_RESTORE, reg);
3538 regno = regno2;
3539 continue;
3542 /* REGNO2 can be saved/restored in a pair with REGNO. */
3543 rtx reg2 = gen_rtx_REG (mode, regno2);
3544 if (!frame_pointer_needed)
3545 offset2 += cfun->machine->frame.frame_size
3546 - cfun->machine->frame.hard_fp_offset;
3547 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3548 rtx mem2 = gen_frame_mem (mode, addr2);
3549 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3550 : gen_rtx_SET (reg2, mem2);
3552 if (prologue_p)
3553 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3554 else
3555 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3557 RTX_FRAME_RELATED_P (insn) = 1;
3558 if (prologue_p)
3560 add_reg_note (insn, REG_CFA_OFFSET, set);
3561 add_reg_note (insn, REG_CFA_OFFSET, set2);
3563 else
3565 add_reg_note (insn, REG_CFA_RESTORE, reg);
3566 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3569 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3573 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3575 static void
3576 aarch64_emit_prologue_components (sbitmap components)
3578 aarch64_process_components (components, true);
3581 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3583 static void
3584 aarch64_emit_epilogue_components (sbitmap components)
3586 aarch64_process_components (components, false);
3589 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3591 static void
3592 aarch64_set_handled_components (sbitmap components)
3594 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3595 if (bitmap_bit_p (components, regno))
3596 cfun->machine->reg_is_wrapped_separately[regno] = true;
3599 /* AArch64 stack frames generated by this compiler look like:
3601 +-------------------------------+
3603 | incoming stack arguments |
3605 +-------------------------------+
3606 | | <-- incoming stack pointer (aligned)
3607 | callee-allocated save area |
3608 | for register varargs |
3610 +-------------------------------+
3611 | local variables | <-- frame_pointer_rtx
3613 +-------------------------------+
3614 | padding0 | \
3615 +-------------------------------+ |
3616 | callee-saved registers | | frame.saved_regs_size
3617 +-------------------------------+ |
3618 | LR' | |
3619 +-------------------------------+ |
3620 | FP' | / <- hard_frame_pointer_rtx (aligned)
3621 +-------------------------------+
3622 | dynamic allocation |
3623 +-------------------------------+
3624 | padding |
3625 +-------------------------------+
3626 | outgoing stack arguments | <-- arg_pointer
3628 +-------------------------------+
3629 | | <-- stack_pointer_rtx (aligned)
3631 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3632 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3633 unchanged. */
3635 /* Generate the prologue instructions for entry into a function.
3636 Establish the stack frame by decreasing the stack pointer with a
3637 properly calculated size and, if necessary, create a frame record
3638 filled with the values of LR and previous frame pointer. The
3639 current FP is also set up if it is in use. */
3641 void
3642 aarch64_expand_prologue (void)
3644 aarch64_layout_frame ();
3646 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3647 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3648 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3649 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3650 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3651 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3652 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3653 rtx_insn *insn;
3655 /* Sign return address for functions. */
3656 if (aarch64_return_address_signing_enabled ())
3658 insn = emit_insn (gen_pacisp ());
3659 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3660 RTX_FRAME_RELATED_P (insn) = 1;
3663 if (flag_stack_usage_info)
3664 current_function_static_stack_size = frame_size;
3666 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3668 if (crtl->is_leaf && !cfun->calls_alloca)
3670 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3671 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3672 frame_size - STACK_CHECK_PROTECT);
3674 else if (frame_size > 0)
3675 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3678 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3680 if (callee_adjust != 0)
3681 aarch64_push_regs (reg1, reg2, callee_adjust);
3683 if (frame_pointer_needed)
3685 if (callee_adjust == 0)
3686 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3687 R30_REGNUM, false);
3688 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3689 stack_pointer_rtx,
3690 GEN_INT (callee_offset)));
3691 RTX_FRAME_RELATED_P (insn) = 1;
3692 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3695 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3696 callee_adjust != 0 || frame_pointer_needed);
3697 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3698 callee_adjust != 0 || frame_pointer_needed);
3699 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3702 /* Return TRUE if we can use a simple_return insn.
3704 This function checks whether the callee-saved stack area is empty, which
3705 means no restore actions are needed. The pro_and_epilogue pass uses
3706 this to check whether the shrink-wrapping optimization is feasible. */
3708 bool
3709 aarch64_use_return_insn_p (void)
3711 if (!reload_completed)
3712 return false;
3714 if (crtl->profile)
3715 return false;
3717 aarch64_layout_frame ();
3719 return cfun->machine->frame.frame_size == 0;
3722 /* Generate the epilogue instructions for returning from a function.
3723 This is almost exactly the reverse of the prologue sequence, except
3724 that we need to insert barriers to avoid scheduling loads that read
3725 from a deallocated stack, and we optimize the unwind records by
3726 emitting them all together if possible. */
3727 void
3728 aarch64_expand_epilogue (bool for_sibcall)
3730 aarch64_layout_frame ();
3732 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3733 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3734 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3735 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3736 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3737 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3738 rtx cfi_ops = NULL;
3739 rtx_insn *insn;
3741 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3742 bool need_barrier_p = (get_frame_size ()
3743 + cfun->machine->frame.saved_varargs_size) != 0;
3745 /* Emit a barrier to prevent loads from a deallocated stack. */
3746 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3747 || crtl->calls_eh_return)
3749 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3750 need_barrier_p = false;
3753 /* Restore the stack pointer from the frame pointer if it may not
3754 be the same as the stack pointer. */
3755 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3757 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3758 hard_frame_pointer_rtx,
3759 GEN_INT (-callee_offset)));
3760 /* If writeback is used when restoring callee-saves, the CFA
3761 is restored on the instruction doing the writeback. */
3762 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3764 else
3765 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3767 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3768 callee_adjust != 0, &cfi_ops);
3769 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3770 callee_adjust != 0, &cfi_ops);
3772 if (need_barrier_p)
3773 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3775 if (callee_adjust != 0)
3776 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3778 if (callee_adjust != 0 || initial_adjust > 65536)
3780 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3781 insn = get_last_insn ();
3782 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3783 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3784 RTX_FRAME_RELATED_P (insn) = 1;
3785 cfi_ops = NULL;
3788 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3790 if (cfi_ops)
3792 /* Emit delayed restores and reset the CFA to be SP. */
3793 insn = get_last_insn ();
3794 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3795 REG_NOTES (insn) = cfi_ops;
3796 RTX_FRAME_RELATED_P (insn) = 1;
3799 /* We prefer to emit the combined return/authenticate instruction RETAA,
3800 however there are three cases in which we must instead emit an explicit
3801 authentication instruction.
3803 1) Sibcalls don't return in a normal way, so if we're about to call one
3804 we must authenticate.
3806 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3807 generating code for !TARGET_ARMV8_3 we can't use it and must
3808 explicitly authenticate.
3810 3) On an eh_return path we make extra stack adjustments to update the
3811 canonical frame address to be the exception handler's CFA. We want
3812 to authenticate using the CFA of the function which calls eh_return. */
3814 if (aarch64_return_address_signing_enabled ()
3815 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3817 insn = emit_insn (gen_autisp ());
3818 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3819 RTX_FRAME_RELATED_P (insn) = 1;
3822 /* Stack adjustment for exception handler. */
3823 if (crtl->calls_eh_return)
3825 /* We need to unwind the stack by the offset computed by
3826 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3827 to be SP; letting the CFA move during this adjustment
3828 is just as correct as retaining the CFA from the body
3829 of the function. Therefore, do nothing special. */
3830 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3833 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3834 if (!for_sibcall)
3835 emit_jump_insn (ret_rtx);
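/* As a rough illustration (not generated by this function verbatim), a
   small frame with a frame pointer and callee_adjust == 16 typically
   tears down as:

       ldp x29, x30, [sp], 16   // restore FP/LR and pop the frame
       ret

   and with return-address signing on an ARMv8.3-A target the final
   authenticate-and-return may be the single RETAA instruction. */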
3838 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3839 normally or return to a previous frame after unwinding.
3841 An EH return uses a single shared return sequence. The epilogue is
3842 exactly like a normal epilogue except that it has an extra input
3843 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3844 that must be applied after the frame has been destroyed. An extra label
3845 is inserted before the epilogue which initializes this register to zero,
3846 and this is the entry point for a normal return.
3848 An actual EH return updates the return address, initializes the stack
3849 adjustment and jumps directly into the epilogue (bypassing the zeroing
3850 of the adjustment). Since the return address is typically saved on the
3851 stack when a function makes a call, the saved LR must be updated outside
3852 the epilogue.
3854 This poses problems as the store is generated well before the epilogue,
3855 so the offset of LR is not known yet. Also optimizations will remove the
3856 store as it appears dead, even after the epilogue is generated (as the
3857 base or offset for loading LR is different in many cases).
3859 To avoid these problems this implementation forces the frame pointer
3860 in eh_return functions so that the location of LR is fixed and known early.
3861 It also marks the store volatile, so no optimization is permitted to
3862 remove the store. */
3864 aarch64_eh_return_handler_rtx (void)
3866 rtx tmp = gen_frame_mem (Pmode,
3867 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3869 /* Mark the store volatile, so no optimization is permitted to remove it. */
3870 MEM_VOLATILE_P (tmp) = true;
3871 return tmp;
3874 /* Output code to add DELTA to the first argument, and then jump
3875 to FUNCTION. Used for C++ multiple inheritance. */
3876 static void
3877 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3878 HOST_WIDE_INT delta,
3879 HOST_WIDE_INT vcall_offset,
3880 tree function)
3882 /* The this pointer is always in x0. Note that this differs from
3883 Arm where the this pointer may be bumped to r1 if r0 is required
3884 to return a pointer to an aggregate. On AArch64 a result value
3885 pointer will be in x8. */
3886 int this_regno = R0_REGNUM;
3887 rtx this_rtx, temp0, temp1, addr, funexp;
3888 rtx_insn *insn;
3890 reload_completed = 1;
3891 emit_note (NOTE_INSN_PROLOGUE_END);
3893 if (vcall_offset == 0)
3894 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3895 else
3897 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3899 this_rtx = gen_rtx_REG (Pmode, this_regno);
3900 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3901 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3903 addr = this_rtx;
3904 if (delta != 0)
3906 if (delta >= -256 && delta < 256)
3907 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3908 plus_constant (Pmode, this_rtx, delta));
3909 else
3910 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3913 if (Pmode == ptr_mode)
3914 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3915 else
3916 aarch64_emit_move (temp0,
3917 gen_rtx_ZERO_EXTEND (Pmode,
3918 gen_rtx_MEM (ptr_mode, addr)));
3920 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3921 addr = plus_constant (Pmode, temp0, vcall_offset);
3922 else
3924 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3925 Pmode);
3926 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3929 if (Pmode == ptr_mode)
3930 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3931 else
3932 aarch64_emit_move (temp1,
3933 gen_rtx_SIGN_EXTEND (Pmode,
3934 gen_rtx_MEM (ptr_mode, addr)));
3936 emit_insn (gen_add2_insn (this_rtx, temp1));
3939 /* Generate a tail call to the target function. */
3940 if (!TREE_USED (function))
3942 assemble_external (function);
3943 TREE_USED (function) = 1;
3945 funexp = XEXP (DECL_RTL (function), 0);
3946 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3947 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3948 SIBLING_CALL_P (insn) = 1;
3950 insn = get_insns ();
3951 shorten_branches (insn);
3952 final_start_function (insn, file, 1);
3953 final (insn, file, 1);
3954 final_end_function ();
3956 /* Stop pretending to be a post-reload pass. */
3957 reload_completed = 0;
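/* A worked illustration (LP64, values chosen for the example): with
   delta == 8 and vcall_offset == 16 the thunk above comes out roughly as

       ldr x16, [x0, 8]!    // bump "this" by delta and load the vptr
       ldr x17, [x16, 16]   // load the vcall offset from the vtable
       add x0, x0, x17      // apply it to "this"
       b   <function>       // tail call the real method

   using IP0 (x16) and IP1 (x17) as the scratch registers. */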
3960 static bool
3961 aarch64_tls_referenced_p (rtx x)
3963 if (!TARGET_HAVE_TLS)
3964 return false;
3965 subrtx_iterator::array_type array;
3966 FOR_EACH_SUBRTX (iter, array, x, ALL)
3968 const_rtx x = *iter;
3969 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3970 return true;
3971 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3972 TLS offsets, not real symbol references. */
3973 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3974 iter.skip_subrtxes ();
3976 return false;
3980 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3981 a left shift of 0 or 12 bits. */
3982 bool
3983 aarch64_uimm12_shift (HOST_WIDE_INT val)
3985 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3986 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
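/* For illustration only; this hypothetical helper is not referenced by
   the compiler. It restates the predicate above over a plain unsigned
   value: 0xabc (shift 0) and 0xabc000 (shift 12) are accepted, while
   0xabc00 is rejected because its set bits straddle the two 12-bit
   windows. */
static inline bool
uimm12_shift_example_p (unsigned long long val)
{
  return ((val & 0xfffULL) == val
          || (val & (0xfffULL << 12)) == val);
}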
3991 /* Return true if val is an immediate that can be loaded into a
3992 register by a MOVZ instruction. */
3993 static bool
3994 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
3996 if (GET_MODE_SIZE (mode) > 4)
3998 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3999 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4000 return 1;
4002 else
4004 /* Ignore sign extension. */
4005 val &= (HOST_WIDE_INT) 0xffffffff;
4007 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4008 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
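/* Put another way: a value is MOVZ-loadable when at most one aligned
   16-bit chunk is non-zero. For example 0x12340000 is accepted
   (movz w0, 0x1234, lsl 16) and 0x0000123400000000 is accepted in DImode
   (movz x0, 0x1234, lsl 32), whereas 0x12345678 is not; the caller below
   also tries the complement, so MOVN covers values such as
   0xffffffffffff1234. */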
4011 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4013 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4015 0x0000000100000001ull,
4016 0x0001000100010001ull,
4017 0x0101010101010101ull,
4018 0x1111111111111111ull,
4019 0x5555555555555555ull,
4023 /* Return true if val is a valid bitmask immediate. */
4025 bool
4026 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4028 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4029 int bits;
4031 /* Check for a single sequence of one bits and return quickly if so.
4032 The special cases of all ones and all zeroes return false. */
4033 val = (unsigned HOST_WIDE_INT) val_in;
4034 tmp = val + (val & -val);
4036 if (tmp == (tmp & -tmp))
4037 return (val + 1) > 1;
4039 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4040 if (mode == SImode)
4041 val = (val << 32) | (val & 0xffffffff);
4043 /* Invert if the immediate doesn't start with a zero bit - this means we
4044 only need to search for sequences of one bits. */
4045 if (val & 1)
4046 val = ~val;
4048 /* Find the first set bit and set tmp to val with the first sequence of one
4049 bits removed. Return success if there is a single sequence of ones. */
4050 first_one = val & -val;
4051 tmp = val & (val + first_one);
4053 if (tmp == 0)
4054 return true;
4056 /* Find the next set bit and compute the difference in bit position. */
4057 next_one = tmp & -tmp;
4058 bits = clz_hwi (first_one) - clz_hwi (next_one);
4059 mask = val ^ tmp;
4061 /* Check the bit position difference is a power of 2, and that the first
4062 sequence of one bits fits within 'bits' bits. */
4063 if ((mask >> bits) != 0 || bits != (bits & -bits))
4064 return false;
4066 /* Check the sequence of one bits is repeated 64/bits times. */
4067 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
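/* For illustration only; this naive reference check is not used by the
   compiler. It spells out the definition that the bit-twiddling above
   implements: a bitmask immediate is a repetition of a 2/4/8/16/32/64-bit
   element, where the element is some rotation of a contiguous run of ones
   (neither all zeros nor all ones). For example 0x0000ff000000ff00
   qualifies (element 0x0000ff00 repeated twice), while 0x0000ff000000fe00
   does not (its two runs do not form a repeating element). */
static inline bool
bitmask_imm_reference_p (unsigned long long val)
{
  for (int width = 2; width <= 64; width *= 2)
    {
      unsigned long long mask
        = width == 64 ? ~0ULL : (1ULL << width) - 1;
      unsigned long long elt = val & mask;

      /* The element must repeat across all 64 bits.  */
      bool repeats = true;
      for (int i = width; i < 64 && repeats; i += width)
        if (((val >> i) & mask) != elt)
          repeats = false;
      if (!repeats)
        continue;

      /* All-zeros and all-ones elements are not encodable.  */
      if (elt == 0 || elt == mask)
        return false;

      /* Some rotation of ELT must be a contiguous run of ones, i.e.
         there is exactly one 0->1 transition in a cyclic scan.  */
      int transitions = 0;
      for (int i = 0; i < width; i++)
        if (!((elt >> i) & 1) && ((elt >> ((i + 1) % width)) & 1))
          transitions++;
      return transitions == 1;
    }
  return false;
}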
4070 /* Create a mask of ones covering the range from the lowest set bit to the
4071 highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
4073 unsigned HOST_WIDE_INT
4074 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4076 int lowest_bit_set = ctz_hwi (val_in);
4077 int highest_bit_set = floor_log2 (val_in);
4078 gcc_assert (val_in != 0);
4080 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4081 (HOST_WIDE_INT_1U << lowest_bit_set));
4084 /* Create a constant in which all bits outside the range from the lowest
4085 set bit to the highest set bit of VAL_IN are set to 1. */
4087 unsigned HOST_WIDE_INT
4088 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4090 return val_in | ~aarch64_and_split_imm1 (val_in);
4093 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4095 bool
4096 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4098 scalar_int_mode int_mode;
4099 if (!is_a <scalar_int_mode> (mode, &int_mode))
4100 return false;
4102 if (aarch64_bitmask_imm (val_in, int_mode))
4103 return false;
4105 if (aarch64_move_imm (val_in, int_mode))
4106 return false;
4108 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4110 return aarch64_bitmask_imm (imm2, int_mode);
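/* A worked example (values chosen for illustration): for
   val_in == 0x00f000f0 in DImode, the lowest set bit is 4 and the
   highest is 23, so aarch64_and_split_imm1 gives 0x00fffff0 and
   aarch64_and_split_imm2 gives 0xfffffffffff000ff. Both are single
   (rotated) runs of ones and hence valid bitmask immediates, and
   0x00fffff0 & 0xfffffffffff000ff == 0x00f000f0, so the original AND
   can be implemented as two AND-immediate instructions even though
   0x00f000f0 itself is neither a bitmask nor a MOV immediate. */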
4113 /* Return true if val is an immediate that can be loaded into a
4114 register in a single instruction. */
4115 bool
4116 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4118 scalar_int_mode int_mode;
4119 if (!is_a <scalar_int_mode> (mode, &int_mode))
4120 return false;
4122 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4123 return 1;
4124 return aarch64_bitmask_imm (val, int_mode);
4127 static bool
4128 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4130 rtx base, offset;
4132 if (GET_CODE (x) == HIGH)
4133 return true;
4135 split_const (x, &base, &offset);
4136 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4138 if (aarch64_classify_symbol (base, offset)
4139 != SYMBOL_FORCE_TO_MEM)
4140 return true;
4141 else
4142 /* Avoid generating a 64-bit relocation in ILP32; leave it
4143 to aarch64_expand_mov_immediate to handle properly. */
4144 return mode != ptr_mode;
4147 return aarch64_tls_referenced_p (x);
4150 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4151 The expansion for a table switch is quite expensive due to the number
4152 of instructions, the table lookup and the hard-to-predict indirect jump.
4153 When optimizing for speed with -O3 or higher, use the per-core tuning if
4154 set, otherwise use tables for > 16 cases as a trade-off between size and
4155 performance. When optimizing for size, use the default setting. */
4157 static unsigned int
4158 aarch64_case_values_threshold (void)
4160 /* Use the specified limit for the number of cases before using jump
4161 tables at higher optimization levels. */
4162 if (optimize > 2
4163 && selected_cpu->tune->max_case_values != 0)
4164 return selected_cpu->tune->max_case_values;
4165 else
4166 return optimize_size ? default_case_values_threshold () : 17;
4169 /* Return true if register REGNO is a valid index register.
4170 STRICT_P is true if REG_OK_STRICT is in effect. */
4172 bool
4173 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4175 if (!HARD_REGISTER_NUM_P (regno))
4177 if (!strict_p)
4178 return true;
4180 if (!reg_renumber)
4181 return false;
4183 regno = reg_renumber[regno];
4185 return GP_REGNUM_P (regno);
4188 /* Return true if register REGNO is a valid base register.
4189 STRICT_P is true if REG_OK_STRICT is in effect. */
4191 bool
4192 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4194 if (!HARD_REGISTER_NUM_P (regno))
4196 if (!strict_p)
4197 return true;
4199 if (!reg_renumber)
4200 return false;
4202 regno = reg_renumber[regno];
4205 /* The fake registers will be eliminated to either the stack or
4206 hard frame pointer, both of which are usually valid base registers.
4207 Reload deals with the cases where the eliminated form isn't valid. */
4208 return (GP_REGNUM_P (regno)
4209 || regno == SP_REGNUM
4210 || regno == FRAME_POINTER_REGNUM
4211 || regno == ARG_POINTER_REGNUM);
4214 /* Return true if X is a valid base register.
4215 STRICT_P is true if REG_OK_STRICT is in effect. */
4217 static bool
4218 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4220 if (!strict_p
4221 && GET_CODE (x) == SUBREG
4222 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4223 x = SUBREG_REG (x);
4225 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4228 /* Return true if address offset is a valid index. If it is, fill in INFO
4229 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4231 static bool
4232 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4233 machine_mode mode, bool strict_p)
4235 enum aarch64_address_type type;
4236 rtx index;
4237 int shift;
4239 /* (reg:P) */
4240 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4241 && GET_MODE (x) == Pmode)
4243 type = ADDRESS_REG_REG;
4244 index = x;
4245 shift = 0;
4247 /* (sign_extend:DI (reg:SI)) */
4248 else if ((GET_CODE (x) == SIGN_EXTEND
4249 || GET_CODE (x) == ZERO_EXTEND)
4250 && GET_MODE (x) == DImode
4251 && GET_MODE (XEXP (x, 0)) == SImode)
4253 type = (GET_CODE (x) == SIGN_EXTEND)
4254 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4255 index = XEXP (x, 0);
4256 shift = 0;
4258 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4259 else if (GET_CODE (x) == MULT
4260 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4261 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4262 && GET_MODE (XEXP (x, 0)) == DImode
4263 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4264 && CONST_INT_P (XEXP (x, 1)))
4266 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4267 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4268 index = XEXP (XEXP (x, 0), 0);
4269 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4271 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4272 else if (GET_CODE (x) == ASHIFT
4273 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4274 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4275 && GET_MODE (XEXP (x, 0)) == DImode
4276 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4277 && CONST_INT_P (XEXP (x, 1)))
4279 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4280 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4281 index = XEXP (XEXP (x, 0), 0);
4282 shift = INTVAL (XEXP (x, 1));
4284 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4285 else if ((GET_CODE (x) == SIGN_EXTRACT
4286 || GET_CODE (x) == ZERO_EXTRACT)
4287 && GET_MODE (x) == DImode
4288 && GET_CODE (XEXP (x, 0)) == MULT
4289 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4290 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4292 type = (GET_CODE (x) == SIGN_EXTRACT)
4293 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4294 index = XEXP (XEXP (x, 0), 0);
4295 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4296 if (INTVAL (XEXP (x, 1)) != 32 + shift
4297 || INTVAL (XEXP (x, 2)) != 0)
4298 shift = -1;
4300 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4301 (const_int 0xffffffff<<shift)) */
4302 else if (GET_CODE (x) == AND
4303 && GET_MODE (x) == DImode
4304 && GET_CODE (XEXP (x, 0)) == MULT
4305 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4306 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4307 && CONST_INT_P (XEXP (x, 1)))
4309 type = ADDRESS_REG_UXTW;
4310 index = XEXP (XEXP (x, 0), 0);
4311 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4312 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4313 shift = -1;
4315 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4316 else if ((GET_CODE (x) == SIGN_EXTRACT
4317 || GET_CODE (x) == ZERO_EXTRACT)
4318 && GET_MODE (x) == DImode
4319 && GET_CODE (XEXP (x, 0)) == ASHIFT
4320 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4321 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4323 type = (GET_CODE (x) == SIGN_EXTRACT)
4324 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4325 index = XEXP (XEXP (x, 0), 0);
4326 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4327 if (INTVAL (XEXP (x, 1)) != 32 + shift
4328 || INTVAL (XEXP (x, 2)) != 0)
4329 shift = -1;
4331 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4332 (const_int 0xffffffff<<shift)) */
4333 else if (GET_CODE (x) == AND
4334 && GET_MODE (x) == DImode
4335 && GET_CODE (XEXP (x, 0)) == ASHIFT
4336 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4337 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4338 && CONST_INT_P (XEXP (x, 1)))
4340 type = ADDRESS_REG_UXTW;
4341 index = XEXP (XEXP (x, 0), 0);
4342 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4343 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4344 shift = -1;
4346 /* (mult:P (reg:P) (const_int scale)) */
4347 else if (GET_CODE (x) == MULT
4348 && GET_MODE (x) == Pmode
4349 && GET_MODE (XEXP (x, 0)) == Pmode
4350 && CONST_INT_P (XEXP (x, 1)))
4352 type = ADDRESS_REG_REG;
4353 index = XEXP (x, 0);
4354 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4356 /* (ashift:P (reg:P) (const_int shift)) */
4357 else if (GET_CODE (x) == ASHIFT
4358 && GET_MODE (x) == Pmode
4359 && GET_MODE (XEXP (x, 0)) == Pmode
4360 && CONST_INT_P (XEXP (x, 1)))
4362 type = ADDRESS_REG_REG;
4363 index = XEXP (x, 0);
4364 shift = INTVAL (XEXP (x, 1));
4366 else
4367 return false;
4369 if (!strict_p
4370 && GET_CODE (index) == SUBREG
4371 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4372 index = SUBREG_REG (index);
4374 if ((shift == 0 ||
4375 (shift > 0 && shift <= 3
4376 && (1 << shift) == GET_MODE_SIZE (mode)))
4377 && REG_P (index)
4378 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4380 info->type = type;
4381 info->offset = index;
4382 info->shift = shift;
4383 return true;
4386 return false;
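/* For example (illustrative), the index
       (mult:DI (sign_extend:DI (reg:SI x2)) (const_int 4))
   used with an SImode access classifies as ADDRESS_REG_SXTW with
   shift == 2, which is later printed as [base, w2, sxtw 2]; the same
   index with a QImode access is rejected because the scale does not
   match the access size. */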
4389 /* Return true if MODE is one of the modes for which we
4390 support LDP/STP operations. */
4392 static bool
4393 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4395 return mode == SImode || mode == DImode
4396 || mode == SFmode || mode == DFmode
4397 || (aarch64_vector_mode_supported_p (mode)
4398 && GET_MODE_SIZE (mode) == 8);
4401 /* Return true if REGNO is a virtual pointer register, or an eliminable
4402 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4403 include stack_pointer or hard_frame_pointer. */
4404 static bool
4405 virt_or_elim_regno_p (unsigned regno)
4407 return ((regno >= FIRST_VIRTUAL_REGISTER
4408 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4409 || regno == FRAME_POINTER_REGNUM
4410 || regno == ARG_POINTER_REGNUM);
4413 /* Return true if X is a valid address for machine mode MODE. If it is,
4414 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4415 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4417 static bool
4418 aarch64_classify_address (struct aarch64_address_info *info,
4419 rtx x, machine_mode mode,
4420 RTX_CODE outer_code, bool strict_p)
4422 enum rtx_code code = GET_CODE (x);
4423 rtx op0, op1;
4425 /* On BE, we use load/store pair for all large int mode load/stores.
4426 TI/TFmode may also use a load/store pair. */
4427 bool load_store_pair_p = (outer_code == PARALLEL
4428 || mode == TImode
4429 || mode == TFmode
4430 || (BYTES_BIG_ENDIAN
4431 && aarch64_vect_struct_mode_p (mode)));
4433 bool allow_reg_index_p =
4434 !load_store_pair_p
4435 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4436 && !aarch64_vect_struct_mode_p (mode);
4438 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4439 REG addressing. */
4440 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4441 && (code != POST_INC && code != REG))
4442 return false;
4444 switch (code)
4446 case REG:
4447 case SUBREG:
4448 info->type = ADDRESS_REG_IMM;
4449 info->base = x;
4450 info->offset = const0_rtx;
4451 return aarch64_base_register_rtx_p (x, strict_p);
4453 case PLUS:
4454 op0 = XEXP (x, 0);
4455 op1 = XEXP (x, 1);
4457 if (! strict_p
4458 && REG_P (op0)
4459 && virt_or_elim_regno_p (REGNO (op0))
4460 && CONST_INT_P (op1))
4462 info->type = ADDRESS_REG_IMM;
4463 info->base = op0;
4464 info->offset = op1;
4466 return true;
4469 if (GET_MODE_SIZE (mode) != 0
4470 && CONST_INT_P (op1)
4471 && aarch64_base_register_rtx_p (op0, strict_p))
4473 HOST_WIDE_INT offset = INTVAL (op1);
4475 info->type = ADDRESS_REG_IMM;
4476 info->base = op0;
4477 info->offset = op1;
4479 /* TImode and TFmode values are allowed in both pairs of X
4480 registers and individual Q registers. The available
4481 address modes are:
4482 X,X: 7-bit signed scaled offset
4483 Q: 9-bit signed offset
4484 We conservatively require an offset representable in either mode.
4485 When performing the check for pairs of X registers i.e. LDP/STP
4486 pass down DImode since that is the natural size of the LDP/STP
4487 instruction memory accesses. */
4488 if (mode == TImode || mode == TFmode)
4489 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4490 && (offset_9bit_signed_unscaled_p (mode, offset)
4491 || offset_12bit_unsigned_scaled_p (mode, offset)));
4493 /* A 7-bit offset check because OImode will emit an ldp/stp
4494 instruction (only big endian will get here).
4495 For ldp/stp instructions, the offset is scaled for the size of a
4496 single element of the pair. */
4497 if (mode == OImode)
4498 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4500 /* Three 9/12-bit offset checks because CImode will emit three
4501 ldr/str instructions (only big endian will get here). */
4502 if (mode == CImode)
4503 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4504 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4505 || offset_12bit_unsigned_scaled_p (V16QImode,
4506 offset + 32)));
4508 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4509 instructions (only big endian will get here). */
4510 if (mode == XImode)
4511 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4512 && aarch64_offset_7bit_signed_scaled_p (TImode,
4513 offset + 32));
4515 if (load_store_pair_p)
4516 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4517 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4518 else
4519 return (offset_9bit_signed_unscaled_p (mode, offset)
4520 || offset_12bit_unsigned_scaled_p (mode, offset));
4523 if (allow_reg_index_p)
4525 /* Look for base + (scaled/extended) index register. */
4526 if (aarch64_base_register_rtx_p (op0, strict_p)
4527 && aarch64_classify_index (info, op1, mode, strict_p))
4529 info->base = op0;
4530 return true;
4532 if (aarch64_base_register_rtx_p (op1, strict_p)
4533 && aarch64_classify_index (info, op0, mode, strict_p))
4535 info->base = op1;
4536 return true;
4540 return false;
4542 case POST_INC:
4543 case POST_DEC:
4544 case PRE_INC:
4545 case PRE_DEC:
4546 info->type = ADDRESS_REG_WB;
4547 info->base = XEXP (x, 0);
4548 info->offset = NULL_RTX;
4549 return aarch64_base_register_rtx_p (info->base, strict_p);
4551 case POST_MODIFY:
4552 case PRE_MODIFY:
4553 info->type = ADDRESS_REG_WB;
4554 info->base = XEXP (x, 0);
4555 if (GET_CODE (XEXP (x, 1)) == PLUS
4556 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4557 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4558 && aarch64_base_register_rtx_p (info->base, strict_p))
4560 HOST_WIDE_INT offset;
4561 info->offset = XEXP (XEXP (x, 1), 1);
4562 offset = INTVAL (info->offset);
4564 /* TImode and TFmode values are allowed in both pairs of X
4565 registers and individual Q registers. The available
4566 address modes are:
4567 X,X: 7-bit signed scaled offset
4568 Q: 9-bit signed offset
4569 We conservatively require an offset representable in either mode. */
4571 if (mode == TImode || mode == TFmode)
4572 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4573 && offset_9bit_signed_unscaled_p (mode, offset));
4575 if (load_store_pair_p)
4576 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4577 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4578 else
4579 return offset_9bit_signed_unscaled_p (mode, offset);
4581 return false;
4583 case CONST:
4584 case SYMBOL_REF:
4585 case LABEL_REF:
4586 /* load literal: pc-relative constant pool entry. Only supported
4587 for SI mode or larger. */
4588 info->type = ADDRESS_SYMBOLIC;
4590 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4592 rtx sym, addend;
4594 split_const (x, &sym, &addend);
4595 return ((GET_CODE (sym) == LABEL_REF
4596 || (GET_CODE (sym) == SYMBOL_REF
4597 && CONSTANT_POOL_ADDRESS_P (sym)
4598 && aarch64_pcrelative_literal_loads)));
4600 return false;
4602 case LO_SUM:
4603 info->type = ADDRESS_LO_SUM;
4604 info->base = XEXP (x, 0);
4605 info->offset = XEXP (x, 1);
4606 if (allow_reg_index_p
4607 && aarch64_base_register_rtx_p (info->base, strict_p))
4609 rtx sym, offs;
4610 split_const (info->offset, &sym, &offs);
4611 if (GET_CODE (sym) == SYMBOL_REF
4612 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4614 /* The symbol and offset must be aligned to the access size. */
4615 unsigned int align;
4616 unsigned int ref_size;
4618 if (CONSTANT_POOL_ADDRESS_P (sym))
4619 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4620 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4622 tree exp = SYMBOL_REF_DECL (sym);
4623 align = TYPE_ALIGN (TREE_TYPE (exp));
4624 align = CONSTANT_ALIGNMENT (exp, align);
4626 else if (SYMBOL_REF_DECL (sym))
4627 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4628 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4629 && SYMBOL_REF_BLOCK (sym) != NULL)
4630 align = SYMBOL_REF_BLOCK (sym)->alignment;
4631 else
4632 align = BITS_PER_UNIT;
4634 ref_size = GET_MODE_SIZE (mode);
4635 if (ref_size == 0)
4636 ref_size = GET_MODE_SIZE (DImode);
4638 return ((INTVAL (offs) & (ref_size - 1)) == 0
4639 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4642 return false;
4644 default:
4645 return false;
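/* Two illustrative cases: (plus:DI (reg:DI x1) (const_int 16)) with a
   DImode access classifies as ADDRESS_REG_IMM (printed as [x1, 16]),
   while the same base with (const_int 1) and a TImode access fails the
   LDP/STP offset checks above and must be legitimized first. */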
4649 /* Return true if the address X is valid for a PRFM instruction.
4650 STRICT_P is true if we should do strict checking with
4651 aarch64_classify_address. */
4653 bool
4654 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4656 struct aarch64_address_info addr;
4658 /* PRFM accepts the same addresses as DImode... */
4659 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4660 if (!res)
4661 return false;
4663 /* ... except writeback forms. */
4664 return addr.type != ADDRESS_REG_WB;
4667 bool
4668 aarch64_symbolic_address_p (rtx x)
4670 rtx offset;
4672 split_const (x, &x, &offset);
4673 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4676 /* Classify the base of symbolic expression X. */
4678 enum aarch64_symbol_type
4679 aarch64_classify_symbolic_expression (rtx x)
4681 rtx offset;
4683 split_const (x, &x, &offset);
4684 return aarch64_classify_symbol (x, offset);
4688 /* Return TRUE if X is a legitimate address for accessing memory in
4689 mode MODE. */
4690 static bool
4691 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4693 struct aarch64_address_info addr;
4695 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4698 /* Return TRUE if X is a legitimate address for accessing memory in
4699 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4700 pair operation. */
4701 bool
4702 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4703 RTX_CODE outer_code, bool strict_p)
4705 struct aarch64_address_info addr;
4707 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4710 /* Split an out-of-range address displacement into a base and offset.
4711 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4712 to increase opportunities for sharing the base address across accesses of different sizes.
4713 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4714 static bool
4715 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4717 HOST_WIDE_INT offset = INTVAL (*disp);
4718 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4720 if (mode == TImode || mode == TFmode
4721 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4722 base = (offset + 0x100) & ~0x1ff;
4724 *off = GEN_INT (base);
4725 *disp = GEN_INT (offset - base);
4726 return true;
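/* A worked example (illustrative): for an aligned SImode access at
   offset 0x10010, the mask is 0x3ffc, so the anchor becomes 0x10000 and
   the residual displacement 0x10; nearby word accesses in the same 16KB
   window can then share the 0x10000 anchor while each residual still
   fits the scaled 12-bit immediate form. */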
4729 /* Return the binary representation of floating point constant VALUE in INTVAL.
4730 If the value cannot be converted, return false without setting INTVAL.
4731 The conversion is done in the given MODE. */
4732 bool
4733 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4736 /* We make a general exception for 0. */
4737 if (aarch64_float_const_zero_rtx_p (value))
4739 *intval = 0;
4740 return true;
4743 machine_mode mode = GET_MODE (value);
4744 if (GET_CODE (value) != CONST_DOUBLE
4745 || !SCALAR_FLOAT_MODE_P (mode)
4746 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4747 /* Only support up to DF mode. */
4748 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4749 return false;
4751 unsigned HOST_WIDE_INT ival = 0;
4753 long res[2];
4754 real_to_target (res,
4755 CONST_DOUBLE_REAL_VALUE (value),
4756 REAL_MODE_FORMAT (mode));
4758 if (mode == DFmode)
4760 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4761 ival = zext_hwi (res[order], 32);
4762 ival |= (zext_hwi (res[1 - order], 32) << 32);
4764 else
4765 ival = zext_hwi (res[0], 32);
4767 *intval = ival;
4768 return true;
4771 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4772 single MOV(+MOVK) followed by an FMOV. */
4773 bool
4774 aarch64_float_const_rtx_p (rtx x)
4776 machine_mode mode = GET_MODE (x);
4777 if (mode == VOIDmode)
4778 return false;
4780 /* Determine whether it's cheaper to write float constants as
4781 mov/movk pairs over ldr/adrp pairs. */
4782 unsigned HOST_WIDE_INT ival;
4784 if (GET_CODE (x) == CONST_DOUBLE
4785 && SCALAR_FLOAT_MODE_P (mode)
4786 && aarch64_reinterpret_float_as_int (x, &ival))
4788 scalar_int_mode imode = (mode == HFmode
4789 ? SImode
4790 : int_mode_for_mode (mode).require ());
4791 int num_instr = aarch64_internal_mov_immediate
4792 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4793 return num_instr < 3;
4796 return false;
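/* A concrete case (illustrative): the DFmode constant 1.0 has the bit
   pattern 0x3ff0000000000000, which needs a single MOVZ (only one
   non-zero 16-bit chunk), so num_instr is 1 and the constant can be
   materialized as a MOVZ followed by an FMOV instead of an ADRP/LDR
   literal load. */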
4799 /* Return TRUE if rtx X is immediate constant 0.0 */
4800 bool
4801 aarch64_float_const_zero_rtx_p (rtx x)
4803 if (GET_MODE (x) == VOIDmode)
4804 return false;
4806 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4807 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4808 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4811 /* Return TRUE if rtx X is immediate constant that fits in a single
4812 MOVI immediate operation. */
4813 bool
4814 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4816 if (!TARGET_SIMD)
4817 return false;
4819 machine_mode vmode;
4820 scalar_int_mode imode;
4821 unsigned HOST_WIDE_INT ival;
4823 if (GET_CODE (x) == CONST_DOUBLE
4824 && SCALAR_FLOAT_MODE_P (mode))
4826 if (!aarch64_reinterpret_float_as_int (x, &ival))
4827 return false;
4829 /* We make a general exception for 0. */
4830 if (aarch64_float_const_zero_rtx_p (x))
4831 return true;
4833 imode = int_mode_for_mode (mode).require ();
4835 else if (GET_CODE (x) == CONST_INT
4836 && is_a <scalar_int_mode> (mode, &imode))
4837 ival = INTVAL (x);
4838 else
4839 return false;
4841 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
4842 use a 128-bit vector mode. */
4843 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4845 vmode = aarch64_simd_container_mode (imode, width);
4846 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4848 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4852 /* Return the fixed registers used for condition codes. */
4854 static bool
4855 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4857 *p1 = CC_REGNUM;
4858 *p2 = INVALID_REGNUM;
4859 return true;
4862 /* This function is used by the call expanders of the machine description.
4863 RESULT is the register in which the result is returned. It's NULL for
4864 "call" and "sibcall".
4865 MEM is the location of the function call.
4866 SIBCALL indicates whether this function call is a normal call or a sibling
4867 call; a different pattern is generated accordingly. */
4869 void
4870 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4872 rtx call, callee, tmp;
4873 rtvec vec;
4874 machine_mode mode;
4876 gcc_assert (MEM_P (mem));
4877 callee = XEXP (mem, 0);
4878 mode = GET_MODE (callee);
4879 gcc_assert (mode == Pmode);
4881 /* Decide if we should generate indirect calls by loading the
4882 address of the callee into a register before performing
4883 the branch-and-link. */
4884 if (SYMBOL_REF_P (callee)
4885 ? (aarch64_is_long_call_p (callee)
4886 || aarch64_is_noplt_call_p (callee))
4887 : !REG_P (callee))
4888 XEXP (mem, 0) = force_reg (mode, callee);
4890 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4892 if (result != NULL_RTX)
4893 call = gen_rtx_SET (result, call);
4895 if (sibcall)
4896 tmp = ret_rtx;
4897 else
4898 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4900 vec = gen_rtvec (2, call, tmp);
4901 call = gen_rtx_PARALLEL (VOIDmode, vec);
4903 aarch64_emit_call_insn (call);
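/* The RTL produced above has the shape (illustrative):
       (parallel [(call (mem (reg/symbol)) (const_int 0))
                  (clobber (reg:DI LR_REGNUM))])
   for a normal call, with (return) replacing the clobber for a sibcall
   and the CALL wrapped in a SET when a result register is given. */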
4906 /* Emit call insn with PAT and do aarch64-specific handling. */
4908 void
4909 aarch64_emit_call_insn (rtx pat)
4911 rtx insn = emit_call_insn (pat);
4913 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4914 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4915 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4918 machine_mode
4919 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4921 /* All floating point compares return CCFP if it is an equality
4922 comparison, and CCFPE otherwise. */
4923 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4925 switch (code)
4927 case EQ:
4928 case NE:
4929 case UNORDERED:
4930 case ORDERED:
4931 case UNLT:
4932 case UNLE:
4933 case UNGT:
4934 case UNGE:
4935 case UNEQ:
4936 case LTGT:
4937 return CCFPmode;
4939 case LT:
4940 case LE:
4941 case GT:
4942 case GE:
4943 return CCFPEmode;
4945 default:
4946 gcc_unreachable ();
4950 /* Equality comparisons of short modes against zero can be performed
4951 using the TST instruction with the appropriate bitmask. */
4952 if (y == const0_rtx && REG_P (x)
4953 && (code == EQ || code == NE)
4954 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4955 return CC_NZmode;
4957 /* Similarly, comparisons of zero_extends from shorter modes can
4958 be performed using an ANDS with an immediate mask. */
4959 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4960 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4961 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4962 && (code == EQ || code == NE))
4963 return CC_NZmode;
4965 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4966 && y == const0_rtx
4967 && (code == EQ || code == NE || code == LT || code == GE)
4968 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4969 || GET_CODE (x) == NEG
4970 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4971 && CONST_INT_P (XEXP (x, 2)))))
4972 return CC_NZmode;
4974 /* A compare with a shifted operand. Because of canonicalization,
4975 the comparison will have to be swapped when we emit the assembly
4976 code. */
4977 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4978 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4979 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4980 || GET_CODE (x) == LSHIFTRT
4981 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4982 return CC_SWPmode;
4984 /* Similarly for a negated operand, but we can only do this for
4985 equalities. */
4986 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4987 && (REG_P (y) || GET_CODE (y) == SUBREG)
4988 && (code == EQ || code == NE)
4989 && GET_CODE (x) == NEG)
4990 return CC_Zmode;
4992 /* A test for unsigned overflow. */
4993 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4994 && code == NE
4995 && GET_CODE (x) == PLUS
4996 && GET_CODE (y) == ZERO_EXTEND)
4997 return CC_Cmode;
4999 /* For everything else, return CCmode. */
5000 return CCmode;
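/* For instance (illustrative), comparing (ashift:SI (reg w1) (const_int 2))
   against (reg w2) selects CC_SWPmode: the shifted operand can only be
   the second source of CMP, so the emitted instruction is effectively
   cmp w2, w1, lsl 2 and the condition is swapped when printed (see the
   E_CC_SWPmode mapping below). */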
5003 static int
5004 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5007 aarch64_get_condition_code (rtx x)
5009 machine_mode mode = GET_MODE (XEXP (x, 0));
5010 enum rtx_code comp_code = GET_CODE (x);
5012 if (GET_MODE_CLASS (mode) != MODE_CC)
5013 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5014 return aarch64_get_condition_code_1 (mode, comp_code);
5017 static int
5018 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5020 switch (mode)
5022 case E_CCFPmode:
5023 case E_CCFPEmode:
5024 switch (comp_code)
5026 case GE: return AARCH64_GE;
5027 case GT: return AARCH64_GT;
5028 case LE: return AARCH64_LS;
5029 case LT: return AARCH64_MI;
5030 case NE: return AARCH64_NE;
5031 case EQ: return AARCH64_EQ;
5032 case ORDERED: return AARCH64_VC;
5033 case UNORDERED: return AARCH64_VS;
5034 case UNLT: return AARCH64_LT;
5035 case UNLE: return AARCH64_LE;
5036 case UNGT: return AARCH64_HI;
5037 case UNGE: return AARCH64_PL;
5038 default: return -1;
5040 break;
5042 case E_CCmode:
5043 switch (comp_code)
5045 case NE: return AARCH64_NE;
5046 case EQ: return AARCH64_EQ;
5047 case GE: return AARCH64_GE;
5048 case GT: return AARCH64_GT;
5049 case LE: return AARCH64_LE;
5050 case LT: return AARCH64_LT;
5051 case GEU: return AARCH64_CS;
5052 case GTU: return AARCH64_HI;
5053 case LEU: return AARCH64_LS;
5054 case LTU: return AARCH64_CC;
5055 default: return -1;
5057 break;
5059 case E_CC_SWPmode:
5060 switch (comp_code)
5062 case NE: return AARCH64_NE;
5063 case EQ: return AARCH64_EQ;
5064 case GE: return AARCH64_LE;
5065 case GT: return AARCH64_LT;
5066 case LE: return AARCH64_GE;
5067 case LT: return AARCH64_GT;
5068 case GEU: return AARCH64_LS;
5069 case GTU: return AARCH64_CC;
5070 case LEU: return AARCH64_CS;
5071 case LTU: return AARCH64_HI;
5072 default: return -1;
5074 break;
5076 case E_CC_NZmode:
5077 switch (comp_code)
5079 case NE: return AARCH64_NE;
5080 case EQ: return AARCH64_EQ;
5081 case GE: return AARCH64_PL;
5082 case LT: return AARCH64_MI;
5083 default: return -1;
5085 break;
5087 case E_CC_Zmode:
5088 switch (comp_code)
5090 case NE: return AARCH64_NE;
5091 case EQ: return AARCH64_EQ;
5092 default: return -1;
5094 break;
5096 case E_CC_Cmode:
5097 switch (comp_code)
5099 case NE: return AARCH64_CS;
5100 case EQ: return AARCH64_CC;
5101 default: return -1;
5103 break;
5105 default:
5106 return -1;
5109 return -1;
5112 bool
5113 aarch64_const_vec_all_same_in_range_p (rtx x,
5114 HOST_WIDE_INT minval,
5115 HOST_WIDE_INT maxval)
5117 HOST_WIDE_INT firstval;
5118 int count, i;
5120 if (GET_CODE (x) != CONST_VECTOR
5121 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5122 return false;
5124 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5125 if (firstval < minval || firstval > maxval)
5126 return false;
5128 count = CONST_VECTOR_NUNITS (x);
5129 for (i = 1; i < count; i++)
5130 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5131 return false;
5133 return true;
5136 bool
5137 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5139 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5143 /* N Z C V. */
5144 #define AARCH64_CC_V 1
5145 #define AARCH64_CC_C (1 << 1)
5146 #define AARCH64_CC_Z (1 << 2)
5147 #define AARCH64_CC_N (1 << 3)
5149 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5150 static const int aarch64_nzcv_codes[] =
5152 0, /* EQ, Z == 1. */
5153 AARCH64_CC_Z, /* NE, Z == 0. */
5154 0, /* CS, C == 1. */
5155 AARCH64_CC_C, /* CC, C == 0. */
5156 0, /* MI, N == 1. */
5157 AARCH64_CC_N, /* PL, N == 0. */
5158 0, /* VS, V == 1. */
5159 AARCH64_CC_V, /* VC, V == 0. */
5160 0, /* HI, C == 1 && Z == 0. */
5161 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5162 AARCH64_CC_V, /* GE, N == V. */
5163 0, /* LT, N != V. */
5164 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5165 0, /* LE, !(Z == 0 && N == V). */
5166 0, /* AL, Any. */
5167 0 /* NV, Any. */
5170 /* Print operand X to file F in a target specific manner according to CODE.
5171 The acceptable formatting commands given by CODE are:
5172 'c': An integer or symbol address without a preceding #
5173 sign.
5174 'e': Print the sign/zero-extend size as a character 8->b,
5175 16->h, 32->w.
5176 'p': Prints N such that 2^N == X (X must be power of 2 and
5177 const int).
5178 'P': Print the number of non-zero bits in X (a const_int).
5179 'H': Print the higher numbered register of a pair (TImode)
5180 of regs.
5181 'm': Print a condition (eq, ne, etc).
5182 'M': Same as 'm', but invert condition.
5183 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5184 'S/T/U/V': Print a FP/SIMD register name for a register list.
5185 The register printed is the FP/SIMD register name
5186 of X + 0/1/2/3 for S/T/U/V.
5187 'R': Print a scalar FP/SIMD register name + 1.
5188 'X': Print bottom 16 bits of integer constant in hex.
5189 'w/x': Print a general register name or the zero register
5190 (32-bit or 64-bit).
5191 '0': Print a normal operand; if it's a general register,
5192 then we assume DImode.
5193 'k': Print NZCV for conditional compare instructions.
5194 'A': Output address constant representing the first
5195 argument of X, specifying a relocation offset
5196 if appropriate.
5197 'L': Output constant address specified by X
5198 with a relocation offset if appropriate.
5199 'G': Prints address of X, specifying a PC relative
5200 relocation mode if appropriate. */
5202 static void
5203 aarch64_print_operand (FILE *f, rtx x, int code)
5205 switch (code)
5207 case 'c':
5208 switch (GET_CODE (x))
5210 case CONST_INT:
5211 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5212 break;
5214 case SYMBOL_REF:
5215 output_addr_const (f, x);
5216 break;
5218 case CONST:
5219 if (GET_CODE (XEXP (x, 0)) == PLUS
5220 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5222 output_addr_const (f, x);
5223 break;
5225 /* Fall through. */
5227 default:
5228 output_operand_lossage ("Unsupported operand for code '%c'", code);
5230 break;
5232 case 'e':
5234 int n;
5236 if (!CONST_INT_P (x)
5237 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5239 output_operand_lossage ("invalid operand for '%%%c'", code);
5240 return;
5243 switch (n)
5245 case 3:
5246 fputc ('b', f);
5247 break;
5248 case 4:
5249 fputc ('h', f);
5250 break;
5251 case 5:
5252 fputc ('w', f);
5253 break;
5254 default:
5255 output_operand_lossage ("invalid operand for '%%%c'", code);
5256 return;
5259 break;
5261 case 'p':
5263 int n;
5265 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5267 output_operand_lossage ("invalid operand for '%%%c'", code);
5268 return;
5271 asm_fprintf (f, "%d", n);
5273 break;
5275 case 'P':
5276 if (!CONST_INT_P (x))
5278 output_operand_lossage ("invalid operand for '%%%c'", code);
5279 return;
5282 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5283 break;
5285 case 'H':
5286 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5288 output_operand_lossage ("invalid operand for '%%%c'", code);
5289 return;
5292 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5293 break;
5295 case 'M':
5296 case 'm':
5298 int cond_code;
5299 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5300 if (x == const_true_rtx)
5302 if (code == 'M')
5303 fputs ("nv", f);
5304 return;
5307 if (!COMPARISON_P (x))
5309 output_operand_lossage ("invalid operand for '%%%c'", code);
5310 return;
5313 cond_code = aarch64_get_condition_code (x);
5314 gcc_assert (cond_code >= 0);
5315 if (code == 'M')
5316 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5317 fputs (aarch64_condition_codes[cond_code], f);
5319 break;
5321 case 'b':
5322 case 'h':
5323 case 's':
5324 case 'd':
5325 case 'q':
5326 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5328 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5329 return;
5331 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5332 break;
5334 case 'S':
5335 case 'T':
5336 case 'U':
5337 case 'V':
5338 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5340 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5341 return;
5343 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5344 break;
5346 case 'R':
5347 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5349 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5350 return;
5352 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5353 break;
5355 case 'X':
5356 if (!CONST_INT_P (x))
5358 output_operand_lossage ("invalid operand for '%%%c'", code);
5359 return;
5361 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5362 break;
5364 case 'w':
5365 case 'x':
5366 if (x == const0_rtx
5367 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5369 asm_fprintf (f, "%czr", code);
5370 break;
5373 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5375 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5376 break;
5379 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5381 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5382 break;
5385 /* Fall through */
5387 case 0:
5388 if (x == NULL)
5390 output_operand_lossage ("missing operand");
5391 return;
5394 switch (GET_CODE (x))
5396 case REG:
5397 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5398 break;
5400 case MEM:
5401 output_address (GET_MODE (x), XEXP (x, 0));
5402 /* Check all memory references are Pmode - even with ILP32. */
5403 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5404 break;
5406 case CONST:
5407 case LABEL_REF:
5408 case SYMBOL_REF:
5409 output_addr_const (asm_out_file, x);
5410 break;
5412 case CONST_INT:
5413 asm_fprintf (f, "%wd", INTVAL (x));
5414 break;
5416 case CONST_VECTOR:
5417 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5419 gcc_assert (
5420 aarch64_const_vec_all_same_in_range_p (x,
5421 HOST_WIDE_INT_MIN,
5422 HOST_WIDE_INT_MAX));
5423 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5425 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5427 fputc ('0', f);
5429 else
5430 gcc_unreachable ();
5431 break;
5433 case CONST_DOUBLE:
5434 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5435 be getting CONST_DOUBLEs holding integers. */
5436 gcc_assert (GET_MODE (x) != VOIDmode);
5437 if (aarch64_float_const_zero_rtx_p (x))
5439 fputc ('0', f);
5440 break;
5442 else if (aarch64_float_const_representable_p (x))
5444 #define buf_size 20
5445 char float_buf[buf_size] = {'\0'};
5446 real_to_decimal_for_mode (float_buf,
5447 CONST_DOUBLE_REAL_VALUE (x),
5448 buf_size, buf_size,
5449 1, GET_MODE (x));
5450 asm_fprintf (asm_out_file, "%s", float_buf);
5451 break;
5452 #undef buf_size
5454 output_operand_lossage ("invalid constant");
5455 return;
5456 default:
5457 output_operand_lossage ("invalid operand");
5458 return;
5460 break;
5462 case 'A':
5463 if (GET_CODE (x) == HIGH)
5464 x = XEXP (x, 0);
5466 switch (aarch64_classify_symbolic_expression (x))
5468 case SYMBOL_SMALL_GOT_4G:
5469 asm_fprintf (asm_out_file, ":got:");
5470 break;
5472 case SYMBOL_SMALL_TLSGD:
5473 asm_fprintf (asm_out_file, ":tlsgd:");
5474 break;
5476 case SYMBOL_SMALL_TLSDESC:
5477 asm_fprintf (asm_out_file, ":tlsdesc:");
5478 break;
5480 case SYMBOL_SMALL_TLSIE:
5481 asm_fprintf (asm_out_file, ":gottprel:");
5482 break;
5484 case SYMBOL_TLSLE24:
5485 asm_fprintf (asm_out_file, ":tprel:");
5486 break;
5488 case SYMBOL_TINY_GOT:
5489 gcc_unreachable ();
5490 break;
5492 default:
5493 break;
5495 output_addr_const (asm_out_file, x);
5496 break;
5498 case 'L':
5499 switch (aarch64_classify_symbolic_expression (x))
5501 case SYMBOL_SMALL_GOT_4G:
5502 asm_fprintf (asm_out_file, ":lo12:");
5503 break;
5505 case SYMBOL_SMALL_TLSGD:
5506 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5507 break;
5509 case SYMBOL_SMALL_TLSDESC:
5510 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5511 break;
5513 case SYMBOL_SMALL_TLSIE:
5514 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5515 break;
5517 case SYMBOL_TLSLE12:
5518 asm_fprintf (asm_out_file, ":tprel_lo12:");
5519 break;
5521 case SYMBOL_TLSLE24:
5522 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5523 break;
5525 case SYMBOL_TINY_GOT:
5526 asm_fprintf (asm_out_file, ":got:");
5527 break;
5529 case SYMBOL_TINY_TLSIE:
5530 asm_fprintf (asm_out_file, ":gottprel:");
5531 break;
5533 default:
5534 break;
5536 output_addr_const (asm_out_file, x);
5537 break;
5539 case 'G':
5540 switch (aarch64_classify_symbolic_expression (x))
5542 case SYMBOL_TLSLE24:
5543 asm_fprintf (asm_out_file, ":tprel_hi12:");
5544 break;
5545 default:
5546 break;
5548 output_addr_const (asm_out_file, x);
5549 break;
5551 case 'k':
5553 HOST_WIDE_INT cond_code;
5555 if (!CONST_INT_P (x))
5557 output_operand_lossage ("invalid operand for '%%%c'", code);
5558 return;
5561 cond_code = INTVAL (x);
5562 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5563 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5565 break;
5567 default:
5568 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5569 return;
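/* For example (illustrative), an output template such as
   "add\t%w0, %w1, %w2" prints the 32-bit register names (w0, w1, ...),
   "%x0" prints the 64-bit name, and plain "%0" on a general register
   prints its natural name via reg_names. */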
5573 static void
5574 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5576 struct aarch64_address_info addr;
5578 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5579 switch (addr.type)
5581 case ADDRESS_REG_IMM:
5582 if (addr.offset == const0_rtx)
5583 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5584 else
5585 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5586 INTVAL (addr.offset));
5587 return;
5589 case ADDRESS_REG_REG:
5590 if (addr.shift == 0)
5591 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5592 reg_names [REGNO (addr.offset)]);
5593 else
5594 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5595 reg_names [REGNO (addr.offset)], addr.shift);
5596 return;
5598 case ADDRESS_REG_UXTW:
5599 if (addr.shift == 0)
5600 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5601 REGNO (addr.offset) - R0_REGNUM);
5602 else
5603 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5604 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5605 return;
5607 case ADDRESS_REG_SXTW:
5608 if (addr.shift == 0)
5609 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5610 REGNO (addr.offset) - R0_REGNUM);
5611 else
5612 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5613 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5614 return;
5616 case ADDRESS_REG_WB:
5617 switch (GET_CODE (x))
5619 case PRE_INC:
5620 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5621 GET_MODE_SIZE (mode));
5622 return;
5623 case POST_INC:
5624 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5625 GET_MODE_SIZE (mode));
5626 return;
5627 case PRE_DEC:
5628 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5629 GET_MODE_SIZE (mode));
5630 return;
5631 case POST_DEC:
5632 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5633 GET_MODE_SIZE (mode));
5634 return;
5635 case PRE_MODIFY:
5636 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5637 INTVAL (addr.offset));
5638 return;
5639 case POST_MODIFY:
5640 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5641 INTVAL (addr.offset));
5642 return;
5643 default:
5644 break;
5646 break;
5648 case ADDRESS_LO_SUM:
5649 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5650 output_addr_const (f, addr.offset);
5651 asm_fprintf (f, "]");
5652 return;
5654 case ADDRESS_SYMBOLIC:
5655 break;
5658 output_addr_const (f, x);
5661 bool
5662 aarch64_label_mentioned_p (rtx x)
5664 const char *fmt;
5665 int i;
5667 if (GET_CODE (x) == LABEL_REF)
5668 return true;
5670 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5671 referencing instruction, but they are constant offsets, not
5672 symbols. */
5673 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5674 return false;
5676 fmt = GET_RTX_FORMAT (GET_CODE (x));
5677 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5679 if (fmt[i] == 'E')
5681 int j;
5683 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5684 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5685 return 1;
5687 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5688 return 1;
5691 return 0;
5694 /* Implement REGNO_REG_CLASS. */
5696 enum reg_class
5697 aarch64_regno_regclass (unsigned regno)
5699 if (GP_REGNUM_P (regno))
5700 return GENERAL_REGS;
5702 if (regno == SP_REGNUM)
5703 return STACK_REG;
5705 if (regno == FRAME_POINTER_REGNUM
5706 || regno == ARG_POINTER_REGNUM)
5707 return POINTER_REGS;
5709 if (FP_REGNUM_P (regno))
5710 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5712 return NO_REGS;
5715 static rtx
5716 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5718 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5719 where mask is selected by alignment and size of the offset.
5720 We try to pick as large a range for the offset as possible to
5721 maximize the chance of a CSE. However, for aligned addresses
5721 we limit the range to 4k so that structures with different-sized
5723 elements are likely to use the same base. We need to be careful
5724 not to split a CONST for some forms of address expression, otherwise
5725 it will generate sub-optimal code. */
5727 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5729 rtx base = XEXP (x, 0);
5730 rtx offset_rtx = XEXP (x, 1);
5731 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5733 if (GET_CODE (base) == PLUS)
5735 rtx op0 = XEXP (base, 0);
5736 rtx op1 = XEXP (base, 1);
5738 /* Force any scaling into a temp for CSE. */
5739 op0 = force_reg (Pmode, op0);
5740 op1 = force_reg (Pmode, op1);
5742 /* Let the pointer register be in op0. */
5743 if (REG_POINTER (op1))
5744 std::swap (op0, op1);
5746 /* If the pointer is virtual or frame related, then we know that
5747 virtual register instantiation or register elimination is going
5748 to apply a second constant. We want the two constants folded
5749 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5750 if (virt_or_elim_regno_p (REGNO (op0)))
5752 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5753 NULL_RTX, true, OPTAB_DIRECT);
5754 return gen_rtx_PLUS (Pmode, base, op1);
5757 /* Otherwise, in order to encourage CSE (and thence loop strength
5758 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5759 base = expand_binop (Pmode, add_optab, op0, op1,
5760 NULL_RTX, true, OPTAB_DIRECT);
5761 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5764 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5765 HOST_WIDE_INT base_offset;
5766 if (GET_MODE_SIZE (mode) > 16)
5767 base_offset = (offset + 0x400) & ~0x7f0;
5768 /* For offsets that aren't a multiple of the access size, the limit is
5769 -256...255. */
5770 else if (offset & (GET_MODE_SIZE (mode) - 1))
5772 base_offset = (offset + 0x100) & ~0x1ff;
5774 /* BLKmode typically uses LDP of X-registers. */
5775 if (mode == BLKmode)
5776 base_offset = (offset + 512) & ~0x3ff;
5778 /* Small negative offsets are supported. */
5779 else if (IN_RANGE (offset, -256, 0))
5780 base_offset = 0;
5781 else if (mode == TImode || mode == TFmode)
5782 base_offset = (offset + 0x100) & ~0x1ff;
5783 /* Use a 12-bit offset scaled by the access size. */
5784 else
5785 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5787 if (base_offset != 0)
5789 base = plus_constant (Pmode, base, base_offset);
5790 base = force_operand (base, NULL_RTX);
5791 return plus_constant (Pmode, base, offset - base_offset);
5795 return x;
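/* A worked example (illustrative): for a DImode access at X + 0x12340,
   the final case gives base_offset == 0x10000, so the address is
   rewritten as (X + 0x10000) + 0x2340; the anchor X + 0x10000 can be
   CSEd across neighbouring accesses and the remaining 0x2340 fits the
   scaled 12-bit immediate form of LDR/STR. */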
5798 /* Return the reload icode required for a constant pool in mode. */
5799 static enum insn_code
5800 aarch64_constant_pool_reload_icode (machine_mode mode)
5802 switch (mode)
5804 case E_SFmode:
5805 return CODE_FOR_aarch64_reload_movcpsfdi;
5807 case E_DFmode:
5808 return CODE_FOR_aarch64_reload_movcpdfdi;
5810 case E_TFmode:
5811 return CODE_FOR_aarch64_reload_movcptfdi;
5813 case E_V8QImode:
5814 return CODE_FOR_aarch64_reload_movcpv8qidi;
5816 case E_V16QImode:
5817 return CODE_FOR_aarch64_reload_movcpv16qidi;
5819 case E_V4HImode:
5820 return CODE_FOR_aarch64_reload_movcpv4hidi;
5822 case E_V8HImode:
5823 return CODE_FOR_aarch64_reload_movcpv8hidi;
5825 case E_V2SImode:
5826 return CODE_FOR_aarch64_reload_movcpv2sidi;
5828 case E_V4SImode:
5829 return CODE_FOR_aarch64_reload_movcpv4sidi;
5831 case E_V2DImode:
5832 return CODE_FOR_aarch64_reload_movcpv2didi;
5834 case E_V2DFmode:
5835 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5837 default:
5838 gcc_unreachable ();
5841 gcc_unreachable ();
5843 static reg_class_t
5844 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5845 reg_class_t rclass,
5846 machine_mode mode,
5847 secondary_reload_info *sri)
5850 /* If we have to disable direct literal pool loads and stores because the
5851 function is too big, then we need a scratch register. */
5852 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5853 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5854 || targetm.vector_mode_supported_p (GET_MODE (x)))
5855 && !aarch64_pcrelative_literal_loads)
5857 sri->icode = aarch64_constant_pool_reload_icode (mode);
5858 return NO_REGS;
5861 /* Without the TARGET_SIMD instructions we cannot move a Q register
5862 to a Q register directly. We need a scratch. */
5863 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5864 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5865 && reg_class_subset_p (rclass, FP_REGS))
5867 if (mode == TFmode)
5868 sri->icode = CODE_FOR_aarch64_reload_movtf;
5869 else if (mode == TImode)
5870 sri->icode = CODE_FOR_aarch64_reload_movti;
5871 return NO_REGS;
5874 /* A TFmode or TImode memory access should be handled via FP_REGS,
5875 because AArch64 has richer addressing modes for LDR/STR instructions
5876 than for LDP/STP instructions. */
5877 if (TARGET_FLOAT && rclass == GENERAL_REGS
5878 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5879 return FP_REGS;
5881 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5882 return GENERAL_REGS;
5884 return NO_REGS;
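/* For example, when SIMD is disabled (e.g. with +nosimd), copying a TImode
   value from one Q register to another cannot be done directly; the code
   above instead points reload at aarch64_reload_movti, which supplies the
   scratch needed to bounce the value through general registers.  Similarly,
   an SFmode constant-pool load with pc-relative literal loads disabled goes
   through aarch64_reload_movcpsfdi, which provides a scratch for building
   the address.  */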
5887 static bool
5888 aarch64_can_eliminate (const int from, const int to)
5890 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5891 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5893 if (frame_pointer_needed)
5895 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5896 return true;
5897 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5898 return false;
5899 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5900 && !cfun->calls_alloca)
5901 return true;
5902 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5903 return true;
5905 return false;
5907 else
5909 /* If we decided that we didn't need a leaf frame pointer but then used
5910 LR in the function, then we'll want a frame pointer after all, so
5911 prevent this elimination to ensure a frame pointer is used. */
5912 if (to == STACK_POINTER_REGNUM
5913 && flag_omit_leaf_frame_pointer
5914 && df_regs_ever_live_p (LR_REGNUM))
5915 return false;
5918 return true;
5921 HOST_WIDE_INT
5922 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5924 aarch64_layout_frame ();
5926 if (to == HARD_FRAME_POINTER_REGNUM)
5928 if (from == ARG_POINTER_REGNUM)
5929 return cfun->machine->frame.hard_fp_offset;
5931 if (from == FRAME_POINTER_REGNUM)
5932 return cfun->machine->frame.hard_fp_offset
5933 - cfun->machine->frame.locals_offset;
5936 if (to == STACK_POINTER_REGNUM)
5938 if (from == FRAME_POINTER_REGNUM)
5939 return cfun->machine->frame.frame_size
5940 - cfun->machine->frame.locals_offset;
5943 return cfun->machine->frame.frame_size;
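/* For instance, with a hypothetical frame where frame_size == 96,
   hard_fp_offset == 80 and locals_offset == 16, the offsets returned
   above are: ARG_POINTER -> HARD_FRAME_POINTER: 80,
   FRAME_POINTER -> HARD_FRAME_POINTER: 80 - 16 == 64,
   FRAME_POINTER -> STACK_POINTER: 96 - 16 == 80,
   ARG_POINTER -> STACK_POINTER: 96.  */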
5946 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5947 previous frame. */
5950 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5952 if (count != 0)
5953 return const0_rtx;
5954 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5958 static void
5959 aarch64_asm_trampoline_template (FILE *f)
5961 if (TARGET_ILP32)
5963 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5964 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5966 else
5968 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5969 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5971 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5972 assemble_aligned_integer (4, const0_rtx);
5973 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5974 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
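/* The trampoline that results from the template above (together with
   aarch64_trampoline_init below, which copies the first 16 bytes and
   fills in the data words) is laid out roughly as:

       offset  0:  ldr  IP1, .+16            load the target address
       offset  4:  ldr  STATIC_CHAIN, .+20   load the static chain value
       offset  8:  br   IP1
       offset 12:  zero padding word
       offset 16:  <address of the nested function>
       offset 16 + POINTER_BYTES:  <static chain value>

   (in ILP32 both loads are 32-bit and use ".+16", as emitted above).  */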
5977 static void
5978 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5980 rtx fnaddr, mem, a_tramp;
5981 const int tramp_code_sz = 16;
5983 /* Don't need to copy the trailing D-words; we fill those in below. */
5984 emit_block_move (m_tramp, assemble_trampoline_template (),
5985 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5986 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5987 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5988 if (GET_MODE (fnaddr) != ptr_mode)
5989 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5990 emit_move_insn (mem, fnaddr);
5992 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5993 emit_move_insn (mem, chain_value);
5995 /* XXX We should really define a "clear_cache" pattern and use
5996 gen_clear_cache(). */
5997 a_tramp = XEXP (m_tramp, 0);
5998 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5999 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6000 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6001 ptr_mode);
6004 static unsigned char
6005 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6007 switch (regclass)
6009 case CALLER_SAVE_REGS:
6010 case POINTER_REGS:
6011 case GENERAL_REGS:
6012 case ALL_REGS:
6013 case FP_REGS:
6014 case FP_LO_REGS:
6015 return
6016 aarch64_vector_mode_p (mode)
6017 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6018 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6019 case STACK_REG:
6020 return 1;
6022 case NO_REGS:
6023 return 0;
6025 default:
6026 break;
6028 gcc_unreachable ();
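/* For example, with UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16:
   TImode (16 bytes, not a vector mode) needs (16 + 8 - 1) / 8 == 2
   registers of GENERAL_REGS, while V4SImode (a vector mode) needs
   (16 + 16 - 1) / 16 == 1 register of FP_REGS.  */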
6031 static reg_class_t
6032 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6034 if (regclass == POINTER_REGS)
6035 return GENERAL_REGS;
6037 if (regclass == STACK_REG)
6039 if (REG_P(x)
6040 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6041 return regclass;
6043 return NO_REGS;
6046 /* Register elimination can result in a request for
6047 SP+constant->FP_REGS. We cannot support such operations, which
6048 use SP as the source and an FP_REG as the destination, so reject
6049 them outright now. */
6050 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6052 rtx lhs = XEXP (x, 0);
6054 /* Look through a possible SUBREG introduced by ILP32. */
6055 if (GET_CODE (lhs) == SUBREG)
6056 lhs = SUBREG_REG (lhs);
6058 gcc_assert (REG_P (lhs));
6059 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6060 POINTER_REGS));
6061 return NO_REGS;
6064 return regclass;
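/* As an example of the PLUS case above: after frame-pointer elimination,
   reload may ask to reload something like (plus (reg sp) (const_int 16))
   into FP_REGS.  There is no FP-register instruction that adds SP and an
   immediate, so returning NO_REGS forces the address to be computed into
   a general register first.  */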
6067 void
6068 aarch64_asm_output_labelref (FILE* f, const char *name)
6070 asm_fprintf (f, "%U%s", name);
6073 static void
6074 aarch64_elf_asm_constructor (rtx symbol, int priority)
6076 if (priority == DEFAULT_INIT_PRIORITY)
6077 default_ctor_section_asm_out_constructor (symbol, priority);
6078 else
6080 section *s;
6081 /* Although priority is known to be in the range [0, 65535], so 18 bytes
6082 would be enough, the compiler might not know that. To avoid a
6083 -Wformat-truncation false positive, use a larger size. */
6084 char buf[23];
6085 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6086 s = get_section (buf, SECTION_WRITE, NULL);
6087 switch_to_section (s);
6088 assemble_align (POINTER_SIZE);
6089 assemble_aligned_integer (POINTER_BYTES, symbol);
6093 static void
6094 aarch64_elf_asm_destructor (rtx symbol, int priority)
6096 if (priority == DEFAULT_INIT_PRIORITY)
6097 default_dtor_section_asm_out_destructor (symbol, priority);
6098 else
6100 section *s;
6101 /* Although priority is known to be in the range [0, 65535], so 18 bytes
6102 would be enough, the compiler might not know that. To avoid a
6103 -Wformat-truncation false positive, use a larger size. */
6104 char buf[23];
6105 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6106 s = get_section (buf, SECTION_WRITE, NULL);
6107 switch_to_section (s);
6108 assemble_align (POINTER_SIZE);
6109 assemble_aligned_integer (POINTER_BYTES, symbol);
6113 const char*
6114 aarch64_output_casesi (rtx *operands)
6116 char buf[100];
6117 char label[100];
6118 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6119 int index;
6120 static const char *const patterns[4][2] =
6123 "ldrb\t%w3, [%0,%w1,uxtw]",
6124 "add\t%3, %4, %w3, sxtb #2"
6127 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6128 "add\t%3, %4, %w3, sxth #2"
6131 "ldr\t%w3, [%0,%w1,uxtw #2]",
6132 "add\t%3, %4, %w3, sxtw #2"
6134 /* We assume that DImode is only generated when not optimizing and
6135 that we don't really need 64-bit address offsets. That would
6136 imply an object file with 8GB of code in a single function! */
6138 "ldr\t%w3, [%0,%w1,uxtw #2]",
6139 "add\t%3, %4, %w3, sxtw #2"
6143 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6145 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6146 index = exact_log2 (GET_MODE_SIZE (mode));
6148 gcc_assert (index >= 0 && index <= 3);
6150 /* Need to implement table size reduction, by changing the code below. */
6151 output_asm_insn (patterns[index][0], operands);
6152 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6153 snprintf (buf, sizeof (buf),
6154 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6155 output_asm_insn (buf, operands);
6156 output_asm_insn (patterns[index][1], operands);
6157 output_asm_insn ("br\t%3", operands);
6158 assemble_label (asm_out_file, label);
6159 return "";
6163 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6164 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6165 operator. */
6168 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6170 if (shift >= 0 && shift <= 3)
6172 int size;
6173 for (size = 8; size <= 32; size *= 2)
6175 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6176 if (mask == bits << shift)
6177 return size;
6180 return 0;
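/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, because
   0x3fc == 0xff << 2, so the operand is a byte value shifted left by 2
   and a UXTB extend applies.  aarch64_uxt_size (1, 0xff) returns 0,
   since 0xff is not 0xff, 0xffff or 0xffffffff shifted left by 1.  */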
6183 /* Constant pools are per function only when PC-relative
6184 literal loads are enabled or we are using the large memory
6185 model. */
6187 static inline bool
6188 aarch64_can_use_per_function_literal_pools_p (void)
6190 return (aarch64_pcrelative_literal_loads
6191 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6194 static bool
6195 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6197 /* FIXME: In an ideal world this would work similarly
6198 to the logic in aarch64_select_rtx_section, but this
6199 breaks bootstrap in gccgo. For now we work around
6200 this by returning false here. */
6201 return false;
6204 /* Select appropriate section for constants depending
6205 on where we place literal pools. */
6207 static section *
6208 aarch64_select_rtx_section (machine_mode mode,
6209 rtx x,
6210 unsigned HOST_WIDE_INT align)
6212 if (aarch64_can_use_per_function_literal_pools_p ())
6213 return function_section (current_function_decl);
6215 return default_elf_select_rtx_section (mode, x, align);
6218 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6219 void
6220 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6221 HOST_WIDE_INT offset)
6223 /* When using per-function literal pools, we must ensure that any code
6224 section is aligned to the minimal instruction length, lest we get
6225 errors from the assembler re "unaligned instructions". */
6226 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6227 ASM_OUTPUT_ALIGN (f, 2);
6230 /* Costs. */
6232 /* Helper function for rtx cost calculation. Strip a shift expression
6233 from X. Returns the inner operand if successful, or the original
6234 expression on failure. */
6235 static rtx
6236 aarch64_strip_shift (rtx x)
6238 rtx op = x;
6240 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6241 we can convert both to ROR during final output. */
6242 if ((GET_CODE (op) == ASHIFT
6243 || GET_CODE (op) == ASHIFTRT
6244 || GET_CODE (op) == LSHIFTRT
6245 || GET_CODE (op) == ROTATERT
6246 || GET_CODE (op) == ROTATE)
6247 && CONST_INT_P (XEXP (op, 1)))
6248 return XEXP (op, 0);
6250 if (GET_CODE (op) == MULT
6251 && CONST_INT_P (XEXP (op, 1))
6252 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6253 return XEXP (op, 0);
6255 return x;
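/* For example, both (ashift (reg x) (const_int 3)) and the equivalent
   (mult (reg x) (const_int 8)) strip down to (reg x), whereas
   (ashift (reg x) (reg y)) is returned unchanged because the shift
   amount is not a constant.  */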
6258 /* Helper function for rtx cost calculation. Strip an extend
6259 expression from X. Returns the inner operand if successful, or the
6260 original expression on failure. We deal with a number of possible
6261 canonicalization variations here. If STRIP_SHIFT is true, then
6262 we can strip off a shift also. */
6263 static rtx
6264 aarch64_strip_extend (rtx x, bool strip_shift)
6266 scalar_int_mode mode;
6267 rtx op = x;
6269 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6270 return op;
6272 /* Zero and sign extraction of a widened value. */
6273 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6274 && XEXP (op, 2) == const0_rtx
6275 && GET_CODE (XEXP (op, 0)) == MULT
6276 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6277 XEXP (op, 1)))
6278 return XEXP (XEXP (op, 0), 0);
6280 /* It can also be represented (for zero-extend) as an AND with an
6281 immediate. */
6282 if (GET_CODE (op) == AND
6283 && GET_CODE (XEXP (op, 0)) == MULT
6284 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6285 && CONST_INT_P (XEXP (op, 1))
6286 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6287 INTVAL (XEXP (op, 1))) != 0)
6288 return XEXP (XEXP (op, 0), 0);
6290 /* Now handle extended register, as this may also have an optional
6291 left shift by 1..4. */
6292 if (strip_shift
6293 && GET_CODE (op) == ASHIFT
6294 && CONST_INT_P (XEXP (op, 1))
6295 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6296 op = XEXP (op, 0);
6298 if (GET_CODE (op) == ZERO_EXTEND
6299 || GET_CODE (op) == SIGN_EXTEND)
6300 op = XEXP (op, 0);
6302 if (op != x)
6303 return op;
6305 return x;
6308 /* Return true iff CODE is a shift supported in combination
6309 with arithmetic instructions. */
6311 static bool
6312 aarch64_shift_p (enum rtx_code code)
6314 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6318 /* Return true iff X is a cheap shift without a sign extend. */
6320 static bool
6321 aarch64_cheap_mult_shift_p (rtx x)
6323 rtx op0, op1;
6325 op0 = XEXP (x, 0);
6326 op1 = XEXP (x, 1);
6328 if (!(aarch64_tune_params.extra_tuning_flags
6329 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6330 return false;
6332 if (GET_CODE (op0) == SIGN_EXTEND)
6333 return false;
6335 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6336 && UINTVAL (op1) <= 4)
6337 return true;
6339 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6340 return false;
6342 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6344 if (l2 > 0 && l2 <= 4)
6345 return true;
6347 return false;
6350 /* Helper function for rtx cost calculation. Calculate the cost of
6351 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6352 Return the calculated cost of the expression, recursing manually into
6353 operands where needed. */
6355 static int
6356 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6358 rtx op0, op1;
6359 const struct cpu_cost_table *extra_cost
6360 = aarch64_tune_params.insn_extra_cost;
6361 int cost = 0;
6362 bool compound_p = (outer == PLUS || outer == MINUS);
6363 machine_mode mode = GET_MODE (x);
6365 gcc_checking_assert (code == MULT);
6367 op0 = XEXP (x, 0);
6368 op1 = XEXP (x, 1);
6370 if (VECTOR_MODE_P (mode))
6371 mode = GET_MODE_INNER (mode);
6373 /* Integer multiply/fma. */
6374 if (GET_MODE_CLASS (mode) == MODE_INT)
6376 /* The multiply will be canonicalized as a shift, cost it as such. */
6377 if (aarch64_shift_p (GET_CODE (x))
6378 || (CONST_INT_P (op1)
6379 && exact_log2 (INTVAL (op1)) > 0))
6381 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6382 || GET_CODE (op0) == SIGN_EXTEND;
6383 if (speed)
6385 if (compound_p)
6387 /* If the shift is considered cheap,
6388 then don't add any cost. */
6389 if (aarch64_cheap_mult_shift_p (x))
6391 else if (REG_P (op1))
6392 /* ARITH + shift-by-register. */
6393 cost += extra_cost->alu.arith_shift_reg;
6394 else if (is_extend)
6395 /* ARITH + extended register. We don't have a cost field
6396 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6397 cost += extra_cost->alu.extend_arith;
6398 else
6399 /* ARITH + shift-by-immediate. */
6400 cost += extra_cost->alu.arith_shift;
6402 else
6403 /* LSL (immediate). */
6404 cost += extra_cost->alu.shift;
6407 /* Strip extends as we will have costed them in the case above. */
6408 if (is_extend)
6409 op0 = aarch64_strip_extend (op0, true);
6411 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6413 return cost;
6416 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6417 compound and let the below cases handle it. After all, MNEG is a
6418 special-case alias of MSUB. */
6419 if (GET_CODE (op0) == NEG)
6421 op0 = XEXP (op0, 0);
6422 compound_p = true;
6425 /* Integer multiplies or FMAs have zero/sign extending variants. */
6426 if ((GET_CODE (op0) == ZERO_EXTEND
6427 && GET_CODE (op1) == ZERO_EXTEND)
6428 || (GET_CODE (op0) == SIGN_EXTEND
6429 && GET_CODE (op1) == SIGN_EXTEND))
6431 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6432 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6434 if (speed)
6436 if (compound_p)
6437 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6438 cost += extra_cost->mult[0].extend_add;
6439 else
6440 /* MUL/SMULL/UMULL. */
6441 cost += extra_cost->mult[0].extend;
6444 return cost;
6447 /* This is either an integer multiply or a MADD. In both cases
6448 we want to recurse and cost the operands. */
6449 cost += rtx_cost (op0, mode, MULT, 0, speed);
6450 cost += rtx_cost (op1, mode, MULT, 1, speed);
6452 if (speed)
6454 if (compound_p)
6455 /* MADD/MSUB. */
6456 cost += extra_cost->mult[mode == DImode].add;
6457 else
6458 /* MUL. */
6459 cost += extra_cost->mult[mode == DImode].simple;
6462 return cost;
6464 else
6466 if (speed)
6468 /* Floating-point FMA/FMUL can also support negations of the
6469 operands, unless the rounding mode is upward or downward, in
6470 which case FNMUL is different from FMUL with operand negation. */
6471 bool neg0 = GET_CODE (op0) == NEG;
6472 bool neg1 = GET_CODE (op1) == NEG;
6473 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6475 if (neg0)
6476 op0 = XEXP (op0, 0);
6477 if (neg1)
6478 op1 = XEXP (op1, 0);
6481 if (compound_p)
6482 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6483 cost += extra_cost->fp[mode == DFmode].fma;
6484 else
6485 /* FMUL/FNMUL. */
6486 cost += extra_cost->fp[mode == DFmode].mult;
6489 cost += rtx_cost (op0, mode, MULT, 0, speed);
6490 cost += rtx_cost (op1, mode, MULT, 1, speed);
6491 return cost;
6495 static int
6496 aarch64_address_cost (rtx x,
6497 machine_mode mode,
6498 addr_space_t as ATTRIBUTE_UNUSED,
6499 bool speed)
6501 enum rtx_code c = GET_CODE (x);
6502 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6503 struct aarch64_address_info info;
6504 int cost = 0;
6505 info.shift = 0;
6507 if (!aarch64_classify_address (&info, x, mode, c, false))
6509 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6511 /* This is a CONST or SYMBOL ref which will be split
6512 in a different way depending on the code model in use.
6513 Cost it through the generic infrastructure. */
6514 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6515 /* Divide through by the cost of one instruction to
6516 bring it to the same units as the address costs. */
6517 cost_symbol_ref /= COSTS_N_INSNS (1);
6518 /* The cost is then the cost of preparing the address,
6519 followed by an immediate (possibly 0) offset. */
6520 return cost_symbol_ref + addr_cost->imm_offset;
6522 else
6524 /* This is most likely a jump table from a case
6525 statement. */
6526 return addr_cost->register_offset;
6530 switch (info.type)
6532 case ADDRESS_LO_SUM:
6533 case ADDRESS_SYMBOLIC:
6534 case ADDRESS_REG_IMM:
6535 cost += addr_cost->imm_offset;
6536 break;
6538 case ADDRESS_REG_WB:
6539 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6540 cost += addr_cost->pre_modify;
6541 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6542 cost += addr_cost->post_modify;
6543 else
6544 gcc_unreachable ();
6546 break;
6548 case ADDRESS_REG_REG:
6549 cost += addr_cost->register_offset;
6550 break;
6552 case ADDRESS_REG_SXTW:
6553 cost += addr_cost->register_sextend;
6554 break;
6556 case ADDRESS_REG_UXTW:
6557 cost += addr_cost->register_zextend;
6558 break;
6560 default:
6561 gcc_unreachable ();
6565 if (info.shift > 0)
6567 /* For the sake of calculating the cost of the shifted register
6568 component, we can treat same sized modes in the same way. */
6569 switch (GET_MODE_BITSIZE (mode))
6571 case 16:
6572 cost += addr_cost->addr_scale_costs.hi;
6573 break;
6575 case 32:
6576 cost += addr_cost->addr_scale_costs.si;
6577 break;
6579 case 64:
6580 cost += addr_cost->addr_scale_costs.di;
6581 break;
6583 /* We can't tell, or this is a 128-bit vector. */
6584 default:
6585 cost += addr_cost->addr_scale_costs.ti;
6586 break;
6590 return cost;
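/* For example, an SImode access through the address [x0, w1, sxtw #2]
   classifies as ADDRESS_REG_SXTW with a shift of 2, so its cost is
   addr_cost->register_sextend plus addr_cost->addr_scale_costs.si.  */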
6593 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6594 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6595 to be taken. */
6598 aarch64_branch_cost (bool speed_p, bool predictable_p)
6600 /* When optimizing for speed, use the cost of unpredictable branches. */
6601 const struct cpu_branch_cost *branch_costs =
6602 aarch64_tune_params.branch_costs;
6604 if (!speed_p || predictable_p)
6605 return branch_costs->predictable;
6606 else
6607 return branch_costs->unpredictable;
6610 /* Return true if the RTX X in mode MODE is a zero or sign extract
6611 usable in an ADD or SUB (extended register) instruction. */
6612 static bool
6613 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6615 /* Catch add with a sign extract.
6616 This is add_<optab><mode>_multp2. */
6617 if (GET_CODE (x) == SIGN_EXTRACT
6618 || GET_CODE (x) == ZERO_EXTRACT)
6620 rtx op0 = XEXP (x, 0);
6621 rtx op1 = XEXP (x, 1);
6622 rtx op2 = XEXP (x, 2);
6624 if (GET_CODE (op0) == MULT
6625 && CONST_INT_P (op1)
6626 && op2 == const0_rtx
6627 && CONST_INT_P (XEXP (op0, 1))
6628 && aarch64_is_extend_from_extract (mode,
6629 XEXP (op0, 1),
6630 op1))
6632 return true;
6635 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6636 No shift. */
6637 else if (GET_CODE (x) == SIGN_EXTEND
6638 || GET_CODE (x) == ZERO_EXTEND)
6639 return REG_P (XEXP (x, 0));
6641 return false;
6644 static bool
6645 aarch64_frint_unspec_p (unsigned int u)
6647 switch (u)
6649 case UNSPEC_FRINTZ:
6650 case UNSPEC_FRINTP:
6651 case UNSPEC_FRINTM:
6652 case UNSPEC_FRINTA:
6653 case UNSPEC_FRINTN:
6654 case UNSPEC_FRINTX:
6655 case UNSPEC_FRINTI:
6656 return true;
6658 default:
6659 return false;
6663 /* Return true iff X is an rtx that will match an extr instruction
6664 i.e. as described in the *extr<mode>5_insn family of patterns.
6665 OP0 and OP1 will be set to the operands of the shifts involved
6666 on success and will be NULL_RTX otherwise. */
6668 static bool
6669 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6671 rtx op0, op1;
6672 scalar_int_mode mode;
6673 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6674 return false;
6676 *res_op0 = NULL_RTX;
6677 *res_op1 = NULL_RTX;
6679 if (GET_CODE (x) != IOR)
6680 return false;
6682 op0 = XEXP (x, 0);
6683 op1 = XEXP (x, 1);
6685 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6686 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6688 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6689 if (GET_CODE (op1) == ASHIFT)
6690 std::swap (op0, op1);
6692 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6693 return false;
6695 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6696 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6698 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6699 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6701 *res_op0 = XEXP (op0, 0);
6702 *res_op1 = XEXP (op1, 0);
6703 return true;
6707 return false;
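/* For example, in DImode,
   (ior (ashift (reg a) (const_int 10)) (lshiftrt (reg b) (const_int 54)))
   satisfies 10 + 54 == 64, so *res_op0 is set to (reg a), *res_op1 to
   (reg b), and the expression can be emitted as a single EXTR.  */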
6710 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6711 storing it in *COST. Result is true if the total cost of the operation
6712 has now been calculated. */
6713 static bool
6714 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6716 rtx inner;
6717 rtx comparator;
6718 enum rtx_code cmpcode;
6720 if (COMPARISON_P (op0))
6722 inner = XEXP (op0, 0);
6723 comparator = XEXP (op0, 1);
6724 cmpcode = GET_CODE (op0);
6726 else
6728 inner = op0;
6729 comparator = const0_rtx;
6730 cmpcode = NE;
6733 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6735 /* Conditional branch. */
6736 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6737 return true;
6738 else
6740 if (cmpcode == NE || cmpcode == EQ)
6742 if (comparator == const0_rtx)
6744 /* TBZ/TBNZ/CBZ/CBNZ. */
6745 if (GET_CODE (inner) == ZERO_EXTRACT)
6746 /* TBZ/TBNZ. */
6747 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6748 ZERO_EXTRACT, 0, speed);
6749 else
6750 /* CBZ/CBNZ. */
6751 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6753 return true;
6756 else if (cmpcode == LT || cmpcode == GE)
6758 /* TBZ/TBNZ. */
6759 if (comparator == const0_rtx)
6760 return true;
6764 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6766 /* CCMP. */
6767 if (GET_CODE (op1) == COMPARE)
6769 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6770 if (XEXP (op1, 1) == const0_rtx)
6771 *cost += 1;
6772 if (speed)
6774 machine_mode mode = GET_MODE (XEXP (op1, 0));
6775 const struct cpu_cost_table *extra_cost
6776 = aarch64_tune_params.insn_extra_cost;
6778 if (GET_MODE_CLASS (mode) == MODE_INT)
6779 *cost += extra_cost->alu.arith;
6780 else
6781 *cost += extra_cost->fp[mode == DFmode].compare;
6783 return true;
6786 /* It's a conditional operation based on the status flags,
6787 so it must be some flavor of CSEL. */
6789 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6790 if (GET_CODE (op1) == NEG
6791 || GET_CODE (op1) == NOT
6792 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6793 op1 = XEXP (op1, 0);
6794 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6796 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6797 op1 = XEXP (op1, 0);
6798 op2 = XEXP (op2, 0);
6801 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6802 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6803 return true;
6806 /* We don't know what this is, cost all operands. */
6807 return false;
6810 /* Check whether X is a bitfield operation of the form shift + extend that
6811 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6812 operand to which the bitfield operation is applied. Otherwise return
6813 NULL_RTX. */
6815 static rtx
6816 aarch64_extend_bitfield_pattern_p (rtx x)
6818 rtx_code outer_code = GET_CODE (x);
6819 machine_mode outer_mode = GET_MODE (x);
6821 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6822 && outer_mode != SImode && outer_mode != DImode)
6823 return NULL_RTX;
6825 rtx inner = XEXP (x, 0);
6826 rtx_code inner_code = GET_CODE (inner);
6827 machine_mode inner_mode = GET_MODE (inner);
6828 rtx op = NULL_RTX;
6830 switch (inner_code)
6832 case ASHIFT:
6833 if (CONST_INT_P (XEXP (inner, 1))
6834 && (inner_mode == QImode || inner_mode == HImode))
6835 op = XEXP (inner, 0);
6836 break;
6837 case LSHIFTRT:
6838 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6839 && (inner_mode == QImode || inner_mode == HImode))
6840 op = XEXP (inner, 0);
6841 break;
6842 case ASHIFTRT:
6843 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6844 && (inner_mode == QImode || inner_mode == HImode))
6845 op = XEXP (inner, 0);
6846 break;
6847 default:
6848 break;
6851 return op;
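/* For example, (zero_extend:SI (lshiftrt:HI (reg r) (const_int 3)))
   takes the LSHIFTRT case above and returns (reg r); the shift plus
   zero-extension maps onto a single UBFX.  An SImode inner operand
   would return NULL_RTX, since only QImode and HImode are handled.  */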
6854 /* Return true if the mask and a shift amount from an RTX of the form
6855 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6856 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6858 bool
6859 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6860 rtx shft_amnt)
6862 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6863 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6864 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6865 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
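/* For example, in SImode with shft_amnt == 4 and mask == 0x3f0:
   4 < 32, (0x3f0 >> 4) + 1 == 0x40 is a power of two, and no mask bit
   lies below the shift amount, so the AND of the ASHIFT can become
   UBFIZ wd, ws, #4, #6.  With mask == 0x3f8 the function returns false,
   because bit 3 lies below the shift amount.  */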
6868 /* Calculate the cost of calculating X, storing it in *COST. Result
6869 is true if the total cost of the operation has now been calculated. */
6870 static bool
6871 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6872 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6874 rtx op0, op1, op2;
6875 const struct cpu_cost_table *extra_cost
6876 = aarch64_tune_params.insn_extra_cost;
6877 int code = GET_CODE (x);
6878 scalar_int_mode int_mode;
6880 /* By default, assume that everything has equivalent cost to the
6881 cheapest instruction. Any additional costs are applied as a delta
6882 above this default. */
6883 *cost = COSTS_N_INSNS (1);
6885 switch (code)
6887 case SET:
6888 /* The cost depends entirely on the operands to SET. */
6889 *cost = 0;
6890 op0 = SET_DEST (x);
6891 op1 = SET_SRC (x);
6893 switch (GET_CODE (op0))
6895 case MEM:
6896 if (speed)
6898 rtx address = XEXP (op0, 0);
6899 if (VECTOR_MODE_P (mode))
6900 *cost += extra_cost->ldst.storev;
6901 else if (GET_MODE_CLASS (mode) == MODE_INT)
6902 *cost += extra_cost->ldst.store;
6903 else if (mode == SFmode)
6904 *cost += extra_cost->ldst.storef;
6905 else if (mode == DFmode)
6906 *cost += extra_cost->ldst.stored;
6908 *cost +=
6909 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6910 0, speed));
6913 *cost += rtx_cost (op1, mode, SET, 1, speed);
6914 return true;
6916 case SUBREG:
6917 if (! REG_P (SUBREG_REG (op0)))
6918 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6920 /* Fall through. */
6921 case REG:
6922 /* The cost is one per vector-register copied. */
6923 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6925 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6926 / GET_MODE_SIZE (V4SImode);
6927 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6929 /* const0_rtx is in general free, but we will use an
6930 instruction to set a register to 0. */
6931 else if (REG_P (op1) || op1 == const0_rtx)
6933 /* The cost is 1 per register copied. */
6934 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6935 / UNITS_PER_WORD;
6936 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6938 else
6939 /* Cost is just the cost of the RHS of the set. */
6940 *cost += rtx_cost (op1, mode, SET, 1, speed);
6941 return true;
6943 case ZERO_EXTRACT:
6944 case SIGN_EXTRACT:
6945 /* Bit-field insertion. Strip any redundant widening of
6946 the RHS to meet the width of the target. */
6947 if (GET_CODE (op1) == SUBREG)
6948 op1 = SUBREG_REG (op1);
6949 if ((GET_CODE (op1) == ZERO_EXTEND
6950 || GET_CODE (op1) == SIGN_EXTEND)
6951 && CONST_INT_P (XEXP (op0, 1))
6952 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6953 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6954 op1 = XEXP (op1, 0);
6956 if (CONST_INT_P (op1))
6958 /* MOV immediate is assumed to always be cheap. */
6959 *cost = COSTS_N_INSNS (1);
6961 else
6963 /* BFM. */
6964 if (speed)
6965 *cost += extra_cost->alu.bfi;
6966 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6969 return true;
6971 default:
6972 /* We can't make sense of this, assume default cost. */
6973 *cost = COSTS_N_INSNS (1);
6974 return false;
6976 return false;
6978 case CONST_INT:
6979 /* If an instruction can incorporate a constant within the
6980 instruction, the instruction's expression avoids calling
6981 rtx_cost() on the constant. If rtx_cost() is called on a
6982 constant, then it is usually because the constant must be
6983 moved into a register by one or more instructions.
6985 The exception is constant 0, which can be expressed
6986 as XZR/WZR and is therefore free. The exception to this is
6987 if we have (set (reg) (const0_rtx)) in which case we must cost
6988 the move. However, we can catch that when we cost the SET, so
6989 we don't need to consider that here. */
6990 if (x == const0_rtx)
6991 *cost = 0;
6992 else
6994 /* To an approximation, building any other constant is
6995 proportionally expensive to the number of instructions
6996 required to build that constant. This is true whether we
6997 are compiling for SPEED or otherwise. */
6998 if (!is_a <scalar_int_mode> (mode, &int_mode))
6999 int_mode = word_mode;
7000 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7001 (NULL_RTX, x, false, int_mode));
7003 return true;
7005 case CONST_DOUBLE:
7007 /* First determine number of instructions to do the move
7008 as an integer constant. */
7009 if (!aarch64_float_const_representable_p (x)
7010 && !aarch64_can_const_movi_rtx_p (x, mode)
7011 && aarch64_float_const_rtx_p (x))
7013 unsigned HOST_WIDE_INT ival;
7014 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7015 gcc_assert (succeed);
7017 scalar_int_mode imode = (mode == HFmode
7018 ? SImode
7019 : int_mode_for_mode (mode).require ());
7020 int ncost = aarch64_internal_mov_immediate
7021 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7022 *cost += COSTS_N_INSNS (ncost);
7023 return true;
7026 if (speed)
7028 /* mov[df,sf]_aarch64. */
7029 if (aarch64_float_const_representable_p (x))
7030 /* FMOV (scalar immediate). */
7031 *cost += extra_cost->fp[mode == DFmode].fpconst;
7032 else if (!aarch64_float_const_zero_rtx_p (x))
7034 /* This will be a load from memory. */
7035 if (mode == DFmode)
7036 *cost += extra_cost->ldst.loadd;
7037 else
7038 *cost += extra_cost->ldst.loadf;
7040 else
7041 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7042 or MOV v0.s[0], wzr, neither of which is modeled by the
7043 cost tables. Just use the default cost. */
7048 return true;
7050 case MEM:
7051 if (speed)
7053 /* For loads we want the base cost of a load, plus an
7054 approximation for the additional cost of the addressing
7055 mode. */
7056 rtx address = XEXP (x, 0);
7057 if (VECTOR_MODE_P (mode))
7058 *cost += extra_cost->ldst.loadv;
7059 else if (GET_MODE_CLASS (mode) == MODE_INT)
7060 *cost += extra_cost->ldst.load;
7061 else if (mode == SFmode)
7062 *cost += extra_cost->ldst.loadf;
7063 else if (mode == DFmode)
7064 *cost += extra_cost->ldst.loadd;
7066 *cost +=
7067 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7068 0, speed));
7071 return true;
7073 case NEG:
7074 op0 = XEXP (x, 0);
7076 if (VECTOR_MODE_P (mode))
7078 if (speed)
7080 /* FNEG. */
7081 *cost += extra_cost->vect.alu;
7083 return false;
7086 if (GET_MODE_CLASS (mode) == MODE_INT)
7088 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7089 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7091 /* CSETM. */
7092 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7093 return true;
7096 /* Cost this as SUB wzr, X. */
7097 op0 = CONST0_RTX (mode);
7098 op1 = XEXP (x, 0);
7099 goto cost_minus;
7102 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7104 /* Support (neg(fma...)) as a single instruction only if
7105 sign of zeros is unimportant. This matches the decision
7106 making in aarch64.md. */
7107 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7109 /* FNMADD. */
7110 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7111 return true;
7113 if (GET_CODE (op0) == MULT)
7115 /* FNMUL. */
7116 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7117 return true;
7119 if (speed)
7120 /* FNEG. */
7121 *cost += extra_cost->fp[mode == DFmode].neg;
7122 return false;
7125 return false;
7127 case CLRSB:
7128 case CLZ:
7129 if (speed)
7131 if (VECTOR_MODE_P (mode))
7132 *cost += extra_cost->vect.alu;
7133 else
7134 *cost += extra_cost->alu.clz;
7137 return false;
7139 case COMPARE:
7140 op0 = XEXP (x, 0);
7141 op1 = XEXP (x, 1);
7143 if (op1 == const0_rtx
7144 && GET_CODE (op0) == AND)
7146 x = op0;
7147 mode = GET_MODE (op0);
7148 goto cost_logic;
7151 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7153 /* TODO: A write to the CC flags possibly costs extra; this
7154 needs encoding in the cost tables. */
7156 mode = GET_MODE (op0);
7157 /* ANDS. */
7158 if (GET_CODE (op0) == AND)
7160 x = op0;
7161 goto cost_logic;
7164 if (GET_CODE (op0) == PLUS)
7166 /* ADDS (and CMN alias). */
7167 x = op0;
7168 goto cost_plus;
7171 if (GET_CODE (op0) == MINUS)
7173 /* SUBS. */
7174 x = op0;
7175 goto cost_minus;
7178 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7179 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7180 && CONST_INT_P (XEXP (op0, 2)))
7182 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7183 Handle it here directly rather than going to cost_logic
7184 since we know the immediate generated for the TST is valid
7185 so we can avoid creating an intermediate rtx for it only
7186 for costing purposes. */
7187 if (speed)
7188 *cost += extra_cost->alu.logical;
7190 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7191 ZERO_EXTRACT, 0, speed);
7192 return true;
7195 if (GET_CODE (op1) == NEG)
7197 /* CMN. */
7198 if (speed)
7199 *cost += extra_cost->alu.arith;
7201 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7202 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7203 return true;
7206 /* CMP.
7208 Compare can freely swap the order of operands, and
7209 canonicalization puts the more complex operation first.
7210 But the integer MINUS logic expects the shift/extend
7211 operation in op1. */
7212 if (! (REG_P (op0)
7213 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7215 op0 = XEXP (x, 1);
7216 op1 = XEXP (x, 0);
7218 goto cost_minus;
7221 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7223 /* FCMP. */
7224 if (speed)
7225 *cost += extra_cost->fp[mode == DFmode].compare;
7227 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7229 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7230 /* FCMP supports constant 0.0 for no extra cost. */
7231 return true;
7233 return false;
7236 if (VECTOR_MODE_P (mode))
7238 /* Vector compare. */
7239 if (speed)
7240 *cost += extra_cost->vect.alu;
7242 if (aarch64_float_const_zero_rtx_p (op1))
7244 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7245 cost. */
7246 return true;
7248 return false;
7250 return false;
7252 case MINUS:
7254 op0 = XEXP (x, 0);
7255 op1 = XEXP (x, 1);
7257 cost_minus:
7258 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7260 /* Detect valid immediates. */
7261 if ((GET_MODE_CLASS (mode) == MODE_INT
7262 || (GET_MODE_CLASS (mode) == MODE_CC
7263 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7264 && CONST_INT_P (op1)
7265 && aarch64_uimm12_shift (INTVAL (op1)))
7267 if (speed)
7268 /* SUB(S) (immediate). */
7269 *cost += extra_cost->alu.arith;
7270 return true;
7273 /* Look for SUB (extended register). */
7274 if (is_a <scalar_int_mode> (mode, &int_mode)
7275 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7277 if (speed)
7278 *cost += extra_cost->alu.extend_arith;
7280 op1 = aarch64_strip_extend (op1, true);
7281 *cost += rtx_cost (op1, VOIDmode,
7282 (enum rtx_code) GET_CODE (op1), 0, speed);
7283 return true;
7286 rtx new_op1 = aarch64_strip_extend (op1, false);
7288 /* Cost this as an FMA-alike operation. */
7289 if ((GET_CODE (new_op1) == MULT
7290 || aarch64_shift_p (GET_CODE (new_op1)))
7291 && code != COMPARE)
7293 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7294 (enum rtx_code) code,
7295 speed);
7296 return true;
7299 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7301 if (speed)
7303 if (VECTOR_MODE_P (mode))
7305 /* Vector SUB. */
7306 *cost += extra_cost->vect.alu;
7308 else if (GET_MODE_CLASS (mode) == MODE_INT)
7310 /* SUB(S). */
7311 *cost += extra_cost->alu.arith;
7313 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7315 /* FSUB. */
7316 *cost += extra_cost->fp[mode == DFmode].addsub;
7319 return true;
7322 case PLUS:
7324 rtx new_op0;
7326 op0 = XEXP (x, 0);
7327 op1 = XEXP (x, 1);
7329 cost_plus:
7330 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7331 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7333 /* CSINC. */
7334 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7335 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7336 return true;
7339 if (GET_MODE_CLASS (mode) == MODE_INT
7340 && CONST_INT_P (op1)
7341 && aarch64_uimm12_shift (INTVAL (op1)))
7343 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7345 if (speed)
7346 /* ADD (immediate). */
7347 *cost += extra_cost->alu.arith;
7348 return true;
7351 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7353 /* Look for ADD (extended register). */
7354 if (is_a <scalar_int_mode> (mode, &int_mode)
7355 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7357 if (speed)
7358 *cost += extra_cost->alu.extend_arith;
7360 op0 = aarch64_strip_extend (op0, true);
7361 *cost += rtx_cost (op0, VOIDmode,
7362 (enum rtx_code) GET_CODE (op0), 0, speed);
7363 return true;
7366 /* Strip any extend, leave shifts behind as we will
7367 cost them through mult_cost. */
7368 new_op0 = aarch64_strip_extend (op0, false);
7370 if (GET_CODE (new_op0) == MULT
7371 || aarch64_shift_p (GET_CODE (new_op0)))
7373 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7374 speed);
7375 return true;
7378 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7380 if (speed)
7382 if (VECTOR_MODE_P (mode))
7384 /* Vector ADD. */
7385 *cost += extra_cost->vect.alu;
7387 else if (GET_MODE_CLASS (mode) == MODE_INT)
7389 /* ADD. */
7390 *cost += extra_cost->alu.arith;
7392 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7394 /* FADD. */
7395 *cost += extra_cost->fp[mode == DFmode].addsub;
7398 return true;
7401 case BSWAP:
7402 *cost = COSTS_N_INSNS (1);
7404 if (speed)
7406 if (VECTOR_MODE_P (mode))
7407 *cost += extra_cost->vect.alu;
7408 else
7409 *cost += extra_cost->alu.rev;
7411 return false;
7413 case IOR:
7414 if (aarch_rev16_p (x))
7416 *cost = COSTS_N_INSNS (1);
7418 if (speed)
7420 if (VECTOR_MODE_P (mode))
7421 *cost += extra_cost->vect.alu;
7422 else
7423 *cost += extra_cost->alu.rev;
7425 return true;
7428 if (aarch64_extr_rtx_p (x, &op0, &op1))
7430 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7431 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7432 if (speed)
7433 *cost += extra_cost->alu.shift;
7435 return true;
7437 /* Fall through. */
7438 case XOR:
7439 case AND:
7440 cost_logic:
7441 op0 = XEXP (x, 0);
7442 op1 = XEXP (x, 1);
7444 if (VECTOR_MODE_P (mode))
7446 if (speed)
7447 *cost += extra_cost->vect.alu;
7448 return true;
7451 if (code == AND
7452 && GET_CODE (op0) == MULT
7453 && CONST_INT_P (XEXP (op0, 1))
7454 && CONST_INT_P (op1)
7455 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7456 INTVAL (op1)) != 0)
7458 /* This is a UBFM/SBFM. */
7459 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7460 if (speed)
7461 *cost += extra_cost->alu.bfx;
7462 return true;
7465 if (is_int_mode (mode, &int_mode))
7467 if (CONST_INT_P (op1))
7469 /* We have a mask + shift version of a UBFIZ
7470 i.e. the *andim_ashift<mode>_bfiz pattern. */
7471 if (GET_CODE (op0) == ASHIFT
7472 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7473 XEXP (op0, 1)))
7475 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7476 (enum rtx_code) code, 0, speed);
7477 if (speed)
7478 *cost += extra_cost->alu.bfx;
7480 return true;
7482 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7484 /* We possibly get the immediate for free, but this is not
7485 modelled. */
7486 *cost += rtx_cost (op0, int_mode,
7487 (enum rtx_code) code, 0, speed);
7488 if (speed)
7489 *cost += extra_cost->alu.logical;
7491 return true;
7494 else
7496 rtx new_op0 = op0;
7498 /* Handle ORN, EON, or BIC. */
7499 if (GET_CODE (op0) == NOT)
7500 op0 = XEXP (op0, 0);
7502 new_op0 = aarch64_strip_shift (op0);
7504 /* If we had a shift on op0 then this is a logical-shift-
7505 by-register/immediate operation. Otherwise, this is just
7506 a logical operation. */
7507 if (speed)
7509 if (new_op0 != op0)
7511 /* Shift by immediate. */
7512 if (CONST_INT_P (XEXP (op0, 1)))
7513 *cost += extra_cost->alu.log_shift;
7514 else
7515 *cost += extra_cost->alu.log_shift_reg;
7517 else
7518 *cost += extra_cost->alu.logical;
7521 /* In both cases we want to cost both operands. */
7522 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7523 0, speed);
7524 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7525 1, speed);
7527 return true;
7530 return false;
7532 case NOT:
7533 x = XEXP (x, 0);
7534 op0 = aarch64_strip_shift (x);
7536 if (VECTOR_MODE_P (mode))
7538 /* Vector NOT. */
7539 *cost += extra_cost->vect.alu;
7540 return false;
7543 /* MVN-shifted-reg. */
7544 if (op0 != x)
7546 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7548 if (speed)
7549 *cost += extra_cost->alu.log_shift;
7551 return true;
7553 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7554 Handle the second form here taking care that 'a' in the above can
7555 be a shift. */
7556 else if (GET_CODE (op0) == XOR)
7558 rtx newop0 = XEXP (op0, 0);
7559 rtx newop1 = XEXP (op0, 1);
7560 rtx op0_stripped = aarch64_strip_shift (newop0);
7562 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7563 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7565 if (speed)
7567 if (op0_stripped != newop0)
7568 *cost += extra_cost->alu.log_shift;
7569 else
7570 *cost += extra_cost->alu.logical;
7573 return true;
7575 /* MVN. */
7576 if (speed)
7577 *cost += extra_cost->alu.logical;
7579 return false;
7581 case ZERO_EXTEND:
7583 op0 = XEXP (x, 0);
7584 /* If a value is written in SI mode, then zero extended to DI
7585 mode, the operation will in general be free as a write to
7586 a 'w' register implicitly zeroes the upper bits of an 'x'
7587 register. However, if this is
7589 (set (reg) (zero_extend (reg)))
7591 we must cost the explicit register move. */
7592 if (mode == DImode
7593 && GET_MODE (op0) == SImode
7594 && outer == SET)
7596 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7598 /* If OP_COST is non-zero, then the cost of the zero extend
7599 is effectively the cost of the inner operation. Otherwise
7600 we have a MOV instruction and we take the cost from the MOV
7601 itself. This is true independently of whether we are
7602 optimizing for space or time. */
7603 if (op_cost)
7604 *cost = op_cost;
7606 return true;
7608 else if (MEM_P (op0))
7610 /* All loads can zero extend to any size for free. */
7611 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7612 return true;
7615 op0 = aarch64_extend_bitfield_pattern_p (x);
7616 if (op0)
7618 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7619 if (speed)
7620 *cost += extra_cost->alu.bfx;
7621 return true;
7624 if (speed)
7626 if (VECTOR_MODE_P (mode))
7628 /* UMOV. */
7629 *cost += extra_cost->vect.alu;
7631 else
7633 /* We generate an AND instead of UXTB/UXTH. */
7634 *cost += extra_cost->alu.logical;
7637 return false;
7639 case SIGN_EXTEND:
7640 if (MEM_P (XEXP (x, 0)))
7642 /* LDRSH. */
7643 if (speed)
7645 rtx address = XEXP (XEXP (x, 0), 0);
7646 *cost += extra_cost->ldst.load_sign_extend;
7648 *cost +=
7649 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7650 0, speed));
7652 return true;
7655 op0 = aarch64_extend_bitfield_pattern_p (x);
7656 if (op0)
7658 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7659 if (speed)
7660 *cost += extra_cost->alu.bfx;
7661 return true;
7664 if (speed)
7666 if (VECTOR_MODE_P (mode))
7667 *cost += extra_cost->vect.alu;
7668 else
7669 *cost += extra_cost->alu.extend;
7671 return false;
7673 case ASHIFT:
7674 op0 = XEXP (x, 0);
7675 op1 = XEXP (x, 1);
7677 if (CONST_INT_P (op1))
7679 if (speed)
7681 if (VECTOR_MODE_P (mode))
7683 /* Vector shift (immediate). */
7684 *cost += extra_cost->vect.alu;
7686 else
7688 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7689 aliases. */
7690 *cost += extra_cost->alu.shift;
7694 /* We can incorporate zero/sign extend for free. */
7695 if (GET_CODE (op0) == ZERO_EXTEND
7696 || GET_CODE (op0) == SIGN_EXTEND)
7697 op0 = XEXP (op0, 0);
7699 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7700 return true;
7702 else
7704 if (VECTOR_MODE_P (mode))
7706 if (speed)
7707 /* Vector shift (register). */
7708 *cost += extra_cost->vect.alu;
7710 else
7712 if (speed)
7713 /* LSLV. */
7714 *cost += extra_cost->alu.shift_reg;
7716 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7717 && CONST_INT_P (XEXP (op1, 1))
7718 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7720 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7721 /* We already demanded XEXP (op1, 0) to be REG_P, so
7722 don't recurse into it. */
7723 return true;
7726 return false; /* All arguments need to be in registers. */
7729 case ROTATE:
7730 case ROTATERT:
7731 case LSHIFTRT:
7732 case ASHIFTRT:
7733 op0 = XEXP (x, 0);
7734 op1 = XEXP (x, 1);
7736 if (CONST_INT_P (op1))
7738 /* ASR (immediate) and friends. */
7739 if (speed)
7741 if (VECTOR_MODE_P (mode))
7742 *cost += extra_cost->vect.alu;
7743 else
7744 *cost += extra_cost->alu.shift;
7747 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7748 return true;
7750 else
7752 if (VECTOR_MODE_P (mode))
7754 if (speed)
7755 /* Vector shift (register). */
7756 *cost += extra_cost->vect.alu;
7758 else
7760 if (speed)
7761 /* ASR (register) and friends. */
7762 *cost += extra_cost->alu.shift_reg;
7764 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7765 && CONST_INT_P (XEXP (op1, 1))
7766 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7768 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7769 /* We already demanded XEXP (op1, 0) to be REG_P, so
7770 don't recurse into it. */
7771 return true;
7774 return false; /* All arguments need to be in registers. */
7777 case SYMBOL_REF:
7779 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7780 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7782 /* LDR. */
7783 if (speed)
7784 *cost += extra_cost->ldst.load;
7786 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7787 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7789 /* ADRP, followed by ADD. */
7790 *cost += COSTS_N_INSNS (1);
7791 if (speed)
7792 *cost += 2 * extra_cost->alu.arith;
7794 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7795 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7797 /* ADR. */
7798 if (speed)
7799 *cost += extra_cost->alu.arith;
7802 if (flag_pic)
7804 /* One extra load instruction, after accessing the GOT. */
7805 *cost += COSTS_N_INSNS (1);
7806 if (speed)
7807 *cost += extra_cost->ldst.load;
7809 return true;
7811 case HIGH:
7812 case LO_SUM:
7813 /* ADRP/ADD (immediate). */
7814 if (speed)
7815 *cost += extra_cost->alu.arith;
7816 return true;
7818 case ZERO_EXTRACT:
7819 case SIGN_EXTRACT:
7820 /* UBFX/SBFX. */
7821 if (speed)
7823 if (VECTOR_MODE_P (mode))
7824 *cost += extra_cost->vect.alu;
7825 else
7826 *cost += extra_cost->alu.bfx;
7829 /* We can trust that the immediates used will be correct (there
7830 are no by-register forms), so we need only cost op0. */
7831 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7832 return true;
7834 case MULT:
7835 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7836 /* aarch64_rtx_mult_cost always handles recursion to its
7837 operands. */
7838 return true;
7840 case MOD:
7841 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
7842 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
7843 an unconditional negate. This case should only ever be reached through
7844 the set_smod_pow2_cheap check in expmed.c. */
7845 if (CONST_INT_P (XEXP (x, 1))
7846 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7847 && (mode == SImode || mode == DImode))
7849 /* We expand to 4 instructions. Reset the baseline. */
7850 *cost = COSTS_N_INSNS (4);
7852 if (speed)
7853 *cost += 2 * extra_cost->alu.logical
7854 + 2 * extra_cost->alu.arith;
7856 return true;
7859 /* Fall-through. */
7860 case UMOD:
7861 if (speed)
7863 /* Slightly prefer UMOD over SMOD. */
7864 if (VECTOR_MODE_P (mode))
7865 *cost += extra_cost->vect.alu;
7866 else if (GET_MODE_CLASS (mode) == MODE_INT)
7867 *cost += (extra_cost->mult[mode == DImode].add
7868 + extra_cost->mult[mode == DImode].idiv
7869 + (code == MOD ? 1 : 0));
7871 return false; /* All arguments need to be in registers. */
7873 case DIV:
7874 case UDIV:
7875 case SQRT:
7876 if (speed)
7878 if (VECTOR_MODE_P (mode))
7879 *cost += extra_cost->vect.alu;
7880 else if (GET_MODE_CLASS (mode) == MODE_INT)
7881 /* There is no integer SQRT, so only DIV and UDIV can get
7882 here. */
7883 *cost += (extra_cost->mult[mode == DImode].idiv
7884 /* Slightly prefer UDIV over SDIV. */
7885 + (code == DIV ? 1 : 0));
7886 else
7887 *cost += extra_cost->fp[mode == DFmode].div;
7889 return false; /* All arguments need to be in registers. */
7891 case IF_THEN_ELSE:
7892 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7893 XEXP (x, 2), cost, speed);
7895 case EQ:
7896 case NE:
7897 case GT:
7898 case GTU:
7899 case LT:
7900 case LTU:
7901 case GE:
7902 case GEU:
7903 case LE:
7904 case LEU:
7906 return false; /* All arguments must be in registers. */
7908 case FMA:
7909 op0 = XEXP (x, 0);
7910 op1 = XEXP (x, 1);
7911 op2 = XEXP (x, 2);
7913 if (speed)
7915 if (VECTOR_MODE_P (mode))
7916 *cost += extra_cost->vect.alu;
7917 else
7918 *cost += extra_cost->fp[mode == DFmode].fma;
7921 /* FMSUB, FNMADD, and FNMSUB are free. */
7922 if (GET_CODE (op0) == NEG)
7923 op0 = XEXP (op0, 0);
7925 if (GET_CODE (op2) == NEG)
7926 op2 = XEXP (op2, 0);
7928 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7929 and the by-element operand as operand 0. */
7930 if (GET_CODE (op1) == NEG)
7931 op1 = XEXP (op1, 0);
7933 /* Catch vector-by-element operations. The by-element operand can
7934 either be (vec_duplicate (vec_select (x))) or just
7935 (vec_select (x)), depending on whether we are multiplying by
7936 a vector or a scalar.
7938 Canonicalization is not very good in these cases: FMA4 will put the
7939 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7940 if (GET_CODE (op0) == VEC_DUPLICATE)
7941 op0 = XEXP (op0, 0);
7942 else if (GET_CODE (op1) == VEC_DUPLICATE)
7943 op1 = XEXP (op1, 0);
7945 if (GET_CODE (op0) == VEC_SELECT)
7946 op0 = XEXP (op0, 0);
7947 else if (GET_CODE (op1) == VEC_SELECT)
7948 op1 = XEXP (op1, 0);
7950 /* If the remaining parameters are not registers,
7951 get the cost to put them into registers. */
7952 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7953 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7954 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7955 return true;
7957 case FLOAT:
7958 case UNSIGNED_FLOAT:
7959 if (speed)
7960 *cost += extra_cost->fp[mode == DFmode].fromint;
7961 return false;
7963 case FLOAT_EXTEND:
7964 if (speed)
7966 if (VECTOR_MODE_P (mode))
7968 /* Vector widen. */
7969 *cost += extra_cost->vect.alu;
7971 else
7972 *cost += extra_cost->fp[mode == DFmode].widen;
7974 return false;
7976 case FLOAT_TRUNCATE:
7977 if (speed)
7979 if (VECTOR_MODE_P (mode))
7981 /* Vector conversion. */
7982 *cost += extra_cost->vect.alu;
7984 else
7985 *cost += extra_cost->fp[mode == DFmode].narrow;
7987 return false;
7989 case FIX:
7990 case UNSIGNED_FIX:
7991 x = XEXP (x, 0);
7992 /* Strip the rounding part. They will all be implemented
7993 by the fcvt* family of instructions anyway. */
7994 if (GET_CODE (x) == UNSPEC)
7996 unsigned int uns_code = XINT (x, 1);
7998 if (uns_code == UNSPEC_FRINTA
7999 || uns_code == UNSPEC_FRINTM
8000 || uns_code == UNSPEC_FRINTN
8001 || uns_code == UNSPEC_FRINTP
8002 || uns_code == UNSPEC_FRINTZ)
8003 x = XVECEXP (x, 0, 0);
8006 if (speed)
8008 if (VECTOR_MODE_P (mode))
8009 *cost += extra_cost->vect.alu;
8010 else
8011 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8014 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8015 fixed-point fcvt. */
8016 if (GET_CODE (x) == MULT
8017 && ((VECTOR_MODE_P (mode)
8018 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8019 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8021 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8022 0, speed);
8023 return true;
8026 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8027 return true;
8029 case ABS:
8030 if (VECTOR_MODE_P (mode))
8032 /* ABS (vector). */
8033 if (speed)
8034 *cost += extra_cost->vect.alu;
8036 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8038 op0 = XEXP (x, 0);
8040 /* FABD, which is analogous to FADD. */
8041 if (GET_CODE (op0) == MINUS)
8043 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8044 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8045 if (speed)
8046 *cost += extra_cost->fp[mode == DFmode].addsub;
8048 return true;
8050 /* Simple FABS is analogous to FNEG. */
8051 if (speed)
8052 *cost += extra_cost->fp[mode == DFmode].neg;
8054 else
8056 /* Integer ABS will either be split into
8057 two arithmetic instructions, or will be an ABS
8058 (scalar), which we don't model. */
8059 *cost = COSTS_N_INSNS (2);
8060 if (speed)
8061 *cost += 2 * extra_cost->alu.arith;
8063 return false;
8065 case SMAX:
8066 case SMIN:
8067 if (speed)
8069 if (VECTOR_MODE_P (mode))
8070 *cost += extra_cost->vect.alu;
8071 else
8073 /* FMAXNM/FMINNM/FMAX/FMIN.
8074 TODO: This may not be accurate for all implementations, but
8075 we do not model this in the cost tables. */
8076 *cost += extra_cost->fp[mode == DFmode].addsub;
8079 return false;
8081 case UNSPEC:
8082 /* The floating point round to integer frint* instructions. */
8083 if (aarch64_frint_unspec_p (XINT (x, 1)))
8085 if (speed)
8086 *cost += extra_cost->fp[mode == DFmode].roundint;
8088 return false;
8091 if (XINT (x, 1) == UNSPEC_RBIT)
8093 if (speed)
8094 *cost += extra_cost->alu.rev;
8096 return false;
8098 break;
8100 case TRUNCATE:
8102 /* Decompose <su>muldi3_highpart. */
8103 if (/* (truncate:DI */
8104 mode == DImode
8105 /* (lshiftrt:TI */
8106 && GET_MODE (XEXP (x, 0)) == TImode
8107 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8108 /* (mult:TI */
8109 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8110 /* (ANY_EXTEND:TI (reg:DI))
8111 (ANY_EXTEND:TI (reg:DI))) */
8112 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8113 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8114 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8115 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8116 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8117 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8118 /* (const_int 64) */
8119 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8120 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8122 /* UMULH/SMULH. */
8123 if (speed)
8124 *cost += extra_cost->mult[mode == DImode].extend;
8125 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8126 mode, MULT, 0, speed);
8127 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8128 mode, MULT, 1, speed);
8129 return true;
8132 /* Fall through. */
8133 default:
8134 break;
8137 if (dump_file
8138 && flag_aarch64_verbose_cost)
8139 fprintf (dump_file,
8140 "\nFailed to cost RTX. Assuming default cost.\n");
8142 return true;
8145 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
8146 calculated for X. The cost is stored in *COST. Returns true
8147 if the total cost of X was calculated. */
8148 static bool
8149 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8150 int param, int *cost, bool speed)
8152 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8154 if (dump_file
8155 && flag_aarch64_verbose_cost)
8157 print_rtl_single (dump_file, x);
8158 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8159 speed ? "Hot" : "Cold",
8160 *cost, result ? "final" : "partial");
8163 return result;
8166 static int
8167 aarch64_register_move_cost (machine_mode mode,
8168 reg_class_t from_i, reg_class_t to_i)
8170 enum reg_class from = (enum reg_class) from_i;
8171 enum reg_class to = (enum reg_class) to_i;
8172 const struct cpu_regmove_cost *regmove_cost
8173 = aarch64_tune_params.regmove_cost;
8175 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8176 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8177 to = GENERAL_REGS;
8179 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8180 from = GENERAL_REGS;
8182 /* Moving between a GPR and the stack register costs the same as GP2GP. */
8183 if ((from == GENERAL_REGS && to == STACK_REG)
8184 || (to == GENERAL_REGS && from == STACK_REG))
8185 return regmove_cost->GP2GP;
8187 /* To/From the stack register, we move via the gprs. */
8188 if (to == STACK_REG || from == STACK_REG)
8189 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8190 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8192 if (GET_MODE_SIZE (mode) == 16)
8194 /* 128-bit operations on general registers require 2 instructions. */
8195 if (from == GENERAL_REGS && to == GENERAL_REGS)
8196 return regmove_cost->GP2GP * 2;
8197 else if (from == GENERAL_REGS)
8198 return regmove_cost->GP2FP * 2;
8199 else if (to == GENERAL_REGS)
8200 return regmove_cost->FP2GP * 2;
8202 /* When AdvSIMD instructions are disabled it is not possible to move
8203 a 128-bit value directly between Q registers. This is handled in
8204 secondary reload. A general register is used as a scratch to move
8205 the upper DI value and the lower DI value is moved directly,
8206 hence the cost is the sum of three moves. */
8207 if (! TARGET_SIMD)
8208 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8210 return regmove_cost->FP2FP;
8213 if (from == GENERAL_REGS && to == GENERAL_REGS)
8214 return regmove_cost->GP2GP;
8215 else if (from == GENERAL_REGS)
8216 return regmove_cost->GP2FP;
8217 else if (to == GENERAL_REGS)
8218 return regmove_cost->FP2GP;
8220 return regmove_cost->FP2FP;
8223 static int
8224 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8225 reg_class_t rclass ATTRIBUTE_UNUSED,
8226 bool in ATTRIBUTE_UNUSED)
8228 return aarch64_tune_params.memmov_cost;
8231 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8232 to optimize 1.0/sqrt. */
8234 static bool
8235 use_rsqrt_p (machine_mode mode)
8237 return (!flag_trapping_math
8238 && flag_unsafe_math_optimizations
8239 && ((aarch64_tune_params.approx_modes->recip_sqrt
8240 & AARCH64_APPROX_MODE (mode))
8241 || flag_mrecip_low_precision_sqrt));
8244 /* Function to decide when to use the approximate reciprocal square root
8245 builtin. */
8247 static tree
8248 aarch64_builtin_reciprocal (tree fndecl)
8250 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8252 if (!use_rsqrt_p (mode))
8253 return NULL_TREE;
8254 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8257 typedef rtx (*rsqrte_type) (rtx, rtx);
8259 /* Select reciprocal square root initial estimate insn depending on machine
8260 mode. */
8262 static rsqrte_type
8263 get_rsqrte_type (machine_mode mode)
8265 switch (mode)
8267 case E_DFmode: return gen_aarch64_rsqrtedf;
8268 case E_SFmode: return gen_aarch64_rsqrtesf;
8269 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8270 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8271 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8272 default: gcc_unreachable ();
8276 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8278 /* Select reciprocal square root series step insn depending on machine mode. */
8280 static rsqrts_type
8281 get_rsqrts_type (machine_mode mode)
8283 switch (mode)
8285 case E_DFmode: return gen_aarch64_rsqrtsdf;
8286 case E_SFmode: return gen_aarch64_rsqrtssf;
8287 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8288 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8289 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8290 default: gcc_unreachable ();
8294 /* Emit instruction sequence to compute either the approximate square root
8295 or its approximate reciprocal, depending on the flag RECP, and return
8296 whether the sequence was emitted or not. */
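/* A sketch of the maths behind the expansion below, assuming the usual
   Newton-Raphson formulation: FRSQRTE produces an initial estimate
   x0 ~= 1/sqrt(a), and each FRSQRTS step computes (3 - a*b)/2, so one
   refinement is x_{n+1} = x_n * (3 - a * x_n * x_n) / 2.  For the square
   root itself the final reciprocal estimate is additionally multiplied by
   a, since sqrt(a) = a * 1/sqrt(a).  */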
8298 bool
8299 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8301 machine_mode mode = GET_MODE (dst);
8303 if (GET_MODE_INNER (mode) == HFmode)
8305 gcc_assert (!recp);
8306 return false;
8309 if (!recp)
8311 if (!(flag_mlow_precision_sqrt
8312 || (aarch64_tune_params.approx_modes->sqrt
8313 & AARCH64_APPROX_MODE (mode))))
8314 return false;
8316 if (flag_finite_math_only
8317 || flag_trapping_math
8318 || !flag_unsafe_math_optimizations
8319 || optimize_function_for_size_p (cfun))
8320 return false;
8322 else
8323 /* Caller assumes we cannot fail. */
8324 gcc_assert (use_rsqrt_p (mode));
8326 machine_mode mmsk = mode_for_int_vector (mode).require ();
8327 rtx xmsk = gen_reg_rtx (mmsk);
8328 if (!recp)
8329 /* When calculating the approximate square root, compare the
8330 argument with 0.0 and create a mask. */
8331 emit_insn (gen_rtx_SET (xmsk,
8332 gen_rtx_NEG (mmsk,
8333 gen_rtx_EQ (mmsk, src,
8334 CONST0_RTX (mode)))));
8336 /* Estimate the approximate reciprocal square root. */
8337 rtx xdst = gen_reg_rtx (mode);
8338 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8340 /* Iterate over the series twice for SF and thrice for DF. */
8341 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8343 /* Optionally iterate over the series once less for faster performance
8344 while sacrificing some accuracy. */
8345 if ((recp && flag_mrecip_low_precision_sqrt)
8346 || (!recp && flag_mlow_precision_sqrt))
8347 iterations--;
8349 /* Iterate over the series to calculate the approximate reciprocal square
8350 root. */
8351 rtx x1 = gen_reg_rtx (mode);
8352 while (iterations--)
8354 rtx x2 = gen_reg_rtx (mode);
8355 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8357 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8359 if (iterations > 0)
8360 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8363 if (!recp)
8365 /* Qualify the approximate reciprocal square root when the argument is
8366 0.0 by squashing the intermediate result to 0.0. */
8367 rtx xtmp = gen_reg_rtx (mmsk);
8368 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8369 gen_rtx_SUBREG (mmsk, xdst, 0)));
8370 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8372 /* Calculate the approximate square root. */
8373 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8376 /* Finalize the approximation. */
8377 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8379 return true;
8382 typedef rtx (*recpe_type) (rtx, rtx);
8384 /* Select reciprocal initial estimate insn depending on machine mode. */
8386 static recpe_type
8387 get_recpe_type (machine_mode mode)
8389 switch (mode)
8391 case E_SFmode: return (gen_aarch64_frecpesf);
8392 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8393 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8394 case E_DFmode: return (gen_aarch64_frecpedf);
8395 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8396 default: gcc_unreachable ();
8400 typedef rtx (*recps_type) (rtx, rtx, rtx);
8402 /* Select reciprocal series step insn depending on machine mode. */
8404 static recps_type
8405 get_recps_type (machine_mode mode)
8407 switch (mode)
8409 case E_SFmode: return (gen_aarch64_frecpssf);
8410 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8411 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8412 case E_DFmode: return (gen_aarch64_frecpsdf);
8413 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8414 default: gcc_unreachable ();
8418 /* Emit the instruction sequence to compute the approximation for the division
8419 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
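/* A sketch of the maths behind the expansion below, again assuming the
   standard Newton-Raphson iteration for a reciprocal: FRECPE produces an
   initial estimate x0 ~= 1/d and each FRECPS step computes (2 - d*x), so
   one refinement is x_{n+1} = x_n * (2 - d * x_n).  The quotient is then
   approximated as num * (1/den).  */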
8421 bool
8422 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8424 machine_mode mode = GET_MODE (quo);
8426 if (GET_MODE_INNER (mode) == HFmode)
8427 return false;
8429 bool use_approx_division_p = (flag_mlow_precision_div
8430 || (aarch64_tune_params.approx_modes->division
8431 & AARCH64_APPROX_MODE (mode)));
8433 if (!flag_finite_math_only
8434 || flag_trapping_math
8435 || !flag_unsafe_math_optimizations
8436 || optimize_function_for_size_p (cfun)
8437 || !use_approx_division_p)
8438 return false;
8440 /* Estimate the approximate reciprocal. */
8441 rtx xrcp = gen_reg_rtx (mode);
8442 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8444 /* Iterate over the series twice for SF and thrice for DF. */
8445 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8447 /* Optionally iterate over the series once less for faster performance,
8448 while sacrificing some accuracy. */
8449 if (flag_mlow_precision_div)
8450 iterations--;
8452 /* Iterate over the series to calculate the approximate reciprocal. */
8453 rtx xtmp = gen_reg_rtx (mode);
8454 while (iterations--)
8456 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8458 if (iterations > 0)
8459 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8462 if (num != CONST1_RTX (mode))
8464 /* As the approximate reciprocal of DEN is already calculated, only
8465 calculate the approximate division when NUM is not 1.0. */
8466 rtx xnum = force_reg (mode, num);
8467 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8470 /* Finalize the approximation. */
8471 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8472 return true;
8475 /* Return the number of instructions that can be issued per cycle. */
8476 static int
8477 aarch64_sched_issue_rate (void)
8479 return aarch64_tune_params.issue_rate;
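/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD.  Use the
   issue rate as the lookahead depth when more than one instruction can
   issue per cycle and the scheduling fusion pass is not active; otherwise
   return 0.  */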
8482 static int
8483 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8485 int issue_rate = aarch64_sched_issue_rate ();
8487 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8491 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8492 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8493 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8495 static int
8496 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8497 int ready_index)
8499 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8503 /* Vectorizer cost model target hooks. */
8505 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8506 static int
8507 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8508 tree vectype,
8509 int misalign ATTRIBUTE_UNUSED)
8511 unsigned elements;
8512 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8513 bool fp = false;
8515 if (vectype != NULL)
8516 fp = FLOAT_TYPE_P (vectype);
8518 switch (type_of_cost)
8520 case scalar_stmt:
8521 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8523 case scalar_load:
8524 return costs->scalar_load_cost;
8526 case scalar_store:
8527 return costs->scalar_store_cost;
8529 case vector_stmt:
8530 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8532 case vector_load:
8533 return costs->vec_align_load_cost;
8535 case vector_store:
8536 return costs->vec_store_cost;
8538 case vec_to_scalar:
8539 return costs->vec_to_scalar_cost;
8541 case scalar_to_vec:
8542 return costs->scalar_to_vec_cost;
8544 case unaligned_load:
8545 return costs->vec_unalign_load_cost;
8547 case unaligned_store:
8548 return costs->vec_unalign_store_cost;
8550 case cond_branch_taken:
8551 return costs->cond_taken_branch_cost;
8553 case cond_branch_not_taken:
8554 return costs->cond_not_taken_branch_cost;
8556 case vec_perm:
8557 return costs->vec_permute_cost;
8559 case vec_promote_demote:
8560 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8562 case vec_construct:
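/* There is no dedicated entry in the cost tables for constructing a
   vector from scalar elements, so approximate it directly from the
   element count: roughly one operation per pair of elements, plus one.  */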
8563 elements = TYPE_VECTOR_SUBPARTS (vectype);
8564 return elements / 2 + 1;
8566 default:
8567 gcc_unreachable ();
8571 /* Implement targetm.vectorize.add_stmt_cost. */
8572 static unsigned
8573 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8574 struct _stmt_vec_info *stmt_info, int misalign,
8575 enum vect_cost_model_location where)
8577 unsigned *cost = (unsigned *) data;
8578 unsigned retval = 0;
8580 if (flag_vect_cost_model)
8582 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8583 int stmt_cost =
8584 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8586 /* Statements in an inner loop relative to the loop being
8587 vectorized are weighted more heavily. The value here is
8588 arbitrary and could potentially be improved with analysis. */
8589 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8590 count *= 50; /* FIXME */
8592 retval = (unsigned) (count * stmt_cost);
8593 cost[where] += retval;
8596 return retval;
8599 static void initialize_aarch64_code_model (struct gcc_options *);
8601 /* Parse the TO_PARSE string and put the architecture struct that it
8602 selects into RES and the architectural features into ISA_FLAGS.
8603 Return an aarch64_parse_opt_result describing the parse result.
8604 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
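/* For example (purely illustrative), parsing "armv8-a+crc" would select
   the "armv8-a" entry from all_architectures and then let
   aarch64_parse_extension turn on the CRC feature on top of that
   architecture's baseline flags.  */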
8606 static enum aarch64_parse_opt_result
8607 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8608 unsigned long *isa_flags)
8610 char *ext;
8611 const struct processor *arch;
8612 char *str = (char *) alloca (strlen (to_parse) + 1);
8613 size_t len;
8615 strcpy (str, to_parse);
8617 ext = strchr (str, '+');
8619 if (ext != NULL)
8620 len = ext - str;
8621 else
8622 len = strlen (str);
8624 if (len == 0)
8625 return AARCH64_PARSE_MISSING_ARG;
8628 /* Loop through the list of supported ARCHes to find a match. */
8629 for (arch = all_architectures; arch->name != NULL; arch++)
8631 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8633 unsigned long isa_temp = arch->flags;
8635 if (ext != NULL)
8637 /* TO_PARSE string contains at least one extension. */
8638 enum aarch64_parse_opt_result ext_res
8639 = aarch64_parse_extension (ext, &isa_temp);
8641 if (ext_res != AARCH64_PARSE_OK)
8642 return ext_res;
8644 /* Extension parsing was successful. Record the resulting
8645 arch and ISA flags. */
8646 *res = arch;
8647 *isa_flags = isa_temp;
8648 return AARCH64_PARSE_OK;
8652 /* ARCH name not found in list. */
8653 return AARCH64_PARSE_INVALID_ARG;
8656 /* Parse the TO_PARSE string and put the CPU that it selects into RES and
8657 the architectural features into ISA_FLAGS. Return an aarch64_parse_opt_result
8658 describing the parse result. If there is an error parsing, RES and
8659 ISA_FLAGS are left unchanged. */
8661 static enum aarch64_parse_opt_result
8662 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8663 unsigned long *isa_flags)
8665 char *ext;
8666 const struct processor *cpu;
8667 char *str = (char *) alloca (strlen (to_parse) + 1);
8668 size_t len;
8670 strcpy (str, to_parse);
8672 ext = strchr (str, '+');
8674 if (ext != NULL)
8675 len = ext - str;
8676 else
8677 len = strlen (str);
8679 if (len == 0)
8680 return AARCH64_PARSE_MISSING_ARG;
8683 /* Loop through the list of supported CPUs to find a match. */
8684 for (cpu = all_cores; cpu->name != NULL; cpu++)
8686 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8688 unsigned long isa_temp = cpu->flags;
8691 if (ext != NULL)
8693 /* TO_PARSE string contains at least one extension. */
8694 enum aarch64_parse_opt_result ext_res
8695 = aarch64_parse_extension (ext, &isa_temp);
8697 if (ext_res != AARCH64_PARSE_OK)
8698 return ext_res;
8700 /* Extension parsing was successful. Record the resulting
8701 cpu and ISA flags. */
8702 *res = cpu;
8703 *isa_flags = isa_temp;
8704 return AARCH64_PARSE_OK;
8708 /* CPU name not found in list. */
8709 return AARCH64_PARSE_INVALID_ARG;
8712 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8713 Return an aarch64_parse_opt_result describing the parse result.
8714 If the parsing fails, RES does not change. */
8716 static enum aarch64_parse_opt_result
8717 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8719 const struct processor *cpu;
8720 char *str = (char *) alloca (strlen (to_parse) + 1);
8722 strcpy (str, to_parse);
8724 /* Loop through the list of supported CPUs to find a match. */
8725 for (cpu = all_cores; cpu->name != NULL; cpu++)
8727 if (strcmp (cpu->name, str) == 0)
8729 *res = cpu;
8730 return AARCH64_PARSE_OK;
8734 /* CPU name not found in list. */
8735 return AARCH64_PARSE_INVALID_ARG;
8738 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8739 described in FLAG. If it is, return the index bit for that fusion type.
8740 If not, error (printing OPTION_NAME) and return zero. */
8742 static unsigned int
8743 aarch64_parse_one_option_token (const char *token,
8744 size_t length,
8745 const struct aarch64_flag_desc *flag,
8746 const char *option_name)
8748 for (; flag->name != NULL; flag++)
8750 if (length == strlen (flag->name)
8751 && !strncmp (flag->name, token, length))
8752 return flag->flag;
8755 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8756 return 0;
8759 /* Parse OPTION, which is a dot-separated list of flags to enable.
8760 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8761 default state we inherit from the CPU tuning structures. OPTION_NAME
8762 gives the top-level option we are parsing in the -moverride string,
8763 for use in error messages. */
8765 static unsigned int
8766 aarch64_parse_boolean_options (const char *option,
8767 const struct aarch64_flag_desc *flags,
8768 unsigned int initial_state,
8769 const char *option_name)
8771 const char separator = '.';
8772 const char* specs = option;
8773 const char* ntoken = option;
8774 unsigned int found_flags = initial_state;
8776 while ((ntoken = strchr (specs, separator)))
8778 size_t token_length = ntoken - specs;
8779 unsigned token_ops = aarch64_parse_one_option_token (specs,
8780 token_length,
8781 flags,
8782 option_name);
8783 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8784 in the token stream, reset the supported operations. So:
8786 adrp+add.cmp+branch.none.adrp+add
8788 would have the result of turning on only adrp+add fusion. */
8789 if (!token_ops)
8790 found_flags = 0;
8792 found_flags |= token_ops;
8793 specs = ++ntoken;
8796 /* We ended with a trailing separator; report an error. */
8797 if (!(*specs))
8799 error ("%s string ill-formed\n", option_name);
8800 return 0;
8803 /* We still have one more token to parse. */
8804 size_t token_length = strlen (specs);
8805 unsigned token_ops = aarch64_parse_one_option_token (specs,
8806 token_length,
8807 flags,
8808 option_name);
8809 if (!token_ops)
8810 found_flags = 0;
8812 found_flags |= token_ops;
8813 return found_flags;
8816 /* Support for overriding instruction fusion. */
8818 static void
8819 aarch64_parse_fuse_string (const char *fuse_string,
8820 struct tune_params *tune)
8822 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8823 aarch64_fusible_pairs,
8824 tune->fusible_ops,
8825 "fuse=");
8828 /* Support for overriding other tuning flags. */
8830 static void
8831 aarch64_parse_tune_string (const char *tune_string,
8832 struct tune_params *tune)
8834 tune->extra_tuning_flags
8835 = aarch64_parse_boolean_options (tune_string,
8836 aarch64_tuning_flags,
8837 tune->extra_tuning_flags,
8838 "tune=");
8841 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8842 we understand. If it is, extract the option string and hand it off to
8843 the appropriate function. */
8845 void
8846 aarch64_parse_one_override_token (const char* token,
8847 size_t length,
8848 struct tune_params *tune)
8850 const struct aarch64_tuning_override_function *fn
8851 = aarch64_tuning_override_functions;
8853 const char *option_part = strchr (token, '=');
8854 if (!option_part)
8856 error ("tuning string missing in option (%s)", token);
8857 return;
8860 /* Get the length of the option name. */
8861 length = option_part - token;
8862 /* Skip the '=' to get to the option string. */
8863 option_part++;
8865 for (; fn->name != NULL; fn++)
8867 if (!strncmp (fn->name, token, length))
8869 fn->parse_override (option_part, tune);
8870 return;
8874 error ("unknown tuning option (%s)", token);
8875 return;
8878 /* Set the default TLS size if needed and clamp it to what the code model supports. */
8880 static void
8881 initialize_aarch64_tls_size (struct gcc_options *opts)
8883 if (aarch64_tls_size == 0)
8884 aarch64_tls_size = 24;
8886 switch (opts->x_aarch64_cmodel_var)
8888 case AARCH64_CMODEL_TINY:
8889 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8890 needs two instructions to address, so we clamp the size to 24 bits. */
8891 if (aarch64_tls_size > 24)
8892 aarch64_tls_size = 24;
8893 break;
8894 case AARCH64_CMODEL_SMALL:
8895 /* The maximum TLS size allowed under small is 4G. */
8896 if (aarch64_tls_size > 32)
8897 aarch64_tls_size = 32;
8898 break;
8899 case AARCH64_CMODEL_LARGE:
8900 /* The maximum TLS size allowed under large is 16E.
8901 FIXME: 16E requires a 64-bit offset, but we only support a 48-bit offset for now. */
8902 if (aarch64_tls_size > 48)
8903 aarch64_tls_size = 48;
8904 break;
8905 default:
8906 gcc_unreachable ();
8909 return;
8912 /* Parse STRING looking for options in the format:
8913 string :: option:string
8914 option :: name=substring
8915 name :: {a-z}
8916 substring :: defined by option. */
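/* For example (purely illustrative), an -moverride string such as
   "fuse=adrp+add.cmp+branch" is handled by splitting at each ':' into
   NAME=SUBSTRING options and dispatching each one to the matching handler
   in aarch64_tuning_override_functions (here the "fuse=" handler).  */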
8918 static void
8919 aarch64_parse_override_string (const char* input_string,
8920 struct tune_params* tune)
8922 const char separator = ':';
8923 size_t string_length = strlen (input_string) + 1;
8924 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8925 char *string = string_root;
8926 strncpy (string, input_string, string_length);
8927 string[string_length - 1] = '\0';
8929 char* ntoken = string;
8931 while ((ntoken = strchr (string, separator)))
8933 size_t token_length = ntoken - string;
8934 /* Make this substring look like a string. */
8935 *ntoken = '\0';
8936 aarch64_parse_one_override_token (string, token_length, tune);
8937 string = ++ntoken;
8940 /* One last option to parse. */
8941 aarch64_parse_one_override_token (string, strlen (string), tune);
8942 free (string_root);
8946 static void
8947 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8949 /* The logic here is that if we are disabling all frame pointer generation
8950 then we do not need to disable leaf frame pointer generation as a
8951 separate operation. But if we are *only* disabling leaf frame pointer
8952 generation then we set flag_omit_frame_pointer to true, but in
8953 aarch64_frame_pointer_required we return false only for leaf functions.
8955 PR 70044: We have to be careful about being called multiple times for the
8956 same function. Once we have decided to set flag_omit_frame_pointer just
8957 so that we can omit leaf frame pointers, we must then not interpret a
8958 second call as meaning that all frame pointer generation should be
8959 omitted. We do this by setting flag_omit_frame_pointer to a special,
8960 non-zero value. */
8961 if (opts->x_flag_omit_frame_pointer == 2)
8962 opts->x_flag_omit_frame_pointer = 0;
8964 if (opts->x_flag_omit_frame_pointer)
8965 opts->x_flag_omit_leaf_frame_pointer = false;
8966 else if (opts->x_flag_omit_leaf_frame_pointer)
8967 opts->x_flag_omit_frame_pointer = 2;
8969 /* If not optimizing for size, set the default
8970 alignment to what the target wants. */
8971 if (!opts->x_optimize_size)
8973 if (opts->x_align_loops <= 0)
8974 opts->x_align_loops = aarch64_tune_params.loop_align;
8975 if (opts->x_align_jumps <= 0)
8976 opts->x_align_jumps = aarch64_tune_params.jump_align;
8977 if (opts->x_align_functions <= 0)
8978 opts->x_align_functions = aarch64_tune_params.function_align;
8981 /* We default to no pc-relative literal loads. */
8983 aarch64_pcrelative_literal_loads = false;
8985 /* If -mpc-relative-literal-loads is set on the command line, this
8986 implies that the user asked for PC relative literal loads. */
8987 if (opts->x_pcrelative_literal_loads == 1)
8988 aarch64_pcrelative_literal_loads = true;
8990 /* This is PR70113. When building the Linux kernel with
8991 CONFIG_ARM64_ERRATUM_843419, support for relocations
8992 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8993 removed from the kernel to avoid loading objects with possibly
8994 offending sequences. Without -mpc-relative-literal-loads we would
8995 generate such relocations, preventing the kernel build from
8996 succeeding. */
8997 if (opts->x_pcrelative_literal_loads == 2
8998 && TARGET_FIX_ERR_A53_843419)
8999 aarch64_pcrelative_literal_loads = true;
9001 /* In the tiny memory model it makes no sense to disallow PC relative
9002 literal pool loads. */
9003 if (aarch64_cmodel == AARCH64_CMODEL_TINY
9004 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9005 aarch64_pcrelative_literal_loads = true;
9007 /* When enabling the lower precision Newton series for the square root, also
9008 enable it for the reciprocal square root, since the latter is an
9009 intermediary step for the former. */
9010 if (flag_mlow_precision_sqrt)
9011 flag_mrecip_low_precision_sqrt = true;
9014 /* 'Unpack' the internal tuning structs and update the options
9015 in OPTS. The caller must have set up selected_tune and selected_arch
9016 as all the other target-specific codegen decisions are
9017 derived from them. */
9019 void
9020 aarch64_override_options_internal (struct gcc_options *opts)
9022 aarch64_tune_flags = selected_tune->flags;
9023 aarch64_tune = selected_tune->sched_core;
9024 /* Make a copy of the tuning parameters attached to the core, which
9025 we may later overwrite. */
9026 aarch64_tune_params = *(selected_tune->tune);
9027 aarch64_architecture_version = selected_arch->architecture_version;
9029 if (opts->x_aarch64_override_tune_string)
9030 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9031 &aarch64_tune_params);
9033 /* This target defaults to strict volatile bitfields. */
9034 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9035 opts->x_flag_strict_volatile_bitfields = 1;
9037 initialize_aarch64_code_model (opts);
9038 initialize_aarch64_tls_size (opts);
9040 int queue_depth = 0;
9041 switch (aarch64_tune_params.autoprefetcher_model)
9043 case tune_params::AUTOPREFETCHER_OFF:
9044 queue_depth = -1;
9045 break;
9046 case tune_params::AUTOPREFETCHER_WEAK:
9047 queue_depth = 0;
9048 break;
9049 case tune_params::AUTOPREFETCHER_STRONG:
9050 queue_depth = max_insn_queue_index + 1;
9051 break;
9052 default:
9053 gcc_unreachable ();
9056 /* We don't mind passing in global_options_set here as we don't use
9057 the *options_set structs anyway. */
9058 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9059 queue_depth,
9060 opts->x_param_values,
9061 global_options_set.x_param_values);
9063 /* Set up parameters to be used in prefetching algorithm. Do not
9064 override the defaults unless we are tuning for a core we have
9065 researched values for. */
9066 if (aarch64_tune_params.prefetch->num_slots > 0)
9067 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9068 aarch64_tune_params.prefetch->num_slots,
9069 opts->x_param_values,
9070 global_options_set.x_param_values);
9071 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9072 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9073 aarch64_tune_params.prefetch->l1_cache_size,
9074 opts->x_param_values,
9075 global_options_set.x_param_values);
9076 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9077 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9078 aarch64_tune_params.prefetch->l1_cache_line_size,
9079 opts->x_param_values,
9080 global_options_set.x_param_values);
9081 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9082 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9083 aarch64_tune_params.prefetch->l2_cache_size,
9084 opts->x_param_values,
9085 global_options_set.x_param_values);
9087 /* Enable software prefetching at the specified optimization level for
9088 CPUs that have prefetch tuning parameters. Lower the optimization level
9089 threshold by 1 when profiling is enabled. */
9090 if (opts->x_flag_prefetch_loop_arrays < 0
9091 && !opts->x_optimize_size
9092 && aarch64_tune_params.prefetch->default_opt_level >= 0
9093 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9094 opts->x_flag_prefetch_loop_arrays = 1;
9096 aarch64_override_options_after_change_1 (opts);
9099 /* Print a hint with a suggestion for a core or architecture name that
9100 most closely resembles what the user passed in STR. ARCH is true if
9101 the user is asking for an architecture name. ARCH is false if the user
9102 is asking for a core name. */
9104 static void
9105 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9107 auto_vec<const char *> candidates;
9108 const struct processor *entry = arch ? all_architectures : all_cores;
9109 for (; entry->name != NULL; entry++)
9110 candidates.safe_push (entry->name);
9111 char *s;
9112 const char *hint = candidates_list_and_hint (str, s, candidates);
9113 if (hint)
9114 inform (input_location, "valid arguments are: %s;"
9115 " did you mean %qs?", s, hint);
9116 XDELETEVEC (s);
9119 /* Print a hint with a suggestion for a core name that most closely resembles
9120 what the user passed in STR. */
9122 inline static void
9123 aarch64_print_hint_for_core (const char *str)
9125 aarch64_print_hint_for_core_or_arch (str, false);
9128 /* Print a hint with a suggestion for an architecture name that most closely
9129 resembles what the user passed in STR. */
9131 inline static void
9132 aarch64_print_hint_for_arch (const char *str)
9134 aarch64_print_hint_for_core_or_arch (str, true);
9137 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9138 specified in STR and throw errors if appropriate. Put the results, if
9139 they are valid, in RES and ISA_FLAGS. Return whether the option is
9140 valid. */
9142 static bool
9143 aarch64_validate_mcpu (const char *str, const struct processor **res,
9144 unsigned long *isa_flags)
9146 enum aarch64_parse_opt_result parse_res
9147 = aarch64_parse_cpu (str, res, isa_flags);
9149 if (parse_res == AARCH64_PARSE_OK)
9150 return true;
9152 switch (parse_res)
9154 case AARCH64_PARSE_MISSING_ARG:
9155 error ("missing cpu name in %<-mcpu=%s%>", str);
9156 break;
9157 case AARCH64_PARSE_INVALID_ARG:
9158 error ("unknown value %qs for -mcpu", str);
9159 aarch64_print_hint_for_core (str);
9160 break;
9161 case AARCH64_PARSE_INVALID_FEATURE:
9162 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9163 break;
9164 default:
9165 gcc_unreachable ();
9168 return false;
9171 /* Validate a command-line -march option. Parse the arch and extensions
9172 (if any) specified in STR and throw errors if appropriate. Put the
9173 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9174 option is valid. */
9176 static bool
9177 aarch64_validate_march (const char *str, const struct processor **res,
9178 unsigned long *isa_flags)
9180 enum aarch64_parse_opt_result parse_res
9181 = aarch64_parse_arch (str, res, isa_flags);
9183 if (parse_res == AARCH64_PARSE_OK)
9184 return true;
9186 switch (parse_res)
9188 case AARCH64_PARSE_MISSING_ARG:
9189 error ("missing arch name in %<-march=%s%>", str);
9190 break;
9191 case AARCH64_PARSE_INVALID_ARG:
9192 error ("unknown value %qs for -march", str);
9193 aarch64_print_hint_for_arch (str);
9194 break;
9195 case AARCH64_PARSE_INVALID_FEATURE:
9196 error ("invalid feature modifier in %<-march=%s%>", str);
9197 break;
9198 default:
9199 gcc_unreachable ();
9202 return false;
9205 /* Validate a command-line -mtune option. Parse the cpu
9206 specified in STR and throw errors if appropriate. Put the
9207 result, if it is valid, in RES. Return whether the option is
9208 valid. */
9210 static bool
9211 aarch64_validate_mtune (const char *str, const struct processor **res)
9213 enum aarch64_parse_opt_result parse_res
9214 = aarch64_parse_tune (str, res);
9216 if (parse_res == AARCH64_PARSE_OK)
9217 return true;
9219 switch (parse_res)
9221 case AARCH64_PARSE_MISSING_ARG:
9222 error ("missing cpu name in %<-mtune=%s%>", str);
9223 break;
9224 case AARCH64_PARSE_INVALID_ARG:
9225 error ("unknown value %qs for -mtune", str);
9226 aarch64_print_hint_for_core (str);
9227 break;
9228 default:
9229 gcc_unreachable ();
9231 return false;
9234 /* Return the CPU corresponding to the enum CPU.
9235 If it doesn't specify a cpu, return the default. */
9237 static const struct processor *
9238 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9240 if (cpu != aarch64_none)
9241 return &all_cores[cpu];
9243 /* The & 0x3f is to extract the bottom 6 bits that encode the
9244 default cpu as selected by the --with-cpu GCC configure option
9245 in config.gcc.
9246 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9247 flags mechanism should be reworked to make it more sane. */
9248 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9251 /* Return the architecture corresponding to the enum ARCH.
9252 If it doesn't specify a valid architecture, return the default. */
9254 static const struct processor *
9255 aarch64_get_arch (enum aarch64_arch arch)
9257 if (arch != aarch64_no_arch)
9258 return &all_architectures[arch];
9260 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9262 return &all_architectures[cpu->arch];
9265 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
9266 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
9267 tuning structs. In particular it must set selected_tune and
9268 aarch64_isa_flags that define the available ISA features and tuning
9269 decisions. It must also set selected_arch as this will be used to
9270 output the .arch asm tags for each function. */
9272 static void
9273 aarch64_override_options (void)
9275 unsigned long cpu_isa = 0;
9276 unsigned long arch_isa = 0;
9277 aarch64_isa_flags = 0;
9279 bool valid_cpu = true;
9280 bool valid_tune = true;
9281 bool valid_arch = true;
9283 selected_cpu = NULL;
9284 selected_arch = NULL;
9285 selected_tune = NULL;
9287 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9288 If either of -march or -mtune is given, they override their
9289 respective component of -mcpu. */
9290 if (aarch64_cpu_string)
9291 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9292 &cpu_isa);
9294 if (aarch64_arch_string)
9295 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9296 &arch_isa);
9298 if (aarch64_tune_string)
9299 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9301 /* If the user did not specify a processor, choose the default
9302 one for them. This will be the CPU set during configuration using
9303 --with-cpu, otherwise it is "generic". */
9304 if (!selected_cpu)
9306 if (selected_arch)
9308 selected_cpu = &all_cores[selected_arch->ident];
9309 aarch64_isa_flags = arch_isa;
9310 explicit_arch = selected_arch->arch;
9312 else
9314 /* Get default configure-time CPU. */
9315 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
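/* The configure-time default also encodes its ISA flags in the bits above
   the bottom 6 of TARGET_CPU_DEFAULT (the bottom 6 hold the CPU index;
   see aarch64_get_tune_cpu).  */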
9316 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9319 if (selected_tune)
9320 explicit_tune_core = selected_tune->ident;
9322 /* If both -mcpu and -march are specified, check that they are architecturally
9323 compatible, warn if they're not, and prefer the -march ISA flags. */
9324 else if (selected_arch)
9326 if (selected_arch->arch != selected_cpu->arch)
9328 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9329 all_architectures[selected_cpu->arch].name,
9330 selected_arch->name);
9332 aarch64_isa_flags = arch_isa;
9333 explicit_arch = selected_arch->arch;
9334 explicit_tune_core = selected_tune ? selected_tune->ident
9335 : selected_cpu->ident;
9337 else
9339 /* -mcpu but no -march. */
9340 aarch64_isa_flags = cpu_isa;
9341 explicit_tune_core = selected_tune ? selected_tune->ident
9342 : selected_cpu->ident;
9343 gcc_assert (selected_cpu);
9344 selected_arch = &all_architectures[selected_cpu->arch];
9345 explicit_arch = selected_arch->arch;
9348 /* Set the arch as well, since we will need it when outputting
9349 the .arch directive in assembly. */
9350 if (!selected_arch)
9352 gcc_assert (selected_cpu);
9353 selected_arch = &all_architectures[selected_cpu->arch];
9356 if (!selected_tune)
9357 selected_tune = selected_cpu;
9359 #ifndef HAVE_AS_MABI_OPTION
9360 /* The compiler may have been configured with 2.23.* binutils, which does
9361 not have support for ILP32. */
9362 if (TARGET_ILP32)
9363 error ("Assembler does not support -mabi=ilp32");
9364 #endif
9366 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9367 sorry ("Return address signing is only supported for -mabi=lp64");
9369 /* Make sure we properly set up the explicit options. */
9370 if ((aarch64_cpu_string && valid_cpu)
9371 || (aarch64_tune_string && valid_tune))
9372 gcc_assert (explicit_tune_core != aarch64_none);
9374 if ((aarch64_cpu_string && valid_cpu)
9375 || (aarch64_arch_string && valid_arch))
9376 gcc_assert (explicit_arch != aarch64_no_arch);
9378 aarch64_override_options_internal (&global_options);
9380 /* Save these options as the default ones in case we push and pop them later
9381 while processing functions with potential target attributes. */
9382 target_option_default_node = target_option_current_node
9383 = build_target_option_node (&global_options);
9386 /* Implement targetm.override_options_after_change. */
9388 static void
9389 aarch64_override_options_after_change (void)
9391 aarch64_override_options_after_change_1 (&global_options);
9394 static struct machine_function *
9395 aarch64_init_machine_status (void)
9397 struct machine_function *machine;
9398 machine = ggc_cleared_alloc<machine_function> ();
9399 return machine;
9402 void
9403 aarch64_init_expanders (void)
9405 init_machine_status = aarch64_init_machine_status;
9408 /* Resolve the code model, switching to the PIC variants under -fpic/-fPIC. */
9409 static void
9410 initialize_aarch64_code_model (struct gcc_options *opts)
9412 if (opts->x_flag_pic)
9414 switch (opts->x_aarch64_cmodel_var)
9416 case AARCH64_CMODEL_TINY:
9417 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9418 break;
9419 case AARCH64_CMODEL_SMALL:
9420 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9421 aarch64_cmodel = (flag_pic == 2
9422 ? AARCH64_CMODEL_SMALL_PIC
9423 : AARCH64_CMODEL_SMALL_SPIC);
9424 #else
9425 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9426 #endif
9427 break;
9428 case AARCH64_CMODEL_LARGE:
9429 sorry ("code model %qs with -f%s", "large",
9430 opts->x_flag_pic > 1 ? "PIC" : "pic");
9431 break;
9432 default:
9433 gcc_unreachable ();
9436 else
9437 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9440 /* Implement TARGET_OPTION_SAVE. */
9442 static void
9443 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9445 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9448 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9449 using the information saved in PTR. */
9451 static void
9452 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9454 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9455 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9456 opts->x_explicit_arch = ptr->x_explicit_arch;
9457 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9458 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9460 aarch64_override_options_internal (opts);
9463 /* Implement TARGET_OPTION_PRINT. */
9465 static void
9466 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9468 const struct processor *cpu
9469 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9470 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9471 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9472 std::string extension
9473 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9475 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9476 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9477 arch->name, extension.c_str ());
9480 static GTY(()) tree aarch64_previous_fndecl;
9482 void
9483 aarch64_reset_previous_fndecl (void)
9485 aarch64_previous_fndecl = NULL;
9488 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9489 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9490 make sure optab availability predicates are recomputed when necessary. */
9492 void
9493 aarch64_save_restore_target_globals (tree new_tree)
9495 if (TREE_TARGET_GLOBALS (new_tree))
9496 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9497 else if (new_tree == target_option_default_node)
9498 restore_target_globals (&default_target_globals);
9499 else
9500 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9503 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9504 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9505 of the function, if such exists. This function may be called multiple
9506 times on a single function so use aarch64_previous_fndecl to avoid
9507 setting up identical state. */
9509 static void
9510 aarch64_set_current_function (tree fndecl)
9512 if (!fndecl || fndecl == aarch64_previous_fndecl)
9513 return;
9515 tree old_tree = (aarch64_previous_fndecl
9516 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9517 : NULL_TREE);
9519 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9521 /* If current function has no attributes but the previous one did,
9522 use the default node. */
9523 if (!new_tree && old_tree)
9524 new_tree = target_option_default_node;
9526 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9527 the default have been handled by aarch64_save_restore_target_globals from
9528 aarch64_pragma_target_parse. */
9529 if (old_tree == new_tree)
9530 return;
9532 aarch64_previous_fndecl = fndecl;
9534 /* First set the target options. */
9535 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9537 aarch64_save_restore_target_globals (new_tree);
9540 /* Enum describing the various ways we can handle attributes.
9541 In many cases we can reuse the generic option handling machinery. */
9543 enum aarch64_attr_opt_type
9545 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9546 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9547 aarch64_attr_enum, /* Attribute sets an enum variable. */
9548 aarch64_attr_custom /* Attribute requires a custom handling function. */
9551 /* All the information needed to handle a target attribute.
9552 NAME is the name of the attribute.
9553 ATTR_TYPE specifies the type of behavior of the attribute as described
9554 in the definition of enum aarch64_attr_opt_type.
9555 ALLOW_NEG is true if the attribute supports a "no-" form.
9556 HANDLER is the function that takes the attribute string and whether
9557 it is a pragma or attribute and handles the option. It is needed only
9558 when the ATTR_TYPE is aarch64_attr_custom.
9559 OPT_NUM is the enum specifying the option that the attribute modifies.
9560 This is needed for attributes that mirror the behavior of a command-line
9561 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9562 aarch64_attr_enum. */
9564 struct aarch64_attribute_info
9566 const char *name;
9567 enum aarch64_attr_opt_type attr_type;
9568 bool allow_neg;
9569 bool (*handler) (const char *, const char *);
9570 enum opt_code opt_num;
9573 /* Handle the ARCH_STR argument to the arch= target attribute.
9574 PRAGMA_OR_ATTR is used in potential error messages. */
9576 static bool
9577 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9579 const struct processor *tmp_arch = NULL;
9580 enum aarch64_parse_opt_result parse_res
9581 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9583 if (parse_res == AARCH64_PARSE_OK)
9585 gcc_assert (tmp_arch);
9586 selected_arch = tmp_arch;
9587 explicit_arch = selected_arch->arch;
9588 return true;
9591 switch (parse_res)
9593 case AARCH64_PARSE_MISSING_ARG:
9594 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9595 break;
9596 case AARCH64_PARSE_INVALID_ARG:
9597 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9598 aarch64_print_hint_for_arch (str);
9599 break;
9600 case AARCH64_PARSE_INVALID_FEATURE:
9601 error ("invalid feature modifier %qs for 'arch' target %s",
9602 str, pragma_or_attr);
9603 break;
9604 default:
9605 gcc_unreachable ();
9608 return false;
9611 /* Handle the argument CPU_STR to the cpu= target attribute.
9612 PRAGMA_OR_ATTR is used in potential error messages. */
9614 static bool
9615 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9617 const struct processor *tmp_cpu = NULL;
9618 enum aarch64_parse_opt_result parse_res
9619 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9621 if (parse_res == AARCH64_PARSE_OK)
9623 gcc_assert (tmp_cpu);
9624 selected_tune = tmp_cpu;
9625 explicit_tune_core = selected_tune->ident;
9627 selected_arch = &all_architectures[tmp_cpu->arch];
9628 explicit_arch = selected_arch->arch;
9629 return true;
9632 switch (parse_res)
9634 case AARCH64_PARSE_MISSING_ARG:
9635 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9636 break;
9637 case AARCH64_PARSE_INVALID_ARG:
9638 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9639 aarch64_print_hint_for_core (str);
9640 break;
9641 case AARCH64_PARSE_INVALID_FEATURE:
9642 error ("invalid feature modifier %qs for 'cpu' target %s",
9643 str, pragma_or_attr);
9644 break;
9645 default:
9646 gcc_unreachable ();
9649 return false;
9652 /* Handle the argument STR to the tune= target attribute.
9653 PRAGMA_OR_ATTR is used in potential error messages. */
9655 static bool
9656 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9658 const struct processor *tmp_tune = NULL;
9659 enum aarch64_parse_opt_result parse_res
9660 = aarch64_parse_tune (str, &tmp_tune);
9662 if (parse_res == AARCH64_PARSE_OK)
9664 gcc_assert (tmp_tune);
9665 selected_tune = tmp_tune;
9666 explicit_tune_core = selected_tune->ident;
9667 return true;
9670 switch (parse_res)
9672 case AARCH64_PARSE_INVALID_ARG:
9673 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9674 aarch64_print_hint_for_core (str);
9675 break;
9676 default:
9677 gcc_unreachable ();
9680 return false;
9683 /* Parse an architecture extensions target attribute string specified in STR.
9684 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9685 if successful. Update aarch64_isa_flags to reflect the ISA features
9686 modified.
9687 PRAGMA_OR_ATTR is used in potential error messages. */
9689 static bool
9690 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9692 enum aarch64_parse_opt_result parse_res;
9693 unsigned long isa_flags = aarch64_isa_flags;
9695 /* We allow "+nothing" at the beginning to clear out all architectural
9696 features if the user wants to handpick specific features. */
9697 if (strncmp ("+nothing", str, 8) == 0)
9699 isa_flags = 0;
9700 str += 8;
9703 parse_res = aarch64_parse_extension (str, &isa_flags);
9705 if (parse_res == AARCH64_PARSE_OK)
9707 aarch64_isa_flags = isa_flags;
9708 return true;
9711 switch (parse_res)
9713 case AARCH64_PARSE_MISSING_ARG:
9714 error ("missing feature modifier in target %s %qs",
9715 pragma_or_attr, str);
9716 break;
9718 case AARCH64_PARSE_INVALID_FEATURE:
9719 error ("invalid feature modifier in target %s %qs",
9720 pragma_or_attr, str);
9721 break;
9723 default:
9724 gcc_unreachable ();
9727 return false;
9730 /* The target attributes that we support. On top of these we also support just
9731 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9732 handled explicitly in aarch64_process_one_target_attr. */
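/* A few illustrative (not exhaustive) examples of strings accepted here:
   "arch=armv8-a+crc" (custom handler), "no-omit-leaf-frame-pointer"
   (negated boolean), "cmodel=small" (enum) and "+nothing+simd" (bare ISA
   extensions).  */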
9734 static const struct aarch64_attribute_info aarch64_attributes[] =
9736 { "general-regs-only", aarch64_attr_mask, false, NULL,
9737 OPT_mgeneral_regs_only },
9738 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9739 OPT_mfix_cortex_a53_835769 },
9740 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9741 OPT_mfix_cortex_a53_843419 },
9742 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9743 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9744 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9745 OPT_momit_leaf_frame_pointer },
9746 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9747 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9748 OPT_march_ },
9749 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9750 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9751 OPT_mtune_ },
9752 { "sign-return-address", aarch64_attr_enum, false, NULL,
9753 OPT_msign_return_address_ },
9754 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9757 /* Parse ARG_STR which contains the definition of one target attribute.
9758 Show appropriate errors if any or return true if the attribute is valid.
9759 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9760 we're processing a target attribute or pragma. */
9762 static bool
9763 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9765 bool invert = false;
9767 size_t len = strlen (arg_str);
9769 if (len == 0)
9771 error ("malformed target %s", pragma_or_attr);
9772 return false;
9775 char *str_to_check = (char *) alloca (len + 1);
9776 strcpy (str_to_check, arg_str);
9778 /* Skip leading whitespace. */
9779 while (*str_to_check == ' ' || *str_to_check == '\t')
9780 str_to_check++;
9782 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9783 It is easier to detect and handle it explicitly here rather than going
9784 through the machinery for the rest of the target attributes in this
9785 function. */
9786 if (*str_to_check == '+')
9787 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9789 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9791 invert = true;
9792 str_to_check += 3;
9794 char *arg = strchr (str_to_check, '=');
9796 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9797 and point ARG to "foo". */
9798 if (arg)
9800 *arg = '\0';
9801 arg++;
9803 const struct aarch64_attribute_info *p_attr;
9804 bool found = false;
9805 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9807 /* If the names don't match up, or the user has given an argument
9808 to an attribute that doesn't accept one, or didn't give an argument
9809 to an attribute that expects one, fail to match. */
9810 if (strcmp (str_to_check, p_attr->name) != 0)
9811 continue;
9813 found = true;
9814 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9815 || p_attr->attr_type == aarch64_attr_enum;
9817 if (attr_need_arg_p ^ (arg != NULL))
9819 error ("target %s %qs does not accept an argument",
9820 pragma_or_attr, str_to_check);
9821 return false;
9824 /* If the name matches but the attribute does not allow "no-" versions
9825 then we can't match. */
9826 if (invert && !p_attr->allow_neg)
9828 error ("target %s %qs does not allow a negated form",
9829 pragma_or_attr, str_to_check);
9830 return false;
9833 switch (p_attr->attr_type)
9835 /* Has a custom handler registered.
9836 For example, cpu=, arch=, tune=. */
9837 case aarch64_attr_custom:
9838 gcc_assert (p_attr->handler);
9839 if (!p_attr->handler (arg, pragma_or_attr))
9840 return false;
9841 break;
9843 /* Either set or unset a boolean option. */
9844 case aarch64_attr_bool:
9846 struct cl_decoded_option decoded;
9848 generate_option (p_attr->opt_num, NULL, !invert,
9849 CL_TARGET, &decoded);
9850 aarch64_handle_option (&global_options, &global_options_set,
9851 &decoded, input_location);
9852 break;
9854 /* Set or unset a bit in the target_flags. aarch64_handle_option
9855 should know what mask to apply given the option number. */
9856 case aarch64_attr_mask:
9858 struct cl_decoded_option decoded;
9859 /* We only need to specify the option number.
9860 aarch64_handle_option will know which mask to apply. */
9861 decoded.opt_index = p_attr->opt_num;
9862 decoded.value = !invert;
9863 aarch64_handle_option (&global_options, &global_options_set,
9864 &decoded, input_location);
9865 break;
9867 /* Use the option setting machinery to set an option to an enum. */
9868 case aarch64_attr_enum:
9870 gcc_assert (arg);
9871 bool valid;
9872 int value;
9873 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9874 &value, CL_TARGET);
9875 if (valid)
9877 set_option (&global_options, NULL, p_attr->opt_num, value,
9878 NULL, DK_UNSPECIFIED, input_location,
9879 global_dc);
9881 else
9883 error ("target %s %s=%s is not valid",
9884 pragma_or_attr, str_to_check, arg);
9886 break;
9888 default:
9889 gcc_unreachable ();
9893 /* If we reached here we either have found an attribute and validated
9894 it or didn't match any. If we matched an attribute but its arguments
9895 were malformed we will have returned false already. */
9896 return found;
9899 /* Count how many times the character C appears in
9900 the NUL-terminated string STR. */
9902 static unsigned int
9903 num_occurences_in_str (char c, char *str)
9905 unsigned int res = 0;
9906 while (*str != '\0')
9908 if (*str == c)
9909 res++;
9911 str++;
9914 return res;
9917 /* Parse the tree in ARGS that contains the target attribute information
9918 and update the global target options space. PRAGMA_OR_ATTR is a string
9919 to be used in error messages, specifying whether this is processing
9920 a target attribute or a target pragma. */
9922 bool
9923 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9925 if (TREE_CODE (args) == TREE_LIST)
9929 tree head = TREE_VALUE (args);
9930 if (head)
9932 if (!aarch64_process_target_attr (head, pragma_or_attr))
9933 return false;
9935 args = TREE_CHAIN (args);
9936 } while (args);
9938 return true;
9941 if (TREE_CODE (args) != STRING_CST)
9943 error ("attribute %<target%> argument not a string");
9944 return false;
9947 size_t len = strlen (TREE_STRING_POINTER (args));
9948 char *str_to_check = (char *) alloca (len + 1);
9949 strcpy (str_to_check, TREE_STRING_POINTER (args));
9951 if (len == 0)
9953 error ("malformed target %s value", pragma_or_attr);
9954 return false;
9957 /* Used to catch empty strings between commas, e.g.
9958 attribute ((target ("attr1,,attr2"))). */
9959 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9961 /* Handle multiple target attributes separated by ','. */
9962 char *token = strtok (str_to_check, ",");
9964 unsigned int num_attrs = 0;
9965 while (token)
9967 num_attrs++;
9968 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9970 error ("target %s %qs is invalid", pragma_or_attr, token);
9971 return false;
9974 token = strtok (NULL, ",");
9977 if (num_attrs != num_commas + 1)
9979 error ("malformed target %s list %qs",
9980 pragma_or_attr, TREE_STRING_POINTER (args));
9981 return false;
9984 return true;
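/* For illustration (documentation only): the string from
   __attribute__ ((target ("arch=armv8-a,cmodel=small"))) is split into
   the two tokens "arch=armv8-a" and "cmodel=small", each handed to
   aarch64_process_one_target_attr.  Because strtok silently skips empty
   fields, a malformed list such as "arch=armv8-a,,cmodel=small" would
   otherwise go unnoticed; the num_attrs != num_commas + 1 check above
   (3 expected tokens vs. 2 found) is what rejects it.  */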
9987 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9988 process attribute ((target ("..."))). */
9990 static bool
9991 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9993 struct cl_target_option cur_target;
9994 bool ret;
9995 tree old_optimize;
9996 tree new_target, new_optimize;
9997 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9999 /* If what we're processing is the current pragma string then the
10000 target option node is already stored in target_option_current_node
10001 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
10002 having to re-parse the string. This is especially useful to keep
10003 arm_neon.h compile times down since that header contains a lot
10004 of intrinsics enclosed in pragmas. */
10005 if (!existing_target && args == current_target_pragma)
10007 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10008 return true;
10010 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10012 old_optimize = build_optimization_node (&global_options);
10013 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10015 /* If the function changed the optimization levels as well as setting
10016 target options, start with the optimizations specified. */
10017 if (func_optimize && func_optimize != old_optimize)
10018 cl_optimization_restore (&global_options,
10019 TREE_OPTIMIZATION (func_optimize));
10021 /* Save the current target options to restore at the end. */
10022 cl_target_option_save (&cur_target, &global_options);
10024 /* If fndecl already has some target attributes applied to it, unpack
10025 them so that we add this attribute on top of them, rather than
10026 overwriting them. */
10027 if (existing_target)
10029 struct cl_target_option *existing_options
10030 = TREE_TARGET_OPTION (existing_target);
10032 if (existing_options)
10033 cl_target_option_restore (&global_options, existing_options);
10035 else
10036 cl_target_option_restore (&global_options,
10037 TREE_TARGET_OPTION (target_option_current_node));
10040 ret = aarch64_process_target_attr (args, "attribute");
10042 /* Set up any additional state. */
10043 if (ret)
10045 aarch64_override_options_internal (&global_options);
10046 /* Initialize SIMD builtins if we haven't already.
10047 Set current_target_pragma to NULL for the duration so that
10048 the builtin initialization code doesn't try to tag the functions
10049 being built with the attributes specified by any current pragma, thus
10050 going into an infinite recursion. */
10051 if (TARGET_SIMD)
10053 tree saved_current_target_pragma = current_target_pragma;
10054 current_target_pragma = NULL;
10055 aarch64_init_simd_builtins ();
10056 current_target_pragma = saved_current_target_pragma;
10058 new_target = build_target_option_node (&global_options);
10060 else
10061 new_target = NULL;
10063 new_optimize = build_optimization_node (&global_options);
10065 if (fndecl && ret)
10067 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10069 if (old_optimize != new_optimize)
10070 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10073 cl_target_option_restore (&global_options, &cur_target);
10075 if (old_optimize != new_optimize)
10076 cl_optimization_restore (&global_options,
10077 TREE_OPTIMIZATION (old_optimize));
10078 return ret;
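/* Documentation-only sketch of the fast path above: for a header compiled
   under something like

     #pragma GCC target ("+simd")
     ... many intrinsic declarations ...

   every declaration picks up ARGS == current_target_pragma, so the decl is
   simply tagged with target_option_current_node instead of re-parsing the
   same string for each function (assuming the extension string is accepted
   by the pragma handling in aarch64-c.c).  */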
10081 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10082 tri-bool options (yes, no, don't care) and the default value is
10083 DEF, determine whether to reject inlining. */
10085 static bool
10086 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10087 int dont_care, int def)
10089 /* If the callee doesn't care, always allow inlining. */
10090 if (callee == dont_care)
10091 return true;
10093 /* If the caller doesn't care, always allow inlining. */
10094 if (caller == dont_care)
10095 return true;
10097 /* Otherwise, allow inlining if either the callee and caller values
10098 agree, or if the callee is using the default value. */
10099 return (callee == caller || callee == def);
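/* A worked illustration (documentation only), with DONT_CARE == 2 as used
   by the callers below and DEF the command-line default:

     caller  callee   result
       2       *      inline OK (caller doesn't care)
       *       2      inline OK (callee doesn't care)
       1       1      inline OK (values agree)
       0       1      inline OK only when DEF == 1 (callee at the default)  */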
10102 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10103 to inline CALLEE into CALLER based on target-specific info.
10104 Make sure that the caller and callee have compatible architectural
10105 features. Then go through the other possible target attributes
10106 and see if they can block inlining. Try not to reject always_inline
10107 callees unless they are incompatible architecturally. */
10109 static bool
10110 aarch64_can_inline_p (tree caller, tree callee)
10112 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10113 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10115 /* If callee has no option attributes, then it is ok to inline. */
10116 if (!callee_tree)
10117 return true;
10119 struct cl_target_option *caller_opts
10120 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10121 : target_option_default_node);
10123 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10126 /* Callee's ISA flags should be a subset of the caller's. */
10127 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10128 != callee_opts->x_aarch64_isa_flags)
10129 return false;
10131 /* Allow a non-strict-align callee to be inlined into a strict-align
10132 caller, but not the other way around. */
10133 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10134 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10135 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10136 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10137 return false;
10139 bool always_inline = lookup_attribute ("always_inline",
10140 DECL_ATTRIBUTES (callee));
10142 /* If the architectural features match up and the callee is always_inline
10143 then the other attributes don't matter. */
10144 if (always_inline)
10145 return true;
10147 if (caller_opts->x_aarch64_cmodel_var
10148 != callee_opts->x_aarch64_cmodel_var)
10149 return false;
10151 if (caller_opts->x_aarch64_tls_dialect
10152 != callee_opts->x_aarch64_tls_dialect)
10153 return false;
10155 /* Honour explicit requests to workaround errata. */
10156 if (!aarch64_tribools_ok_for_inlining_p (
10157 caller_opts->x_aarch64_fix_a53_err835769,
10158 callee_opts->x_aarch64_fix_a53_err835769,
10159 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10160 return false;
10162 if (!aarch64_tribools_ok_for_inlining_p (
10163 caller_opts->x_aarch64_fix_a53_err843419,
10164 callee_opts->x_aarch64_fix_a53_err843419,
10165 2, TARGET_FIX_ERR_A53_843419))
10166 return false;
10168 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10169 caller and callee and they don't match up, reject inlining. */
10170 if (!aarch64_tribools_ok_for_inlining_p (
10171 caller_opts->x_flag_omit_leaf_frame_pointer,
10172 callee_opts->x_flag_omit_leaf_frame_pointer,
10173 2, 1))
10174 return false;
10176 /* If the callee has specific tuning overrides, respect them. */
10177 if (callee_opts->x_aarch64_override_tune_string != NULL
10178 && caller_opts->x_aarch64_override_tune_string == NULL)
10179 return false;
10181 /* If the user specified tuning override strings for the
10182 caller and callee and they don't match up, reject inlining.
10183 We just do a string compare here, we don't analyze the meaning
10184 of the string, as it would be too costly for little gain. */
10185 if (callee_opts->x_aarch64_override_tune_string
10186 && caller_opts->x_aarch64_override_tune_string
10187 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10188 caller_opts->x_aarch64_override_tune_string) != 0))
10189 return false;
10191 return true;
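/* Illustrative example (documentation only) of the ISA-subset rule above:
   with a caller compiled for plain -march=armv8-a and a callee carrying
   __attribute__ ((target ("+crc"))) (assuming the extension syntax is
   accepted), the callee's ISA flags include the CRC bit that the caller
   lacks, so the AND above differs from the callee's flags and inlining is
   refused; the reverse direction, a plainer callee into a more capable
   caller, is allowed.  */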
10194 /* Return true if SYMBOL_REF X binds locally. */
10196 static bool
10197 aarch64_symbol_binds_local_p (const_rtx x)
10199 return (SYMBOL_REF_DECL (x)
10200 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10201 : SYMBOL_REF_LOCAL_P (x));
10204 /* Return true if SYMBOL_REF X is thread local */
10205 static bool
10206 aarch64_tls_symbol_p (rtx x)
10208 if (! TARGET_HAVE_TLS)
10209 return false;
10211 if (GET_CODE (x) != SYMBOL_REF)
10212 return false;
10214 return SYMBOL_REF_TLS_MODEL (x) != 0;
10217 /* Classify a TLS symbol into one of the TLS kinds. */
10218 enum aarch64_symbol_type
10219 aarch64_classify_tls_symbol (rtx x)
10221 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10223 switch (tls_kind)
10225 case TLS_MODEL_GLOBAL_DYNAMIC:
10226 case TLS_MODEL_LOCAL_DYNAMIC:
10227 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10229 case TLS_MODEL_INITIAL_EXEC:
10230 switch (aarch64_cmodel)
10232 case AARCH64_CMODEL_TINY:
10233 case AARCH64_CMODEL_TINY_PIC:
10234 return SYMBOL_TINY_TLSIE;
10235 default:
10236 return SYMBOL_SMALL_TLSIE;
10239 case TLS_MODEL_LOCAL_EXEC:
10240 if (aarch64_tls_size == 12)
10241 return SYMBOL_TLSLE12;
10242 else if (aarch64_tls_size == 24)
10243 return SYMBOL_TLSLE24;
10244 else if (aarch64_tls_size == 32)
10245 return SYMBOL_TLSLE32;
10246 else if (aarch64_tls_size == 48)
10247 return SYMBOL_TLSLE48;
10248 else
10249 gcc_unreachable ();
10251 case TLS_MODEL_EMULATED:
10252 case TLS_MODEL_NONE:
10253 return SYMBOL_FORCE_TO_MEM;
10255 default:
10256 gcc_unreachable ();
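/* Documentation-only summary of the mapping above: with TLS descriptors
   enabled (the default dialect), global-dynamic and local-dynamic accesses
   classify as SYMBOL_SMALL_TLSDESC; initial-exec becomes SYMBOL_SMALL_TLSIE
   (SYMBOL_TINY_TLSIE for the tiny code models); and local-exec picks one of
   the SYMBOL_TLSLE* variants according to the -mtls-size setting
   (12, 24, 32 or 48 bits).  */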
10260 /* Return the method that should be used to access SYMBOL_REF or
10261 LABEL_REF X. */
10263 enum aarch64_symbol_type
10264 aarch64_classify_symbol (rtx x, rtx offset)
10266 if (GET_CODE (x) == LABEL_REF)
10268 switch (aarch64_cmodel)
10270 case AARCH64_CMODEL_LARGE:
10271 return SYMBOL_FORCE_TO_MEM;
10273 case AARCH64_CMODEL_TINY_PIC:
10274 case AARCH64_CMODEL_TINY:
10275 return SYMBOL_TINY_ABSOLUTE;
10277 case AARCH64_CMODEL_SMALL_SPIC:
10278 case AARCH64_CMODEL_SMALL_PIC:
10279 case AARCH64_CMODEL_SMALL:
10280 return SYMBOL_SMALL_ABSOLUTE;
10282 default:
10283 gcc_unreachable ();
10287 if (GET_CODE (x) == SYMBOL_REF)
10289 if (aarch64_tls_symbol_p (x))
10290 return aarch64_classify_tls_symbol (x);
10292 switch (aarch64_cmodel)
10294 case AARCH64_CMODEL_TINY:
10295 /* When we retrieve symbol + offset address, we have to make sure
10296 the offset does not cause overflow of the final address. But
10297 we have no way of knowing the address of symbol at compile time
10298 so we can't accurately say if the distance between the PC and
10300 symbol + offset is outside the addressable range of +/-1M in the
10300 TINY code model. So we rely on images not being greater than
10301 1M and cap the offset at 1M and anything beyond 1M will have to
10302 be loaded using an alternative mechanism. Furthermore if the
10303 symbol is a weak reference to something that isn't known to
10304 resolve to a symbol in this module, then force to memory. */
10305 if ((SYMBOL_REF_WEAK (x)
10306 && !aarch64_symbol_binds_local_p (x))
10307 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10308 return SYMBOL_FORCE_TO_MEM;
10309 return SYMBOL_TINY_ABSOLUTE;
10311 case AARCH64_CMODEL_SMALL:
10312 /* Same reasoning as the tiny code model, but the offset cap here is
10313 4G. */
10314 if ((SYMBOL_REF_WEAK (x)
10315 && !aarch64_symbol_binds_local_p (x))
10316 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10317 HOST_WIDE_INT_C (4294967264)))
10318 return SYMBOL_FORCE_TO_MEM;
10319 return SYMBOL_SMALL_ABSOLUTE;
10321 case AARCH64_CMODEL_TINY_PIC:
10322 if (!aarch64_symbol_binds_local_p (x))
10323 return SYMBOL_TINY_GOT;
10324 return SYMBOL_TINY_ABSOLUTE;
10326 case AARCH64_CMODEL_SMALL_SPIC:
10327 case AARCH64_CMODEL_SMALL_PIC:
10328 if (!aarch64_symbol_binds_local_p (x))
10329 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10330 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10331 return SYMBOL_SMALL_ABSOLUTE;
10333 case AARCH64_CMODEL_LARGE:
10334 /* This is alright even in PIC code as the constant
10335 pool reference is always PC relative and within
10336 the same translation unit. */
10337 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10338 return SYMBOL_SMALL_ABSOLUTE;
10339 else
10340 return SYMBOL_FORCE_TO_MEM;
10342 default:
10343 gcc_unreachable ();
10347 /* By default push everything into the constant pool. */
10348 return SYMBOL_FORCE_TO_MEM;
10351 bool
10352 aarch64_constant_address_p (rtx x)
10354 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10357 bool
10358 aarch64_legitimate_pic_operand_p (rtx x)
10360 if (GET_CODE (x) == SYMBOL_REF
10361 || (GET_CODE (x) == CONST
10362 && GET_CODE (XEXP (x, 0)) == PLUS
10363 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10364 return false;
10366 return true;
10369 /* Return true if X holds a floating-point constant that is either
10370 representable in quarter-precision form or can be handled as integer moves. */
10371 static bool
10372 aarch64_valid_floating_const (rtx x)
10374 if (!CONST_DOUBLE_P (x))
10375 return false;
10377 /* This call determines which constants can be used in mov<mode>
10378 as integer moves instead of constant loads. */
10379 if (aarch64_float_const_rtx_p (x))
10380 return true;
10382 return aarch64_float_const_representable_p (x);
10385 static bool
10386 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10388 /* Do not allow vector struct mode constants. We could support
10389 0 and -1 easily, but they need support in aarch64-simd.md. */
10390 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10391 return false;
10393 /* For these cases we never want to use a literal load.
10394 As such we have to prevent the compiler from forcing these
10395 to memory. */
10396 if ((GET_CODE (x) == CONST_VECTOR
10397 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10398 || CONST_INT_P (x)
10399 || aarch64_valid_floating_const (x)
10400 || aarch64_can_const_movi_rtx_p (x, mode)
10401 || aarch64_float_const_rtx_p (x))
10402 return !targetm.cannot_force_const_mem (mode, x);
10404 if (GET_CODE (x) == HIGH
10405 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10406 return true;
10408 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10409 so spilling them is better than rematerialization. */
10410 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10411 return true;
10413 return aarch64_constant_address_p (x);
10417 aarch64_load_tp (rtx target)
10419 if (!target
10420 || GET_MODE (target) != Pmode
10421 || !register_operand (target, Pmode))
10422 target = gen_reg_rtx (Pmode);
10424 /* Can return in any reg. */
10425 emit_insn (gen_aarch64_load_tp_hard (target));
10426 return target;
10429 /* On AAPCS systems, this is the "struct __va_list". */
10430 static GTY(()) tree va_list_type;
10432 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10433 Return the type to use as __builtin_va_list.
10435 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10437 struct __va_list
10439 void *__stack;
10440 void *__gr_top;
10441 void *__vr_top;
10442 int __gr_offs;
10443 int __vr_offs;
10444 }; */
10446 static tree
10447 aarch64_build_builtin_va_list (void)
10449 tree va_list_name;
10450 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10452 /* Create the type. */
10453 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10454 /* Give it the required name. */
10455 va_list_name = build_decl (BUILTINS_LOCATION,
10456 TYPE_DECL,
10457 get_identifier ("__va_list"),
10458 va_list_type);
10459 DECL_ARTIFICIAL (va_list_name) = 1;
10460 TYPE_NAME (va_list_type) = va_list_name;
10461 TYPE_STUB_DECL (va_list_type) = va_list_name;
10463 /* Create the fields. */
10464 f_stack = build_decl (BUILTINS_LOCATION,
10465 FIELD_DECL, get_identifier ("__stack"),
10466 ptr_type_node);
10467 f_grtop = build_decl (BUILTINS_LOCATION,
10468 FIELD_DECL, get_identifier ("__gr_top"),
10469 ptr_type_node);
10470 f_vrtop = build_decl (BUILTINS_LOCATION,
10471 FIELD_DECL, get_identifier ("__vr_top"),
10472 ptr_type_node);
10473 f_groff = build_decl (BUILTINS_LOCATION,
10474 FIELD_DECL, get_identifier ("__gr_offs"),
10475 integer_type_node);
10476 f_vroff = build_decl (BUILTINS_LOCATION,
10477 FIELD_DECL, get_identifier ("__vr_offs"),
10478 integer_type_node);
10480 /* Tell tree-stdarg pass about our internal offset fields.
10481 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10482 purposes, to identify whether the code is updating the va_list internal
10483 offset fields in an irregular way. */
10484 va_list_gpr_counter_field = f_groff;
10485 va_list_fpr_counter_field = f_vroff;
10487 DECL_ARTIFICIAL (f_stack) = 1;
10488 DECL_ARTIFICIAL (f_grtop) = 1;
10489 DECL_ARTIFICIAL (f_vrtop) = 1;
10490 DECL_ARTIFICIAL (f_groff) = 1;
10491 DECL_ARTIFICIAL (f_vroff) = 1;
10493 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10494 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10495 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10496 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10497 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10499 TYPE_FIELDS (va_list_type) = f_stack;
10500 DECL_CHAIN (f_stack) = f_grtop;
10501 DECL_CHAIN (f_grtop) = f_vrtop;
10502 DECL_CHAIN (f_vrtop) = f_groff;
10503 DECL_CHAIN (f_groff) = f_vroff;
10505 /* Compute its layout. */
10506 layout_type (va_list_type);
10508 return va_list_type;
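/* Note for readers (documentation only): the field order created above is
   relied on positionally by aarch64_expand_builtin_va_start and
   aarch64_gimplify_va_arg_expr below, which walk the DECL_CHAIN:

     f_stack = TYPE_FIELDS (va_list_type_node);    __stack
     f_grtop = DECL_CHAIN (f_stack);               __gr_top
     f_vrtop = DECL_CHAIN (f_grtop);               __vr_top
     f_groff = DECL_CHAIN (f_vrtop);               __gr_offs
     f_vroff = DECL_CHAIN (f_groff);               __vr_offs  */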
10511 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10512 static void
10513 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10515 const CUMULATIVE_ARGS *cum;
10516 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10517 tree stack, grtop, vrtop, groff, vroff;
10518 tree t;
10519 int gr_save_area_size = cfun->va_list_gpr_size;
10520 int vr_save_area_size = cfun->va_list_fpr_size;
10521 int vr_offset;
10523 cum = &crtl->args.info;
10524 if (cfun->va_list_gpr_size)
10525 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10526 cfun->va_list_gpr_size);
10527 if (cfun->va_list_fpr_size)
10528 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10529 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10531 if (!TARGET_FLOAT)
10533 gcc_assert (cum->aapcs_nvrn == 0);
10534 vr_save_area_size = 0;
10537 f_stack = TYPE_FIELDS (va_list_type_node);
10538 f_grtop = DECL_CHAIN (f_stack);
10539 f_vrtop = DECL_CHAIN (f_grtop);
10540 f_groff = DECL_CHAIN (f_vrtop);
10541 f_vroff = DECL_CHAIN (f_groff);
10543 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10544 NULL_TREE);
10545 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10546 NULL_TREE);
10547 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10548 NULL_TREE);
10549 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10550 NULL_TREE);
10551 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10552 NULL_TREE);
10554 /* Emit code to initialize STACK, which points to the next varargs stack
10555 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10556 by named arguments. STACK is 8-byte aligned. */
10557 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10558 if (cum->aapcs_stack_size > 0)
10559 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10560 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10561 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10563 /* Emit code to initialize GRTOP, the top of the GR save area.
10564 virtual_incoming_args_rtx should have been 16 byte aligned. */
10565 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10566 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10567 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10569 /* Emit code to initialize VRTOP, the top of the VR save area.
10570 This address is gr_save_area_bytes below GRTOP, rounded
10571 down to the next 16-byte boundary. */
10572 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10573 vr_offset = ROUND_UP (gr_save_area_size,
10574 STACK_BOUNDARY / BITS_PER_UNIT);
10576 if (vr_offset)
10577 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10578 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10579 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10581 /* Emit code to initialize GROFF, the offset from GRTOP of the
10582 next GPR argument. */
10583 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10584 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10585 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10587 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10588 of the next VR argument. */
10589 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10590 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10591 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
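/* A worked example (documentation only), assuming the usual LP64 values
   NUM_ARG_REGS == 8, UNITS_PER_WORD == 8, NUM_FP_ARG_REGS == 8 and
   UNITS_PER_VREG == 16, and assuming the tree-stdarg pass has not shrunk
   the save areas: for

     void f (int x, ...);

   one GR register is consumed by the named argument, so
   gr_save_area_size == 7 * 8 == 56 and vr_save_area_size == 8 * 16 == 128.
   va_start then sets __gr_offs = -56, __vr_offs = -128, __gr_top to the
   incoming-argument pointer, __vr_top 64 bytes below it (56 rounded up to
   16), and __stack to the first anonymous stack argument.  */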
10594 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10596 static tree
10597 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10598 gimple_seq *post_p ATTRIBUTE_UNUSED)
10600 tree addr;
10601 bool indirect_p;
10602 bool is_ha; /* is HFA or HVA. */
10603 bool dw_align; /* double-word align. */
10604 machine_mode ag_mode = VOIDmode;
10605 int nregs;
10606 machine_mode mode;
10608 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10609 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10610 HOST_WIDE_INT size, rsize, adjust, align;
10611 tree t, u, cond1, cond2;
10613 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10614 if (indirect_p)
10615 type = build_pointer_type (type);
10617 mode = TYPE_MODE (type);
10619 f_stack = TYPE_FIELDS (va_list_type_node);
10620 f_grtop = DECL_CHAIN (f_stack);
10621 f_vrtop = DECL_CHAIN (f_grtop);
10622 f_groff = DECL_CHAIN (f_vrtop);
10623 f_vroff = DECL_CHAIN (f_groff);
10625 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10626 f_stack, NULL_TREE);
10627 size = int_size_in_bytes (type);
10628 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10630 dw_align = false;
10631 adjust = 0;
10632 if (aarch64_vfp_is_call_or_return_candidate (mode,
10633 type,
10634 &ag_mode,
10635 &nregs,
10636 &is_ha))
10638 /* TYPE passed in fp/simd registers. */
10639 if (!TARGET_FLOAT)
10640 aarch64_err_no_fpadvsimd (mode, "varargs");
10642 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10643 unshare_expr (valist), f_vrtop, NULL_TREE);
10644 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10645 unshare_expr (valist), f_vroff, NULL_TREE);
10647 rsize = nregs * UNITS_PER_VREG;
10649 if (is_ha)
10651 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10652 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10654 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10655 && size < UNITS_PER_VREG)
10657 adjust = UNITS_PER_VREG - size;
10660 else
10662 /* TYPE passed in general registers. */
10663 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10664 unshare_expr (valist), f_grtop, NULL_TREE);
10665 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10666 unshare_expr (valist), f_groff, NULL_TREE);
10667 rsize = ROUND_UP (size, UNITS_PER_WORD);
10668 nregs = rsize / UNITS_PER_WORD;
10670 if (align > 8)
10671 dw_align = true;
10673 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10674 && size < UNITS_PER_WORD)
10676 adjust = UNITS_PER_WORD - size;
10680 /* Get a local temporary for the field value. */
10681 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10683 /* Emit code to branch if off >= 0. */
10684 t = build2 (GE_EXPR, boolean_type_node, off,
10685 build_int_cst (TREE_TYPE (off), 0));
10686 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10688 if (dw_align)
10690 /* Emit: offs = (offs + 15) & -16. */
10691 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10692 build_int_cst (TREE_TYPE (off), 15));
10693 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10694 build_int_cst (TREE_TYPE (off), -16));
10695 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10697 else
10698 roundup = NULL;
10700 /* Update ap.__[g|v]r_offs */
10701 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10702 build_int_cst (TREE_TYPE (off), rsize));
10703 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10705 /* String up. */
10706 if (roundup)
10707 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10709 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10710 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10711 build_int_cst (TREE_TYPE (f_off), 0));
10712 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10714 /* String up: make sure the assignment happens before the use. */
10715 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10716 COND_EXPR_ELSE (cond1) = t;
10718 /* Prepare the trees handling the argument that is passed on the stack;
10719 the top-level node will be stored in ON_STACK. */
10720 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10721 if (align > 8)
10723 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10724 t = fold_convert (intDI_type_node, arg);
10725 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10726 build_int_cst (TREE_TYPE (t), 15));
10727 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10728 build_int_cst (TREE_TYPE (t), -16));
10729 t = fold_convert (TREE_TYPE (arg), t);
10730 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10732 else
10733 roundup = NULL;
10734 /* Advance ap.__stack */
10735 t = fold_convert (intDI_type_node, arg);
10736 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10737 build_int_cst (TREE_TYPE (t), size + 7));
10738 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10739 build_int_cst (TREE_TYPE (t), -8));
10740 t = fold_convert (TREE_TYPE (arg), t);
10741 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10742 /* String up roundup and advance. */
10743 if (roundup)
10744 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10745 /* String up with arg */
10746 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10747 /* Big-endianness related address adjustment. */
10748 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10749 && size < UNITS_PER_WORD)
10751 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10752 size_int (UNITS_PER_WORD - size));
10753 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10756 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10757 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10759 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10760 t = off;
10761 if (adjust)
10762 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10763 build_int_cst (TREE_TYPE (off), adjust));
10765 t = fold_convert (sizetype, t);
10766 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10768 if (is_ha)
10770 /* type ha; // treat as "struct {ftype field[n];}"
10771 ... [computing offs]
10772 for (i = 0; i < nregs; ++i, offs += 16)
10773 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10774 return ha; */
10775 int i;
10776 tree tmp_ha, field_t, field_ptr_t;
10778 /* Declare a local variable. */
10779 tmp_ha = create_tmp_var_raw (type, "ha");
10780 gimple_add_tmp_var (tmp_ha);
10782 /* Establish the base type. */
10783 switch (ag_mode)
10785 case E_SFmode:
10786 field_t = float_type_node;
10787 field_ptr_t = float_ptr_type_node;
10788 break;
10789 case E_DFmode:
10790 field_t = double_type_node;
10791 field_ptr_t = double_ptr_type_node;
10792 break;
10793 case E_TFmode:
10794 field_t = long_double_type_node;
10795 field_ptr_t = long_double_ptr_type_node;
10796 break;
10797 case E_HFmode:
10798 field_t = aarch64_fp16_type_node;
10799 field_ptr_t = aarch64_fp16_ptr_type_node;
10800 break;
10801 case E_V2SImode:
10802 case E_V4SImode:
10804 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10805 field_t = build_vector_type_for_mode (innertype, ag_mode);
10806 field_ptr_t = build_pointer_type (field_t);
10808 break;
10809 default:
10810 gcc_assert (0);
10813 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10814 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10815 addr = t;
10816 t = fold_convert (field_ptr_t, addr);
10817 t = build2 (MODIFY_EXPR, field_t,
10818 build1 (INDIRECT_REF, field_t, tmp_ha),
10819 build1 (INDIRECT_REF, field_t, t));
10821 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10822 for (i = 1; i < nregs; ++i)
10824 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10825 u = fold_convert (field_ptr_t, addr);
10826 u = build2 (MODIFY_EXPR, field_t,
10827 build2 (MEM_REF, field_t, tmp_ha,
10828 build_int_cst (field_ptr_t,
10829 (i *
10830 int_size_in_bytes (field_t)))),
10831 build1 (INDIRECT_REF, field_t, u));
10832 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10835 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10836 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10839 COND_EXPR_ELSE (cond2) = t;
10840 addr = fold_convert (build_pointer_type (type), cond1);
10841 addr = build_va_arg_indirect_ref (addr);
10843 if (indirect_p)
10844 addr = build_va_arg_indirect_ref (addr);
10846 return addr;
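/* A documentation-only sketch, in C-like pseudo-code, of the GIMPLE built
   above for an argument that lives in the general-register save area
   (the FP/SIMD path is analogous, using the __vr_* fields and 16-byte
   register units):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                     register save area already used up
     ap.__gr_offs = off + rsize;          after 16-byte realign if dw_align
     if (ap.__gr_offs > 0)
       goto on_stack;                     argument would straddle the end
     addr = ap.__gr_top + off;            plus ADJUST for big-endian padding
     ...
   on_stack:
     addr = ap.__stack;                   realigned to 16 bytes if align > 8
     ap.__stack = (addr + size + 7) & -8;  */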
10849 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10851 static void
10852 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10853 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10854 int no_rtl)
10856 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10857 CUMULATIVE_ARGS local_cum;
10858 int gr_saved = cfun->va_list_gpr_size;
10859 int vr_saved = cfun->va_list_fpr_size;
10861 /* The caller has advanced CUM up to, but not beyond, the last named
10862 argument. Advance a local copy of CUM past the last "real" named
10863 argument, to find out how many registers are left over. */
10864 local_cum = *cum;
10865 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10867 /* Find out how many registers we need to save.
10868 Honor tree-stdarg analysis results. */
10869 if (cfun->va_list_gpr_size)
10870 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10871 cfun->va_list_gpr_size / UNITS_PER_WORD);
10872 if (cfun->va_list_fpr_size)
10873 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10874 cfun->va_list_fpr_size / UNITS_PER_VREG);
10876 if (!TARGET_FLOAT)
10878 gcc_assert (local_cum.aapcs_nvrn == 0);
10879 vr_saved = 0;
10882 if (!no_rtl)
10884 if (gr_saved > 0)
10886 rtx ptr, mem;
10888 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10889 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10890 - gr_saved * UNITS_PER_WORD);
10891 mem = gen_frame_mem (BLKmode, ptr);
10892 set_mem_alias_set (mem, get_varargs_alias_set ());
10894 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10895 mem, gr_saved);
10897 if (vr_saved > 0)
10899 /* We can't use move_block_from_reg, because it will use
10900 the wrong mode, storing D regs only. */
10901 machine_mode mode = TImode;
10902 int off, i, vr_start;
10904 /* Set OFF to the offset from virtual_incoming_args_rtx of
10905 the first vector register. The VR save area lies below
10906 the GR one, and is aligned to 16 bytes. */
10907 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10908 STACK_BOUNDARY / BITS_PER_UNIT);
10909 off -= vr_saved * UNITS_PER_VREG;
10911 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10912 for (i = 0; i < vr_saved; ++i)
10914 rtx ptr, mem;
10916 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10917 mem = gen_frame_mem (mode, ptr);
10918 set_mem_alias_set (mem, get_varargs_alias_set ());
10919 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10920 off += UNITS_PER_VREG;
10925 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10926 any complication of having crtl->args.pretend_args_size changed. */
10927 cfun->machine->frame.saved_varargs_size
10928 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10929 STACK_BOUNDARY / BITS_PER_UNIT)
10930 + vr_saved * UNITS_PER_VREG);
10933 static void
10934 aarch64_conditional_register_usage (void)
10936 int i;
10937 if (!TARGET_FLOAT)
10939 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10941 fixed_regs[i] = 1;
10942 call_used_regs[i] = 1;
10947 /* Walk down the type tree of TYPE counting consecutive base elements.
10948 If *MODEP is VOIDmode, then set it to the first valid floating point
10949 type. If a non-floating point type is found, or if a floating point
10950 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10951 otherwise return the count in the sub-tree. */
10952 static int
10953 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10955 machine_mode mode;
10956 HOST_WIDE_INT size;
10958 switch (TREE_CODE (type))
10960 case REAL_TYPE:
10961 mode = TYPE_MODE (type);
10962 if (mode != DFmode && mode != SFmode
10963 && mode != TFmode && mode != HFmode)
10964 return -1;
10966 if (*modep == VOIDmode)
10967 *modep = mode;
10969 if (*modep == mode)
10970 return 1;
10972 break;
10974 case COMPLEX_TYPE:
10975 mode = TYPE_MODE (TREE_TYPE (type));
10976 if (mode != DFmode && mode != SFmode
10977 && mode != TFmode && mode != HFmode)
10978 return -1;
10980 if (*modep == VOIDmode)
10981 *modep = mode;
10983 if (*modep == mode)
10984 return 2;
10986 break;
10988 case VECTOR_TYPE:
10989 /* Use V2SImode and V4SImode as representatives of all 64-bit
10990 and 128-bit vector types. */
10991 size = int_size_in_bytes (type);
10992 switch (size)
10994 case 8:
10995 mode = V2SImode;
10996 break;
10997 case 16:
10998 mode = V4SImode;
10999 break;
11000 default:
11001 return -1;
11004 if (*modep == VOIDmode)
11005 *modep = mode;
11007 /* Vector modes are considered to be opaque: two vectors are
11008 equivalent for the purposes of being homogeneous aggregates
11009 if they are the same size. */
11010 if (*modep == mode)
11011 return 1;
11013 break;
11015 case ARRAY_TYPE:
11017 int count;
11018 tree index = TYPE_DOMAIN (type);
11020 /* Can't handle incomplete types nor sizes that are not
11021 fixed. */
11022 if (!COMPLETE_TYPE_P (type)
11023 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11024 return -1;
11026 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11027 if (count == -1
11028 || !index
11029 || !TYPE_MAX_VALUE (index)
11030 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11031 || !TYPE_MIN_VALUE (index)
11032 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11033 || count < 0)
11034 return -1;
11036 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11037 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11039 /* There must be no padding. */
11040 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11041 return -1;
11043 return count;
11046 case RECORD_TYPE:
11048 int count = 0;
11049 int sub_count;
11050 tree field;
11052 /* Can't handle incomplete types nor sizes that are not
11053 fixed. */
11054 if (!COMPLETE_TYPE_P (type)
11055 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11056 return -1;
11058 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11060 if (TREE_CODE (field) != FIELD_DECL)
11061 continue;
11063 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11064 if (sub_count < 0)
11065 return -1;
11066 count += sub_count;
11069 /* There must be no padding. */
11070 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11071 return -1;
11073 return count;
11076 case UNION_TYPE:
11077 case QUAL_UNION_TYPE:
11079 /* These aren't very interesting except in a degenerate case. */
11080 int count = 0;
11081 int sub_count;
11082 tree field;
11084 /* Can't handle incomplete types nor sizes that are not
11085 fixed. */
11086 if (!COMPLETE_TYPE_P (type)
11087 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11088 return -1;
11090 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11092 if (TREE_CODE (field) != FIELD_DECL)
11093 continue;
11095 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11096 if (sub_count < 0)
11097 return -1;
11098 count = count > sub_count ? count : sub_count;
11101 /* There must be no padding. */
11102 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11103 return -1;
11105 return count;
11108 default:
11109 break;
11112 return -1;
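/* Worked examples (documentation only) for the walk above, taking
   float32x4_t as the usual 16-byte vector type:

     struct { double x, y, z; }     -> 3 consecutive DFmode elements
     struct { float32x4_t a, b; }   -> 2 elements, represented as V4SImode
     struct { double d; float f; }  -> -1 (mixed element modes)

   Only the first two are candidates for passing in FP/SIMD registers.  */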
11115 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11116 type as described in AAPCS64 \S 4.1.2.
11118 See the comment above aarch64_composite_type_p for the notes on MODE. */
11120 static bool
11121 aarch64_short_vector_p (const_tree type,
11122 machine_mode mode)
11124 HOST_WIDE_INT size = -1;
11126 if (type && TREE_CODE (type) == VECTOR_TYPE)
11127 size = int_size_in_bytes (type);
11128 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11129 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11130 size = GET_MODE_SIZE (mode);
11132 return (size == 8 || size == 16);
11135 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11136 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11137 array types. The C99 floating-point complex types are also considered
11138 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11139 types, which are GCC extensions and out of the scope of AAPCS64, are
11140 treated as composite types here as well.
11142 Note that MODE itself is not sufficient in determining whether a type
11143 is such a composite type or not. This is because
11144 stor-layout.c:compute_record_mode may have already changed the MODE
11145 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11146 structure with only one field may have its MODE set to the mode of the
11147 field. Also an integer mode whose size matches the size of the
11148 RECORD_TYPE type may be used to substitute the original mode
11149 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11150 solely relied on. */
11152 static bool
11153 aarch64_composite_type_p (const_tree type,
11154 machine_mode mode)
11156 if (aarch64_short_vector_p (type, mode))
11157 return false;
11159 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11160 return true;
11162 if (mode == BLKmode
11163 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11164 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11165 return true;
11167 return false;
11170 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11171 shall be passed or returned in simd/fp register(s) (providing these
11172 parameter passing registers are available).
11174 Upon successful return, *COUNT returns the number of needed registers,
11175 *BASE_MODE returns the mode of the individual register and when IS_HA
11176 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11177 floating-point aggregate or a homogeneous short-vector aggregate. */
11179 static bool
11180 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11181 const_tree type,
11182 machine_mode *base_mode,
11183 int *count,
11184 bool *is_ha)
11186 machine_mode new_mode = VOIDmode;
11187 bool composite_p = aarch64_composite_type_p (type, mode);
11189 if (is_ha != NULL) *is_ha = false;
11191 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11192 || aarch64_short_vector_p (type, mode))
11194 *count = 1;
11195 new_mode = mode;
11197 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11199 if (is_ha != NULL) *is_ha = true;
11200 *count = 2;
11201 new_mode = GET_MODE_INNER (mode);
11203 else if (type && composite_p)
11205 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11207 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11209 if (is_ha != NULL) *is_ha = true;
11210 *count = ag_count;
11212 else
11213 return false;
11215 else
11216 return false;
11218 *base_mode = new_mode;
11219 return true;
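/* Documentation-only examples of the classification above:

     double                   -> *count = 1, *base_mode = DFmode, not an HA
     _Complex float           -> *count = 2, *base_mode = SFmode, *is_ha = true
     struct { double x, y; }  -> *count = 2, *base_mode = DFmode, *is_ha = true

   whereas struct { double x; int i; } fails the homogeneous-aggregate walk
   and the function returns false.  */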
11222 /* Implement TARGET_STRUCT_VALUE_RTX. */
11224 static rtx
11225 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11226 int incoming ATTRIBUTE_UNUSED)
11228 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11231 /* Implements target hook vector_mode_supported_p. */
11232 static bool
11233 aarch64_vector_mode_supported_p (machine_mode mode)
11235 if (TARGET_SIMD
11236 && (mode == V4SImode || mode == V8HImode
11237 || mode == V16QImode || mode == V2DImode
11238 || mode == V2SImode || mode == V4HImode
11239 || mode == V8QImode || mode == V2SFmode
11240 || mode == V4SFmode || mode == V2DFmode
11241 || mode == V4HFmode || mode == V8HFmode
11242 || mode == V1DFmode))
11243 return true;
11245 return false;
11248 /* Return appropriate SIMD container
11249 for MODE within a vector of WIDTH bits. */
11250 static machine_mode
11251 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11253 gcc_assert (width == 64 || width == 128);
11254 if (TARGET_SIMD)
11256 if (width == 128)
11257 switch (mode)
11259 case E_DFmode:
11260 return V2DFmode;
11261 case E_SFmode:
11262 return V4SFmode;
11263 case E_HFmode:
11264 return V8HFmode;
11265 case E_SImode:
11266 return V4SImode;
11267 case E_HImode:
11268 return V8HImode;
11269 case E_QImode:
11270 return V16QImode;
11271 case E_DImode:
11272 return V2DImode;
11273 default:
11274 break;
11276 else
11277 switch (mode)
11279 case E_SFmode:
11280 return V2SFmode;
11281 case E_HFmode:
11282 return V4HFmode;
11283 case E_SImode:
11284 return V2SImode;
11285 case E_HImode:
11286 return V4HImode;
11287 case E_QImode:
11288 return V8QImode;
11289 default:
11290 break;
11293 return word_mode;
11296 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11297 static machine_mode
11298 aarch64_preferred_simd_mode (scalar_mode mode)
11300 return aarch64_simd_container_mode (mode, 128);
11303 /* Return the bitmask of possible vector sizes for the vectorizer
11304 to iterate over. */
11305 static unsigned int
11306 aarch64_autovectorize_vector_sizes (void)
11308 return (16 | 8);
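/* Documentation-only illustration: the bitmask (16 | 8) advertises both
   16-byte and 8-byte vectors to the autovectorizer, matching
   aarch64_simd_container_mode above: SFmode elements map to V4SFmode in a
   128-bit container and V2SFmode in a 64-bit one, while unsupported
   combinations (or !TARGET_SIMD) fall back to word_mode.  */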
11311 /* Implement TARGET_MANGLE_TYPE. */
11313 static const char *
11314 aarch64_mangle_type (const_tree type)
11316 /* The AArch64 ABI documents say that "__va_list" has to be
11317 mangled as if it is in the "std" namespace. */
11318 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11319 return "St9__va_list";
11321 /* Half-precision float. */
11322 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11323 return "Dh";
11325 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11326 builtin types. */
11327 if (TYPE_NAME (type) != NULL)
11328 return aarch64_mangle_builtin_type (type);
11330 /* Use the default mangling. */
11331 return NULL;
11334 /* Find the first rtx_insn before insn that will generate an assembly
11335 instruction. */
11337 static rtx_insn *
11338 aarch64_prev_real_insn (rtx_insn *insn)
11340 if (!insn)
11341 return NULL;
11345 insn = prev_real_insn (insn);
11347 while (insn && recog_memoized (insn) < 0);
11349 return insn;
11352 static bool
11353 is_madd_op (enum attr_type t1)
11355 unsigned int i;
11356 /* A number of these may be AArch32 only. */
11357 enum attr_type mlatypes[] = {
11358 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11359 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11360 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11363 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11365 if (t1 == mlatypes[i])
11366 return true;
11369 return false;
11372 /* Check if there is a register dependency between a load and the insn
11373 for which we hold recog_data. */
11375 static bool
11376 dep_between_memop_and_curr (rtx memop)
11378 rtx load_reg;
11379 int opno;
11381 gcc_assert (GET_CODE (memop) == SET);
11383 if (!REG_P (SET_DEST (memop)))
11384 return false;
11386 load_reg = SET_DEST (memop);
11387 for (opno = 1; opno < recog_data.n_operands; opno++)
11389 rtx operand = recog_data.operand[opno];
11390 if (REG_P (operand)
11391 && reg_overlap_mentioned_p (load_reg, operand))
11392 return true;
11395 return false;
11399 /* When working around the Cortex-A53 erratum 835769,
11400 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11401 instruction and has a preceding memory instruction such that a NOP
11402 should be inserted between them. */
11404 bool
11405 aarch64_madd_needs_nop (rtx_insn* insn)
11407 enum attr_type attr_type;
11408 rtx_insn *prev;
11409 rtx body;
11411 if (!TARGET_FIX_ERR_A53_835769)
11412 return false;
11414 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11415 return false;
11417 attr_type = get_attr_type (insn);
11418 if (!is_madd_op (attr_type))
11419 return false;
11421 prev = aarch64_prev_real_insn (insn);
11422 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11423 Restore recog state to INSN to avoid state corruption. */
11424 extract_constrain_insn_cached (insn);
11426 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11427 return false;
11429 body = single_set (prev);
11431 /* If the previous insn is a memory op and there is no dependency between
11432 it and the DImode madd, emit a NOP between them. If body is NULL then we
11433 have a complex memory operation, probably a load/store pair.
11434 Be conservative for now and emit a NOP. */
11435 if (GET_MODE (recog_data.operand[0]) == DImode
11436 && (!body || !dep_between_memop_and_curr (body)))
11437 return true;
11439 return false;
11444 /* Implement FINAL_PRESCAN_INSN. */
11446 void
11447 aarch64_final_prescan_insn (rtx_insn *insn)
11449 if (aarch64_madd_needs_nop (insn))
11450 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11454 /* Return the equivalent letter for size. */
11455 static char
11456 sizetochar (int size)
11458 switch (size)
11460 case 64: return 'd';
11461 case 32: return 's';
11462 case 16: return 'h';
11463 case 8 : return 'b';
11464 default: gcc_unreachable ();
11468 /* Return true iff x is a uniform vector of floating-point
11469 constants, and the constant can be represented in
11470 quarter-precision form. Note that, since aarch64_float_const_representable_p
11471 rejects both +0.0 and -0.0, this function also rejects +0.0 and -0.0. */
11472 static bool
11473 aarch64_vect_float_const_representable_p (rtx x)
11475 rtx elt;
11476 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11477 && const_vec_duplicate_p (x, &elt)
11478 && aarch64_float_const_representable_p (elt));
11481 /* Return true for valid and false for invalid. */
11482 bool
11483 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11484 struct simd_immediate_info *info)
11486 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11487 matches = 1; \
11488 for (i = 0; i < idx; i += (STRIDE)) \
11489 if (!(TEST)) \
11490 matches = 0; \
11491 if (matches) \
11493 immtype = (CLASS); \
11494 elsize = (ELSIZE); \
11495 eshift = (SHIFT); \
11496 emvn = (NEG); \
11497 break; \
11500 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11501 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11502 unsigned char bytes[16];
11503 int immtype = -1, matches;
11504 unsigned int invmask = inverse ? 0xff : 0;
11505 int eshift, emvn;
11507 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11509 if (! (aarch64_simd_imm_zero_p (op, mode)
11510 || aarch64_vect_float_const_representable_p (op)))
11511 return false;
11513 if (info)
11515 rtx elt = CONST_VECTOR_ELT (op, 0);
11516 scalar_float_mode elt_mode
11517 = as_a <scalar_float_mode> (GET_MODE (elt));
11519 info->value = elt;
11520 info->element_width = GET_MODE_BITSIZE (elt_mode);
11521 info->mvn = false;
11522 info->shift = 0;
11525 return true;
11528 /* Splat vector constant out into a byte vector. */
11529 for (i = 0; i < n_elts; i++)
11531 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11532 it must be laid out in the vector register in reverse order. */
11533 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11534 unsigned HOST_WIDE_INT elpart;
11536 gcc_assert (CONST_INT_P (el));
11537 elpart = INTVAL (el);
11539 for (unsigned int byte = 0; byte < innersize; byte++)
11541 bytes[idx++] = (elpart & 0xff) ^ invmask;
11542 elpart >>= BITS_PER_UNIT;
11547 /* Sanity check. */
11548 gcc_assert (idx == GET_MODE_SIZE (mode));
11552 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11553 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11555 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11556 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11558 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11559 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11561 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11562 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11564 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11566 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11568 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11569 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11571 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11572 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11574 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11575 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11577 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11578 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11580 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11582 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11584 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11585 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11587 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11588 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11590 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11591 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11593 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11594 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11596 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11598 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11599 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11601 while (0);
11603 if (immtype == -1)
11604 return false;
11606 if (info)
11608 info->element_width = elsize;
11609 info->mvn = emvn != 0;
11610 info->shift = eshift;
11612 unsigned HOST_WIDE_INT imm = 0;
11614 if (immtype >= 12 && immtype <= 15)
11615 info->msl = true;
11617 /* Un-invert bytes of recognized vector, if necessary. */
11618 if (invmask != 0)
11619 for (i = 0; i < idx; i++)
11620 bytes[i] ^= invmask;
11622 if (immtype == 17)
11624 /* FIXME: Broken on 32-bit H_W_I hosts. */
11625 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11627 for (i = 0; i < 8; i++)
11628 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11629 << (i * BITS_PER_UNIT);
11632 info->value = GEN_INT (imm);
11634 else
11636 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11637 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11639 /* Construct 'abcdefgh' because the assembler cannot handle
11640 generic constants. */
11641 if (info->mvn)
11642 imm = ~imm;
11643 imm = (imm >> info->shift) & 0xff;
11644 info->value = GEN_INT (imm);
11648 return true;
11649 #undef CHECK
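/* A worked example (documentation only) of the byte matching above: a
   V4SImode vector with every element equal to 0x00AB0000 splats to the
   repeating byte pattern { 0x00, 0x00, 0xAB, 0x00, ... }, which matches the
   CHECK (4, 32, 2, ...) case.  The caller is therefore told
   element_width == 32, shift == 16, mvn == false and value == 0xab,
   i.e. the constant is a MOVI of 0xab shifted left by 16 in each
   32-bit lane.  */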
11652 /* Check whether immediate shift constants are within range. */
11653 bool
11654 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11656 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11657 if (left)
11658 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11659 else
11660 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11663 /* Return true if X is a uniform vector where all elements
11664 are either the floating-point constant 0.0 or the
11665 integer constant 0. */
11666 bool
11667 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11669 return x == CONST0_RTX (mode);
11673 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11674 operation of width WIDTH at bit position POS. */
11677 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11679 gcc_assert (CONST_INT_P (width));
11680 gcc_assert (CONST_INT_P (pos));
11682 unsigned HOST_WIDE_INT mask
11683 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11684 return GEN_INT (mask << UINTVAL (pos));
11687 bool
11688 aarch64_mov_operand_p (rtx x, machine_mode mode)
11690 if (GET_CODE (x) == HIGH
11691 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11692 return true;
11694 if (CONST_INT_P (x))
11695 return true;
11697 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11698 return true;
11700 return aarch64_classify_symbolic_expression (x)
11701 == SYMBOL_TINY_ABSOLUTE;
11704 /* Return a CONST_VECTOR with every element set to the CONST_INT VAL. */
11706 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11708 int nunits = GET_MODE_NUNITS (mode);
11709 rtvec v = rtvec_alloc (nunits);
11710 int i;
11712 rtx cache = GEN_INT (val);
11714 for (i = 0; i < nunits; i++)
11715 RTVEC_ELT (v, i) = cache;
11717 return gen_rtx_CONST_VECTOR (mode, v);
11720 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11722 bool
11723 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11725 machine_mode vmode;
11727 vmode = aarch64_preferred_simd_mode (mode);
11728 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11729 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11732 /* Construct and return a PARALLEL RTX vector with elements numbering the
11733 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11734 the vector - from the perspective of the architecture. This does not
11735 line up with GCC's perspective on lane numbers, so we end up with
11736 different masks depending on our target endian-ness. The diagram
11737 below may help. We must draw the distinction when building masks
11738 which select one half of the vector. An instruction selecting
11739 architectural low-lanes for a big-endian target, must be described using
11740 a mask selecting GCC high-lanes.
11742 Big-Endian Little-Endian
11744 GCC 0 1 2 3 3 2 1 0
11745 | x | x | x | x | | x | x | x | x |
11746 Architecture 3 2 1 0 3 2 1 0
11748 Low Mask: { 2, 3 } { 0, 1 }
11749 High Mask: { 0, 1 } { 2, 3 }
11753 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11755 int nunits = GET_MODE_NUNITS (mode);
11756 rtvec v = rtvec_alloc (nunits / 2);
11757 int high_base = nunits / 2;
11758 int low_base = 0;
11759 int base;
11760 rtx t1;
11761 int i;
11763 if (BYTES_BIG_ENDIAN)
11764 base = high ? low_base : high_base;
11765 else
11766 base = high ? high_base : low_base;
11768 for (i = 0; i < nunits / 2; i++)
11769 RTVEC_ELT (v, i) = GEN_INT (base + i);
11771 t1 = gen_rtx_PARALLEL (mode, v);
11772 return t1;
11775 /* Check OP for validity as a PARALLEL RTX vector with elements
11776 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11777 from the perspective of the architecture. See the diagram above
11778 aarch64_simd_vect_par_cnst_half for more details. */
11780 bool
11781 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11782 bool high)
11784 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11785 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11786 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11787 int i = 0;
11789 if (!VECTOR_MODE_P (mode))
11790 return false;
11792 if (count_op != count_ideal)
11793 return false;
11795 for (i = 0; i < count_ideal; i++)
11797 rtx elt_op = XVECEXP (op, 0, i);
11798 rtx elt_ideal = XVECEXP (ideal, 0, i);
11800 if (!CONST_INT_P (elt_op)
11801 || INTVAL (elt_ideal) != INTVAL (elt_op))
11802 return false;
11804 return true;
11807 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11808 HIGH (exclusive). */
11809 void
11810 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11811 const_tree exp)
11813 HOST_WIDE_INT lane;
11814 gcc_assert (CONST_INT_P (operand));
11815 lane = INTVAL (operand);
11817 if (lane < low || lane >= high)
11819 if (exp)
11820 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11821 else
11822 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11826 /* Return TRUE if OP is a valid vector addressing mode. */
11827 bool
11828 aarch64_simd_mem_operand_p (rtx op)
11830 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11831 || REG_P (XEXP (op, 0)));
11834 /* Emit a register copy from operand to operand, taking care not to
11835 early-clobber source registers in the process.
11837 COUNT is the number of components into which the copy needs to be
11838 decomposed. */
11839 void
11840 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11841 unsigned int count)
11843 unsigned int i;
11844 int rdest = REGNO (operands[0]);
11845 int rsrc = REGNO (operands[1]);
11847 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11848 || rdest < rsrc)
11849 for (i = 0; i < count; i++)
11850 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11851 gen_rtx_REG (mode, rsrc + i));
11852 else
11853 for (i = 0; i < count; i++)
11854 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11855 gen_rtx_REG (mode, rsrc + count - i - 1));
11858 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11859 one of the VSTRUCT modes: OI, CI, or XI. */
11861 aarch64_simd_attr_length_rglist (machine_mode mode)
11863 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11866 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11867 alignment of a vector to 128 bits. */
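/* So a 64-bit vector such as V2SI keeps its natural 64-bit alignment, while
   anything 128 bits or wider (including GNU vector types larger than 128
   bits) is given at most 128-bit alignment.  */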
11868 static HOST_WIDE_INT
11869 aarch64_simd_vector_alignment (const_tree type)
11871 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11872 return MIN (align, 128);
11875 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11876 static bool
11877 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11879 if (is_packed)
11880 return false;
11882 /* We guarantee alignment for vectors up to 128-bits. */
11883 if (tree_int_cst_compare (TYPE_SIZE (type),
11884 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11885 return false;
11887 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11888 return true;
11891 /* Return true if the vector misalignment factor is supported by the
11892 target. */
11893 static bool
11894 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11895 const_tree type, int misalignment,
11896 bool is_packed)
11898 if (TARGET_SIMD && STRICT_ALIGNMENT)
11900 /* Return false if the movmisalign pattern is not supported for this mode. */
11901 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11902 return false;
11904 if (misalignment == -1)
11906 /* Misalignment factor is unknown at compile time but we know
11907 it's word aligned. */
11908 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11910 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11912 if (element_size != 64)
11913 return true;
11915 return false;
11918 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11919 is_packed);
11922 /* If VALS is a vector constant that can be loaded into a register
11923 using DUP, generate instructions to do so and return an RTX to
11924 assign to the register. Otherwise return NULL_RTX. */
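/* For a vector whose elements are all the same constant C but which is not
   a valid MOVI/MVNI immediate, the caller falls back to this path: C is
   moved into a scalar register and a (vec_duplicate ...) of that register
   is returned, which the DUP patterns can match, avoiding a literal-pool
   load.  */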
11925 static rtx
11926 aarch64_simd_dup_constant (rtx vals)
11928 machine_mode mode = GET_MODE (vals);
11929 machine_mode inner_mode = GET_MODE_INNER (mode);
11930 rtx x;
11932 if (!const_vec_duplicate_p (vals, &x))
11933 return NULL_RTX;
11935 /* We can load this constant by using DUP and a constant in a
11936 single ARM register. This will be cheaper than a vector
11937 load. */
11938 x = copy_to_mode_reg (inner_mode, x);
11939 return gen_rtx_VEC_DUPLICATE (mode, x);
11943 /* Generate code to load VALS, which is a PARALLEL containing only
11944 constants (for vec_init) or CONST_VECTOR, efficiently into a
11945 register. Returns an RTX to copy into the register, or NULL_RTX
11946 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11947 static rtx
11948 aarch64_simd_make_constant (rtx vals)
11950 machine_mode mode = GET_MODE (vals);
11951 rtx const_dup;
11952 rtx const_vec = NULL_RTX;
11953 int n_elts = GET_MODE_NUNITS (mode);
11954 int n_const = 0;
11955 int i;
11957 if (GET_CODE (vals) == CONST_VECTOR)
11958 const_vec = vals;
11959 else if (GET_CODE (vals) == PARALLEL)
11961 /* A CONST_VECTOR must contain only CONST_INTs and
11962 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11963 Only store valid constants in a CONST_VECTOR. */
11964 for (i = 0; i < n_elts; ++i)
11966 rtx x = XVECEXP (vals, 0, i);
11967 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11968 n_const++;
11970 if (n_const == n_elts)
11971 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11973 else
11974 gcc_unreachable ();
11976 if (const_vec != NULL_RTX
11977 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11978 /* Load using MOVI/MVNI. */
11979 return const_vec;
11980 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11981 /* Loaded using DUP. */
11982 return const_dup;
11983 else if (const_vec != NULL_RTX)
11984 /* Load from constant pool. We cannot take advantage of single-cycle
11985 LD1 because we need a PC-relative addressing mode. */
11986 return const_vec;
11987 else
11988 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11989 We cannot construct an initializer. */
11990 return NULL_RTX;
11993 /* Expand a vector initialisation sequence, such that TARGET is
11994 initialised to contain VALS. */
11996 void
11997 aarch64_expand_vector_init (rtx target, rtx vals)
11999 machine_mode mode = GET_MODE (target);
12000 scalar_mode inner_mode = GET_MODE_INNER (mode);
12001 /* The number of vector elements. */
12002 int n_elts = GET_MODE_NUNITS (mode);
12003 /* The number of vector elements which are not constant. */
12004 int n_var = 0;
12005 rtx any_const = NULL_RTX;
12006 /* The first element of vals. */
12007 rtx v0 = XVECEXP (vals, 0, 0);
12008 bool all_same = true;
12010 /* Count the number of variable elements to initialise. */
12011 for (int i = 0; i < n_elts; ++i)
12013 rtx x = XVECEXP (vals, 0, i);
12014 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12015 ++n_var;
12016 else
12017 any_const = x;
12019 all_same &= rtx_equal_p (x, v0);
12022 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12023 how best to handle this. */
12024 if (n_var == 0)
12026 rtx constant = aarch64_simd_make_constant (vals);
12027 if (constant != NULL_RTX)
12029 emit_move_insn (target, constant);
12030 return;
12034 /* Splat a single non-constant element if we can. */
12035 if (all_same)
12037 rtx x = copy_to_mode_reg (inner_mode, v0);
12038 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12039 return;
12042 enum insn_code icode = optab_handler (vec_set_optab, mode);
12043 gcc_assert (icode != CODE_FOR_nothing);
12045 /* If there are only variable elements, try to optimize
12046 the insertion using dup for the most common element
12047 followed by insertions. */
12049 /* The algorithm will fill matches[*][0] with the earliest matching element,
12050 and matches[X][1] with the count of duplicate elements (if X is the
12051 earliest element which has duplicates). */
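/* For example, initialising a V4SI vector from registers { a, b, a, a }
   finds lane 0 as the most common value (three occurrences), so the code
   below emits one DUP of that value followed by a single lane insert of B,
   rather than four separate inserts.  */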
12053 if (n_var == n_elts && n_elts <= 16)
12055 int matches[16][2] = {0};
12056 for (int i = 0; i < n_elts; i++)
12058 for (int j = 0; j <= i; j++)
12060 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12062 matches[i][0] = j;
12063 matches[j][1]++;
12064 break;
12068 int maxelement = 0;
12069 int maxv = 0;
12070 for (int i = 0; i < n_elts; i++)
12071 if (matches[i][1] > maxv)
12073 maxelement = i;
12074 maxv = matches[i][1];
12077 /* Create a duplicate of the most common element. */
12078 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12079 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12081 /* Insert the rest. */
12082 for (int i = 0; i < n_elts; i++)
12084 rtx x = XVECEXP (vals, 0, i);
12085 if (matches[i][0] == maxelement)
12086 continue;
12087 x = copy_to_mode_reg (inner_mode, x);
12088 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12090 return;
12093 /* Initialise a vector which is part-variable. We want to first try
12094 to build those lanes which are constant in the most efficient way we
12095 can. */
12096 if (n_var != n_elts)
12098 rtx copy = copy_rtx (vals);
12100 /* Load constant part of vector. We really don't care what goes into the
12101 parts we will overwrite, but we're more likely to be able to load the
12102 constant efficiently if it has fewer, larger, repeating parts
12103 (see aarch64_simd_valid_immediate). */
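/* For example, for the V4SI initialiser { x, 1, 2, 3 } (where X is a
   register), the loop below turns the copy into { 2, 1, 2, 3 } by borrowing
   another already-constant lane (found by flipping index bits), and X is
   inserted into lane 0 afterwards.  */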
12104 for (int i = 0; i < n_elts; i++)
12106 rtx x = XVECEXP (vals, 0, i);
12107 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12108 continue;
12109 rtx subst = any_const;
12110 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12112 /* Look in the copied vector, as more elements are const. */
12113 rtx test = XVECEXP (copy, 0, i ^ bit);
12114 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12116 subst = test;
12117 break;
12120 XVECEXP (copy, 0, i) = subst;
12122 aarch64_expand_vector_init (target, copy);
12125 /* Insert the variable lanes directly. */
12126 for (int i = 0; i < n_elts; i++)
12128 rtx x = XVECEXP (vals, 0, i);
12129 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12130 continue;
12131 x = copy_to_mode_reg (inner_mode, x);
12132 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12136 static unsigned HOST_WIDE_INT
12137 aarch64_shift_truncation_mask (machine_mode mode)
12139 return
12140 (!SHIFT_COUNT_TRUNCATED
12141 || aarch64_vector_mode_supported_p (mode)
12142 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12145 /* Select a format to encode pointers in exception handling data. */
12147 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12149 int type;
12150 switch (aarch64_cmodel)
12152 case AARCH64_CMODEL_TINY:
12153 case AARCH64_CMODEL_TINY_PIC:
12154 case AARCH64_CMODEL_SMALL:
12155 case AARCH64_CMODEL_SMALL_PIC:
12156 case AARCH64_CMODEL_SMALL_SPIC:
12157 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12158 for everything. */
12159 type = DW_EH_PE_sdata4;
12160 break;
12161 default:
12162 /* No assumptions here. 8-byte relocs required. */
12163 type = DW_EH_PE_sdata8;
12164 break;
12166 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12169 /* The last .arch and .tune assembly strings that we printed. */
12170 static std::string aarch64_last_printed_arch_string;
12171 static std::string aarch64_last_printed_tune_string;
12173 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12174 by the function fndecl. */
12176 void
12177 aarch64_declare_function_name (FILE *stream, const char* name,
12178 tree fndecl)
12180 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12182 struct cl_target_option *targ_options;
12183 if (target_parts)
12184 targ_options = TREE_TARGET_OPTION (target_parts);
12185 else
12186 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12187 gcc_assert (targ_options);
12189 const struct processor *this_arch
12190 = aarch64_get_arch (targ_options->x_explicit_arch);
12192 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12193 std::string extension
12194 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12195 this_arch->flags);
12196 /* Only update the assembler .arch string if it is distinct from the last
12197 such string we printed. */
12198 std::string to_print = this_arch->name + extension;
12199 if (to_print != aarch64_last_printed_arch_string)
12201 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12202 aarch64_last_printed_arch_string = to_print;
12205 /* Print the cpu name we're tuning for in the comments; it might be
12206 useful to readers of the generated asm. Do it only when it changes
12207 from function to function and verbose assembly is requested. */
12208 const struct processor *this_tune
12209 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12211 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12213 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12214 this_tune->name);
12215 aarch64_last_printed_tune_string = this_tune->name;
12218 /* Don't forget the type directive for ELF. */
12219 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12220 ASM_OUTPUT_LABEL (stream, name);
12223 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12225 static void
12226 aarch64_start_file (void)
12228 struct cl_target_option *default_options
12229 = TREE_TARGET_OPTION (target_option_default_node);
12231 const struct processor *default_arch
12232 = aarch64_get_arch (default_options->x_explicit_arch);
12233 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12234 std::string extension
12235 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12236 default_arch->flags);
12238 aarch64_last_printed_arch_string = default_arch->name + extension;
12239 aarch64_last_printed_tune_string = "";
12240 asm_fprintf (asm_out_file, "\t.arch %s\n",
12241 aarch64_last_printed_arch_string.c_str ());
12243 default_file_start ();
12246 /* Emit load exclusive. */
12248 static void
12249 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12250 rtx mem, rtx model_rtx)
12252 rtx (*gen) (rtx, rtx, rtx);
12254 switch (mode)
12256 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12257 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12258 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12259 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12260 default:
12261 gcc_unreachable ();
12264 emit_insn (gen (rval, mem, model_rtx));
12267 /* Emit store exclusive. */
12269 static void
12270 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12271 rtx rval, rtx mem, rtx model_rtx)
12273 rtx (*gen) (rtx, rtx, rtx, rtx);
12275 switch (mode)
12277 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12278 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12279 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12280 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12281 default:
12282 gcc_unreachable ();
12285 emit_insn (gen (bval, rval, mem, model_rtx));
12288 /* Mark the previous jump instruction as unlikely. */
12290 static void
12291 aarch64_emit_unlikely_jump (rtx insn)
12293 rtx_insn *jump = emit_jump_insn (insn);
12294 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12297 /* Expand a compare and swap pattern. */
12299 void
12300 aarch64_expand_compare_and_swap (rtx operands[])
12302 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12303 machine_mode mode, cmp_mode;
12304 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12305 int idx;
12306 gen_cas_fn gen;
12307 const gen_cas_fn split_cas[] =
12309 gen_aarch64_compare_and_swapqi,
12310 gen_aarch64_compare_and_swaphi,
12311 gen_aarch64_compare_and_swapsi,
12312 gen_aarch64_compare_and_swapdi
12314 const gen_cas_fn atomic_cas[] =
12316 gen_aarch64_compare_and_swapqi_lse,
12317 gen_aarch64_compare_and_swaphi_lse,
12318 gen_aarch64_compare_and_swapsi_lse,
12319 gen_aarch64_compare_and_swapdi_lse
12322 bval = operands[0];
12323 rval = operands[1];
12324 mem = operands[2];
12325 oldval = operands[3];
12326 newval = operands[4];
12327 is_weak = operands[5];
12328 mod_s = operands[6];
12329 mod_f = operands[7];
12330 mode = GET_MODE (mem);
12331 cmp_mode = mode;
12333 /* Normally the succ memory model must be stronger than fail, but in the
12334 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12335 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12337 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12338 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12339 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12341 switch (mode)
12343 case E_QImode:
12344 case E_HImode:
12345 /* For short modes, we're going to perform the comparison in SImode,
12346 so do the zero-extension now. */
12347 cmp_mode = SImode;
12348 rval = gen_reg_rtx (SImode);
12349 oldval = convert_modes (SImode, mode, oldval, true);
12350 /* Fall through. */
12352 case E_SImode:
12353 case E_DImode:
12354 /* Force the value into a register if needed. */
12355 if (!aarch64_plus_operand (oldval, mode))
12356 oldval = force_reg (cmp_mode, oldval);
12357 break;
12359 default:
12360 gcc_unreachable ();
12363 switch (mode)
12365 case E_QImode: idx = 0; break;
12366 case E_HImode: idx = 1; break;
12367 case E_SImode: idx = 2; break;
12368 case E_DImode: idx = 3; break;
12369 default:
12370 gcc_unreachable ();
12372 if (TARGET_LSE)
12373 gen = atomic_cas[idx];
12374 else
12375 gen = split_cas[idx];
12377 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12379 if (mode == QImode || mode == HImode)
12380 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12382 x = gen_rtx_REG (CCmode, CC_REGNUM);
12383 x = gen_rtx_EQ (SImode, x, const0_rtx);
12384 emit_insn (gen_rtx_SET (bval, x));
12387 /* Test whether the target supports using an atomic load-operate instruction
12388    for operation CODE.  (Either the value before or the value after the
12389    update can be returned; see aarch64_gen_atomic_ldop, which emits the
12390    instruction.)  Returns FALSE if the operation isn't supported by the
12391    architecture.  */
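/* These map onto the LSE instructions emitted by aarch64_gen_atomic_ldop:
   SET uses SWP, PLUS uses LDADD, IOR uses LDSET and XOR uses LDEOR, while
   MINUS and AND are handled by negating or inverting the operand and then
   using LDADD or LDCLR respectively.  */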
12393 bool
12394 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12396 if (!TARGET_LSE)
12397 return false;
12399 switch (code)
12401 case SET:
12402 case AND:
12403 case IOR:
12404 case XOR:
12405 case MINUS:
12406 case PLUS:
12407 return true;
12408 default:
12409 return false;
12413 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12414 sequence implementing an atomic operation. */
12416 static void
12417 aarch64_emit_post_barrier (enum memmodel model)
12419 const enum memmodel base_model = memmodel_base (model);
12421 if (is_mm_sync (model)
12422 && (base_model == MEMMODEL_ACQUIRE
12423 || base_model == MEMMODEL_ACQ_REL
12424 || base_model == MEMMODEL_SEQ_CST))
12426 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12430 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12431 for the data in memory. EXPECTED is the value expected to be in memory.
12432 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12433 is the memory ordering to use. */
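/* Schematically, for the LSE case this emits
       mov   rval, expected
       cas*  rval, desired, [mem]   (variant chosen from the memory model)
       cmp   rval, expected
   so that the condition flags afterwards indicate whether the swap was
   made.  */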
12435 void
12436 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12437 rtx expected, rtx desired,
12438 rtx model)
12440 rtx (*gen) (rtx, rtx, rtx, rtx);
12441 machine_mode mode;
12443 mode = GET_MODE (mem);
12445 switch (mode)
12447 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12448 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12449 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12450 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12451 default:
12452 gcc_unreachable ();
12455 /* Move the expected value into the CAS destination register. */
12456 emit_insn (gen_rtx_SET (rval, expected));
12458 /* Emit the CAS. */
12459 emit_insn (gen (rval, mem, desired, model));
12461 /* Compare the expected value with the value loaded by the CAS, to establish
12462 whether the swap was made. */
12463 aarch64_gen_compare_reg (EQ, rval, expected);
12466 /* Split a compare and swap pattern. */
12468 void
12469 aarch64_split_compare_and_swap (rtx operands[])
12471 rtx rval, mem, oldval, newval, scratch;
12472 machine_mode mode;
12473 bool is_weak;
12474 rtx_code_label *label1, *label2;
12475 rtx x, cond;
12476 enum memmodel model;
12477 rtx model_rtx;
12479 rval = operands[0];
12480 mem = operands[1];
12481 oldval = operands[2];
12482 newval = operands[3];
12483 is_weak = (operands[4] != const0_rtx);
12484 model_rtx = operands[5];
12485 scratch = operands[7];
12486 mode = GET_MODE (mem);
12487 model = memmodel_from_int (INTVAL (model_rtx));
12489 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12490 loop:
12491 .label1:
12492 LD[A]XR rval, [mem]
12493 CBNZ rval, .label2
12494 ST[L]XR scratch, newval, [mem]
12495 CBNZ scratch, .label1
12496 .label2:
12497 CMP rval, 0. */
12498 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12500 label1 = NULL;
12501 if (!is_weak)
12503 label1 = gen_label_rtx ();
12504 emit_label (label1);
12506 label2 = gen_label_rtx ();
12508 /* The initial load can be relaxed for a __sync operation since a final
12509 barrier will be emitted to stop code hoisting. */
12510 if (is_mm_sync (model))
12511 aarch64_emit_load_exclusive (mode, rval, mem,
12512 GEN_INT (MEMMODEL_RELAXED));
12513 else
12514 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12516 if (strong_zero_p)
12518 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12519 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12520 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12521 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12523 else
12525 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12526 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12527 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12528 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12529 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12532 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12534 if (!is_weak)
12536 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12537 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12538 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12539 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12541 else
12543 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12544 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12545 emit_insn (gen_rtx_SET (cond, x));
12548 emit_label (label2);
12549 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12550 to set the condition flags. If this is not used it will be removed by
12551 later passes. */
12552 if (strong_zero_p)
12554 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12555 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12556 emit_insn (gen_rtx_SET (cond, x));
12558 /* Emit any final barrier needed for a __sync operation. */
12559 if (is_mm_sync (model))
12560 aarch64_emit_post_barrier (model);
12563 /* Emit a BIC instruction. */
12565 static void
12566 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12568 rtx shift_rtx = GEN_INT (shift);
12569 rtx (*gen) (rtx, rtx, rtx, rtx);
12571 switch (mode)
12573 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12574 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12575 default:
12576 gcc_unreachable ();
12579 emit_insn (gen (dst, s2, shift_rtx, s1));
12582 /* Emit an atomic swap. */
12584 static void
12585 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12586 rtx mem, rtx model)
12588 rtx (*gen) (rtx, rtx, rtx, rtx);
12590 switch (mode)
12592 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12593 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12594 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12595 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12596 default:
12597 gcc_unreachable ();
12600 emit_insn (gen (dst, mem, value, model));
12603 /* Operations supported by aarch64_emit_atomic_load_op. */
12605 enum aarch64_atomic_load_op_code
12607 AARCH64_LDOP_PLUS, /* A + B */
12608 AARCH64_LDOP_XOR, /* A ^ B */
12609 AARCH64_LDOP_OR, /* A | B */
12610 AARCH64_LDOP_BIC /* A & ~B */
12613 /* Emit an atomic load-operate. */
12615 static void
12616 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12617 machine_mode mode, rtx dst, rtx src,
12618 rtx mem, rtx model)
12620 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12621 const aarch64_atomic_load_op_fn plus[] =
12623 gen_aarch64_atomic_loadaddqi,
12624 gen_aarch64_atomic_loadaddhi,
12625 gen_aarch64_atomic_loadaddsi,
12626 gen_aarch64_atomic_loadadddi
12628 const aarch64_atomic_load_op_fn eor[] =
12630 gen_aarch64_atomic_loadeorqi,
12631 gen_aarch64_atomic_loadeorhi,
12632 gen_aarch64_atomic_loadeorsi,
12633 gen_aarch64_atomic_loadeordi
12635 const aarch64_atomic_load_op_fn ior[] =
12637 gen_aarch64_atomic_loadsetqi,
12638 gen_aarch64_atomic_loadsethi,
12639 gen_aarch64_atomic_loadsetsi,
12640 gen_aarch64_atomic_loadsetdi
12642 const aarch64_atomic_load_op_fn bic[] =
12644 gen_aarch64_atomic_loadclrqi,
12645 gen_aarch64_atomic_loadclrhi,
12646 gen_aarch64_atomic_loadclrsi,
12647 gen_aarch64_atomic_loadclrdi
12649 aarch64_atomic_load_op_fn gen;
12650 int idx = 0;
12652 switch (mode)
12654 case E_QImode: idx = 0; break;
12655 case E_HImode: idx = 1; break;
12656 case E_SImode: idx = 2; break;
12657 case E_DImode: idx = 3; break;
12658 default:
12659 gcc_unreachable ();
12662 switch (code)
12664 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12665 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12666 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12667 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12668 default:
12669 gcc_unreachable ();
12672 emit_insn (gen (dst, mem, src, model));
12675 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12676 location to store the data read from memory. OUT_RESULT is the location to
12677 store the result of the operation. MEM is the memory location to read and
12678 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12679 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12680 be NULL. */
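/* Note that QImode and HImode operands are widened to SImode for the
   negation/inversion steps below, and that when OUT_RESULT is requested the
   post-update value is recomputed from OUT_DATA and the source register
   rather than re-read from memory.  */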
12682 void
12683 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12684 rtx mem, rtx value, rtx model_rtx)
12686 machine_mode mode = GET_MODE (mem);
12687 machine_mode wmode = (mode == DImode ? DImode : SImode);
12688 const bool short_mode = (mode < SImode);
12689 aarch64_atomic_load_op_code ldop_code;
12690 rtx src;
12691 rtx x;
12693 if (out_data)
12694 out_data = gen_lowpart (mode, out_data);
12696 if (out_result)
12697 out_result = gen_lowpart (mode, out_result);
12699 /* Make sure the value is in a register, putting it into a destination
12700 register if it needs to be manipulated. */
12701 if (!register_operand (value, mode)
12702 || code == AND || code == MINUS)
12704 src = out_result ? out_result : out_data;
12705 emit_move_insn (src, gen_lowpart (mode, value));
12707 else
12708 src = value;
12709 gcc_assert (register_operand (src, mode));
12711 /* Preprocess the data for the operation as necessary. If the operation is
12712 a SET then emit a swap instruction and finish. */
12713 switch (code)
12715 case SET:
12716 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12717 return;
12719 case MINUS:
12720 /* Negate the value and treat it as a PLUS. */
12722 rtx neg_src;
12724 /* Resize the value if necessary. */
12725 if (short_mode)
12726 src = gen_lowpart (wmode, src);
12728 neg_src = gen_rtx_NEG (wmode, src);
12729 emit_insn (gen_rtx_SET (src, neg_src));
12731 if (short_mode)
12732 src = gen_lowpart (mode, src);
12734 /* Fall-through. */
12735 case PLUS:
12736 ldop_code = AARCH64_LDOP_PLUS;
12737 break;
12739 case IOR:
12740 ldop_code = AARCH64_LDOP_OR;
12741 break;
12743 case XOR:
12744 ldop_code = AARCH64_LDOP_XOR;
12745 break;
12747 case AND:
12749 rtx not_src;
12751 /* Resize the value if necessary. */
12752 if (short_mode)
12753 src = gen_lowpart (wmode, src);
12755 not_src = gen_rtx_NOT (wmode, src);
12756 emit_insn (gen_rtx_SET (src, not_src));
12758 if (short_mode)
12759 src = gen_lowpart (mode, src);
12761 ldop_code = AARCH64_LDOP_BIC;
12762 break;
12764 default:
12765 /* The operation can't be done with atomic instructions. */
12766 gcc_unreachable ();
12769 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12771 /* If necessary, calculate the data in memory after the update by redoing the
12772 operation from values in registers. */
12773 if (!out_result)
12774 return;
12776 if (short_mode)
12778 src = gen_lowpart (wmode, src);
12779 out_data = gen_lowpart (wmode, out_data);
12780 out_result = gen_lowpart (wmode, out_result);
12783 x = NULL_RTX;
12785 switch (code)
12787 case MINUS:
12788 case PLUS:
12789 x = gen_rtx_PLUS (wmode, out_data, src);
12790 break;
12791 case IOR:
12792 x = gen_rtx_IOR (wmode, out_data, src);
12793 break;
12794 case XOR:
12795 x = gen_rtx_XOR (wmode, out_data, src);
12796 break;
12797 case AND:
12798 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12799 return;
12800 default:
12801 gcc_unreachable ();
12804 emit_set_insn (out_result, x);
12806 return;
12809 /* Split an atomic operation. */
12811 void
12812 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12813 rtx value, rtx model_rtx, rtx cond)
12815 machine_mode mode = GET_MODE (mem);
12816 machine_mode wmode = (mode == DImode ? DImode : SImode);
12817 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12818 const bool is_sync = is_mm_sync (model);
12819 rtx_code_label *label;
12820 rtx x;
12822 /* Split the atomic operation into a sequence. */
12823 label = gen_label_rtx ();
12824 emit_label (label);
12826 if (new_out)
12827 new_out = gen_lowpart (wmode, new_out);
12828 if (old_out)
12829 old_out = gen_lowpart (wmode, old_out);
12830 else
12831 old_out = new_out;
12832 value = simplify_gen_subreg (wmode, value, mode, 0);
12834 /* The initial load can be relaxed for a __sync operation since a final
12835 barrier will be emitted to stop code hoisting. */
12836 if (is_sync)
12837 aarch64_emit_load_exclusive (mode, old_out, mem,
12838 GEN_INT (MEMMODEL_RELAXED));
12839 else
12840 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12842 switch (code)
12844 case SET:
12845 new_out = value;
12846 break;
12848 case NOT:
12849 x = gen_rtx_AND (wmode, old_out, value);
12850 emit_insn (gen_rtx_SET (new_out, x));
12851 x = gen_rtx_NOT (wmode, new_out);
12852 emit_insn (gen_rtx_SET (new_out, x));
12853 break;
12855 case MINUS:
12856 if (CONST_INT_P (value))
12858 value = GEN_INT (-INTVAL (value));
12859 code = PLUS;
12861 /* Fall through. */
12863 default:
12864 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12865 emit_insn (gen_rtx_SET (new_out, x));
12866 break;
12869 aarch64_emit_store_exclusive (mode, cond, mem,
12870 gen_lowpart (mode, new_out), model_rtx);
12872 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12873 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12874 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12875 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12877 /* Emit any final barrier needed for a __sync operation. */
12878 if (is_sync)
12879 aarch64_emit_post_barrier (model);
12882 static void
12883 aarch64_init_libfuncs (void)
12885 /* Half-precision float operations. The compiler handles all operations
12886 with NULL libfuncs by converting to SFmode. */
12888 /* Conversions. */
12889 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12890 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12892 /* Arithmetic. */
12893 set_optab_libfunc (add_optab, HFmode, NULL);
12894 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12895 set_optab_libfunc (smul_optab, HFmode, NULL);
12896 set_optab_libfunc (neg_optab, HFmode, NULL);
12897 set_optab_libfunc (sub_optab, HFmode, NULL);
12899 /* Comparisons. */
12900 set_optab_libfunc (eq_optab, HFmode, NULL);
12901 set_optab_libfunc (ne_optab, HFmode, NULL);
12902 set_optab_libfunc (lt_optab, HFmode, NULL);
12903 set_optab_libfunc (le_optab, HFmode, NULL);
12904 set_optab_libfunc (ge_optab, HFmode, NULL);
12905 set_optab_libfunc (gt_optab, HFmode, NULL);
12906 set_optab_libfunc (unord_optab, HFmode, NULL);
12909 /* Target hook for c_mode_for_suffix. */
12910 static machine_mode
12911 aarch64_c_mode_for_suffix (char suffix)
12913 if (suffix == 'q')
12914 return TFmode;
12916 return VOIDmode;
12919 /* We can only represent floating point constants which will fit in
12920 "quarter-precision" values. These values are characterised by
12921 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by
12924 (-1)^s * (n/16) * 2^r
12926 Where:
12927 's' is the sign bit.
12928 'n' is an integer in the range 16 <= n <= 31.
12929 'r' is an integer in the range -3 <= r <= 4. */
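/* For example, 0.25 = (16/16) * 2^-2 and 31.0 = (31/16) * 2^4 are
   representable, while 0.1 and 512.0 are not: the largest representable
   magnitude is (31/16) * 2^4 = 31.0 and the smallest non-zero magnitude is
   (16/16) * 2^-3 = 0.125.  */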
12931 /* Return true iff X can be represented by a quarter-precision
12932 floating point immediate operand. Note, we cannot represent 0.0. */
12933 bool
12934 aarch64_float_const_representable_p (rtx x)
12936 /* This represents our current view of how many bits
12937 make up the mantissa. */
12938 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12939 int exponent;
12940 unsigned HOST_WIDE_INT mantissa, mask;
12941 REAL_VALUE_TYPE r, m;
12942 bool fail;
12944 if (!CONST_DOUBLE_P (x))
12945 return false;
12947 /* We don't support HFmode constants yet. */
12948 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12949 return false;
12951 r = *CONST_DOUBLE_REAL_VALUE (x);
12953 /* We cannot represent infinities, NaNs or +/-zero. We won't
12954 know if we have +zero until we analyse the mantissa, but we
12955 can reject the other invalid values. */
12956 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12957 || REAL_VALUE_MINUS_ZERO (r))
12958 return false;
12960 /* Extract exponent. */
12961 r = real_value_abs (&r);
12962 exponent = REAL_EXP (&r);
12964 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12965 highest (sign) bit, with a fixed binary point at bit point_pos.
12966 m1 holds the low part of the mantissa, m2 the high part.
12967 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12968 bits for the mantissa, this can fail (low bits will be lost). */
12969 real_ldexp (&m, &r, point_pos - exponent);
12970 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12972 /* If the low part of the mantissa has bits set we cannot represent
12973 the value. */
12974 if (w.ulow () != 0)
12975 return false;
12976 /* We have rejected the lower HOST_WIDE_INT, so update our
12977 understanding of how many bits lie in the mantissa and
12978 look only at the high HOST_WIDE_INT. */
12979 mantissa = w.elt (1);
12980 point_pos -= HOST_BITS_PER_WIDE_INT;
12982 /* We can only represent values with a mantissa of the form 1.xxxx. */
12983 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12984 if ((mantissa & mask) != 0)
12985 return false;
12987 /* Having filtered unrepresentable values, we may now remove all
12988 but the highest 5 bits. */
12989 mantissa >>= point_pos - 5;
12991 /* We cannot represent the value 0.0, so reject it. This is handled
12992 elsewhere. */
12993 if (mantissa == 0)
12994 return false;
12996 /* Then, as bit 4 is always set, we can mask it off, leaving
12997 the mantissa in the range [0, 15]. */
12998 mantissa &= ~(1 << 4);
12999 gcc_assert (mantissa <= 15);
13001 /* GCC internally does not use IEEE754-like encoding (where normalized
13002 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
13003 Our mantissa values are shifted 4 places to the left relative to
13004 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
13005 by 5 places to correct for GCC's representation. */
13006 exponent = 5 - exponent;
13008 return (exponent >= 0 && exponent <= 7);
13011 char*
13012 aarch64_output_simd_mov_immediate (rtx const_vector,
13013 machine_mode mode,
13014 unsigned width)
13016 bool is_valid;
13017 static char templ[40];
13018 const char *mnemonic;
13019 const char *shift_op;
13020 unsigned int lane_count = 0;
13021 char element_char;
13023 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13025 /* This will return true to show CONST_VECTOR is legal for use as either
13026 an AdvSIMD MOVI immediate or, implicitly, an MVNI immediate. It will
13027 also update INFO to show how the immediate should be generated. */
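/* The returned template looks like, for example, "movi\t%d0, 0x3f" for a
   single-lane value, "mvni\t%0.4s, 0xfe, lsl 8" for a shifted integer
   immediate, or an "fmov" with a decimal constant for a floating-point
   duplicate.  */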
13028 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13029 gcc_assert (is_valid);
13031 element_char = sizetochar (info.element_width);
13032 lane_count = width / info.element_width;
13034 mode = GET_MODE_INNER (mode);
13035 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13037 gcc_assert (info.shift == 0 && ! info.mvn);
13038 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13039 move immediate path. */
13040 if (aarch64_float_const_zero_rtx_p (info.value))
13041 info.value = GEN_INT (0);
13042 else
13044 const unsigned int buf_size = 20;
13045 char float_buf[buf_size] = {'\0'};
13046 real_to_decimal_for_mode (float_buf,
13047 CONST_DOUBLE_REAL_VALUE (info.value),
13048 buf_size, buf_size, 1, mode);
13050 if (lane_count == 1)
13051 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13052 else
13053 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13054 lane_count, element_char, float_buf);
13055 return templ;
13059 mnemonic = info.mvn ? "mvni" : "movi";
13060 shift_op = info.msl ? "msl" : "lsl";
13062 gcc_assert (CONST_INT_P (info.value));
13063 if (lane_count == 1)
13064 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13065 mnemonic, UINTVAL (info.value));
13066 else if (info.shift)
13067 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13068 ", %s %d", mnemonic, lane_count, element_char,
13069 UINTVAL (info.value), shift_op, info.shift);
13070 else
13071 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13072 mnemonic, lane_count, element_char, UINTVAL (info.value));
13073 return templ;
13076 char*
13077 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13080 /* If a floating point number was passed and we desire to use it in an
13081 integer mode, do the conversion to an integer. */
13082 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13084 unsigned HOST_WIDE_INT ival;
13085 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13086 gcc_unreachable ();
13087 immediate = gen_int_mode (ival, mode);
13090 machine_mode vmode;
13091 /* Use a 64-bit container for everything except DImode/DFmode, where we use
13092 a 128-bit vector mode. */
13093 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13095 vmode = aarch64_simd_container_mode (mode, width);
13096 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13097 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13100 /* Split operands into moves from op[1] + op[2] into op[0]. */
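/* The combined destination occupies two consecutive vector registers; the
   only awkward case is when the two sources currently sit in each other's
   destination halves, which is handled below with a three-EOR register
   swap.  */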
13102 void
13103 aarch64_split_combinev16qi (rtx operands[3])
13105 unsigned int dest = REGNO (operands[0]);
13106 unsigned int src1 = REGNO (operands[1]);
13107 unsigned int src2 = REGNO (operands[2]);
13108 machine_mode halfmode = GET_MODE (operands[1]);
13109 unsigned int halfregs = REG_NREGS (operands[1]);
13110 rtx destlo, desthi;
13112 gcc_assert (halfmode == V16QImode);
13114 if (src1 == dest && src2 == dest + halfregs)
13116 /* No-op move. Can't split to nothing; emit something. */
13117 emit_note (NOTE_INSN_DELETED);
13118 return;
13121 /* Preserve register attributes for variable tracking. */
13122 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13123 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13124 GET_MODE_SIZE (halfmode));
13126 /* Special case of reversed high/low parts. */
13127 if (reg_overlap_mentioned_p (operands[2], destlo)
13128 && reg_overlap_mentioned_p (operands[1], desthi))
13130 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13131 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13132 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13134 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13136 /* Try to avoid unnecessary moves if part of the result
13137 is in the right place already. */
13138 if (src1 != dest)
13139 emit_move_insn (destlo, operands[1]);
13140 if (src2 != dest + halfregs)
13141 emit_move_insn (desthi, operands[2]);
13143 else
13145 if (src2 != dest + halfregs)
13146 emit_move_insn (desthi, operands[2]);
13147 if (src1 != dest)
13148 emit_move_insn (destlo, operands[1]);
13152 /* vec_perm support. */
13154 #define MAX_VECT_LEN 16
13156 struct expand_vec_perm_d
13158 rtx target, op0, op1;
13159 unsigned char perm[MAX_VECT_LEN];
13160 machine_mode vmode;
13161 unsigned char nelt;
13162 bool one_vector_p;
13163 bool testing_p;
13166 /* Generate a variable permutation. */
13168 static void
13169 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13171 machine_mode vmode = GET_MODE (target);
13172 bool one_vector_p = rtx_equal_p (op0, op1);
13174 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13175 gcc_checking_assert (GET_MODE (op0) == vmode);
13176 gcc_checking_assert (GET_MODE (op1) == vmode);
13177 gcc_checking_assert (GET_MODE (sel) == vmode);
13178 gcc_checking_assert (TARGET_SIMD);
13180 if (one_vector_p)
13182 if (vmode == V8QImode)
13184 /* Expand the argument to a V16QI mode by duplicating it. */
13185 rtx pair = gen_reg_rtx (V16QImode);
13186 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13187 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13189 else
13191 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13194 else
13196 rtx pair;
13198 if (vmode == V8QImode)
13200 pair = gen_reg_rtx (V16QImode);
13201 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13202 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13204 else
13206 pair = gen_reg_rtx (OImode);
13207 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13208 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13213 void
13214 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13216 machine_mode vmode = GET_MODE (target);
13217 unsigned int nelt = GET_MODE_NUNITS (vmode);
13218 bool one_vector_p = rtx_equal_p (op0, op1);
13219 rtx mask;
13221 /* The TBL instruction does not use a modulo index, so we must take care
13222 of that ourselves. */
13223 mask = aarch64_simd_gen_const_vector_dup (vmode,
13224 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13225 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13227 /* For big-endian, we also need to reverse the index within the vector
13228 (but not which vector). */
13229 if (BYTES_BIG_ENDIAN)
13231 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13232 if (!one_vector_p)
13233 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13234 sel = expand_simple_binop (vmode, XOR, sel, mask,
13235 NULL, 0, OPTAB_LIB_WIDEN);
13237 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13240 /* Recognize patterns suitable for the TRN instructions. */
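/* For example, with two V4SI operands the permutation { 0, 4, 2, 6 }
   matches TRN1 and { 1, 5, 3, 7 } matches TRN2 (indices as GCC sees them on
   a little-endian target).  */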
13241 static bool
13242 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13244 unsigned int i, odd, mask, nelt = d->nelt;
13245 rtx out, in0, in1, x;
13246 rtx (*gen) (rtx, rtx, rtx);
13247 machine_mode vmode = d->vmode;
13249 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13250 return false;
13252 /* Note that these are little-endian tests.
13253 We correct for big-endian later. */
13254 if (d->perm[0] == 0)
13255 odd = 0;
13256 else if (d->perm[0] == 1)
13257 odd = 1;
13258 else
13259 return false;
13260 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13262 for (i = 0; i < nelt; i += 2)
13264 if (d->perm[i] != i + odd)
13265 return false;
13266 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13267 return false;
13270 /* Success! */
13271 if (d->testing_p)
13272 return true;
13274 in0 = d->op0;
13275 in1 = d->op1;
13276 if (BYTES_BIG_ENDIAN)
13278 x = in0, in0 = in1, in1 = x;
13279 odd = !odd;
13281 out = d->target;
13283 if (odd)
13285 switch (vmode)
13287 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13288 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13289 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13290 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13291 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13292 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13293 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13294 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13295 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13296 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13297 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13298 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13299 default:
13300 return false;
13303 else
13305 switch (vmode)
13307 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13308 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13309 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13310 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13311 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13312 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13313 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13314 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13315 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13316 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13317 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13318 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13319 default:
13320 return false;
13324 emit_insn (gen (out, in0, in1));
13325 return true;
13328 /* Recognize patterns suitable for the UZP instructions. */
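/* For example, with two V4SI operands the permutation { 0, 2, 4, 6 }
   matches UZP1 and { 1, 3, 5, 7 } matches UZP2 (little-endian lane
   numbering).  */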
13329 static bool
13330 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13332 unsigned int i, odd, mask, nelt = d->nelt;
13333 rtx out, in0, in1, x;
13334 rtx (*gen) (rtx, rtx, rtx);
13335 machine_mode vmode = d->vmode;
13337 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13338 return false;
13340 /* Note that these are little-endian tests.
13341 We correct for big-endian later. */
13342 if (d->perm[0] == 0)
13343 odd = 0;
13344 else if (d->perm[0] == 1)
13345 odd = 1;
13346 else
13347 return false;
13348 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13350 for (i = 0; i < nelt; i++)
13352 unsigned elt = (i * 2 + odd) & mask;
13353 if (d->perm[i] != elt)
13354 return false;
13357 /* Success! */
13358 if (d->testing_p)
13359 return true;
13361 in0 = d->op0;
13362 in1 = d->op1;
13363 if (BYTES_BIG_ENDIAN)
13365 x = in0, in0 = in1, in1 = x;
13366 odd = !odd;
13368 out = d->target;
13370 if (odd)
13372 switch (vmode)
13374 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13375 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13376 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13377 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13378 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13379 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13380 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13381 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13382 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13383 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13384 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13385 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13386 default:
13387 return false;
13390 else
13392 switch (vmode)
13394 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13395 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13396 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13397 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13398 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13399 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13400 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13401 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13402 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13403 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13404 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13405 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13406 default:
13407 return false;
13411 emit_insn (gen (out, in0, in1));
13412 return true;
13415 /* Recognize patterns suitable for the ZIP instructions. */
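/* For example, with two V4SI operands the permutation { 0, 4, 1, 5 }
   matches ZIP1 and { 2, 6, 3, 7 } matches ZIP2 (little-endian lane
   numbering).  */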
13416 static bool
13417 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13419 unsigned int i, high, mask, nelt = d->nelt;
13420 rtx out, in0, in1, x;
13421 rtx (*gen) (rtx, rtx, rtx);
13422 machine_mode vmode = d->vmode;
13424 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13425 return false;
13427 /* Note that these are little-endian tests.
13428 We correct for big-endian later. */
13429 high = nelt / 2;
13430 if (d->perm[0] == high)
13431 /* Do Nothing. */
13433 else if (d->perm[0] == 0)
13434 high = 0;
13435 else
13436 return false;
13437 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13439 for (i = 0; i < nelt / 2; i++)
13441 unsigned elt = (i + high) & mask;
13442 if (d->perm[i * 2] != elt)
13443 return false;
13444 elt = (elt + nelt) & mask;
13445 if (d->perm[i * 2 + 1] != elt)
13446 return false;
13449 /* Success! */
13450 if (d->testing_p)
13451 return true;
13453 in0 = d->op0;
13454 in1 = d->op1;
13455 if (BYTES_BIG_ENDIAN)
13457 x = in0, in0 = in1, in1 = x;
13458 high = !high;
13460 out = d->target;
13462 if (high)
13464 switch (vmode)
13466 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13467 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13468 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13469 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13470 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13471 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13472 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13473 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13474 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13475 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13476 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13477 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13478 default:
13479 return false;
13482 else
13484 switch (vmode)
13486 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13487 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13488 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13489 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13490 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13491 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13492 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13493 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13494 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13495 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13496 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13497 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13498 default:
13499 return false;
13503 emit_insn (gen (out, in0, in1));
13504 return true;
13507 /* Recognize patterns for the EXT insn. */
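/* For example, with two V4SI operands the permutation { 1, 2, 3, 4 } is a
   run of consecutive indices starting at lane 1 and is emitted as an EXT
   with that lane offset.  */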
13509 static bool
13510 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13512 unsigned int i, nelt = d->nelt;
13513 rtx (*gen) (rtx, rtx, rtx, rtx);
13514 rtx offset;
13516 unsigned int location = d->perm[0]; /* Always < nelt. */
13518 /* Check if the extracted indices are increasing by one. */
13519 for (i = 1; i < nelt; i++)
13521 unsigned int required = location + i;
13522 if (d->one_vector_p)
13524 /* We'll pass the same vector in twice, so allow indices to wrap. */
13525 required &= (nelt - 1);
13527 if (d->perm[i] != required)
13528 return false;
13531 switch (d->vmode)
13533 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13534 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13535 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13536 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13537 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13538 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13539 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13540 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13541 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13542 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13543 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13544 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13545 default:
13546 return false;
13549 /* Success! */
13550 if (d->testing_p)
13551 return true;
13553 /* The case where (location == 0) is a no-op for both big- and little-endian,
13554 and is removed by the mid-end at optimization levels -O1 and higher. */
13556 if (BYTES_BIG_ENDIAN && (location != 0))
13558 /* After setup, we want the high elements of the first vector (stored
13559 at the LSB end of the register), and the low elements of the second
13560 vector (stored at the MSB end of the register). So swap. */
13561 std::swap (d->op0, d->op1);
13562 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13563 location = nelt - location;
13566 offset = GEN_INT (location);
13567 emit_insn (gen (d->target, d->op0, d->op1, offset));
13568 return true;
13571 /* Recognize patterns for the REV insns. */
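/* For example, on a single V4SI operand the permutation { 1, 0, 3, 2 }
   (diff == 1) matches REV64, while on V16QI { 3, 2, 1, 0, 7, 6, 5, 4, ... }
   (diff == 3) matches REV32.  */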
13573 static bool
13574 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13576 unsigned int i, j, diff, nelt = d->nelt;
13577 rtx (*gen) (rtx, rtx);
13579 if (!d->one_vector_p)
13580 return false;
13582 diff = d->perm[0];
13583 switch (diff)
13585 case 7:
13586 switch (d->vmode)
13588 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13589 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13590 default:
13591 return false;
13593 break;
13594 case 3:
13595 switch (d->vmode)
13597 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13598 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13599 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13600 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13601 default:
13602 return false;
13604 break;
13605 case 1:
13606 switch (d->vmode)
13608 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13609 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13610 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13611 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13612 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13613 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13614 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13615 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13616 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13617 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13618 default:
13619 return false;
13621 break;
13622 default:
13623 return false;
13626 for (i = 0; i < nelt ; i += diff + 1)
13627 for (j = 0; j <= diff; j += 1)
13629 /* This is guaranteed to be true as the value of diff
13630 is 7, 3, or 1 and we should have enough elements in the
13631 queue to generate this. Getting a vector mask with a
13632 value of diff other than these values implies that
13633 something is wrong by the time we get here. */
13634 gcc_assert (i + j < nelt);
13635 if (d->perm[i + j] != i + diff - j)
13636 return false;
13639 /* Success! */
13640 if (d->testing_p)
13641 return true;
13643 emit_insn (gen (d->target, d->op0));
13644 return true;
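/* Recognize a broadcast of a single lane: every permute index is the same,
   which maps onto a DUP from that lane.  */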
13647 static bool
13648 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13650 rtx (*gen) (rtx, rtx, rtx);
13651 rtx out = d->target;
13652 rtx in0;
13653 machine_mode vmode = d->vmode;
13654 unsigned int i, elt, nelt = d->nelt;
13655 rtx lane;
13657 elt = d->perm[0];
13658 for (i = 1; i < nelt; i++)
13660 if (elt != d->perm[i])
13661 return false;
13664 /* The generic preparation in aarch64_expand_vec_perm_const_1
13665 swaps the operand order and the permute indices if it finds
13666 d->perm[0] to be in the second operand. Thus, we can always
13667 use d->op0 and need not do any extra arithmetic to get the
13668 correct lane number. */
13669 in0 = d->op0;
13670 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13672 switch (vmode)
13674 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13675 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13676 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13677 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13678 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13679 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13680 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13681 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13682 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13683 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13684 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13685 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13686 default:
13687 return false;
13690 emit_insn (gen (out, in0, lane));
13691 return true;
13694 static bool
13695 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13697 rtx rperm[MAX_VECT_LEN], sel;
13698 machine_mode vmode = d->vmode;
13699 unsigned int i, nelt = d->nelt;
13701 if (d->testing_p)
13702 return true;
13704 /* Generic code will try constant permutation twice. Once with the
13705 original mode and again with the elements lowered to QImode.
13706 So wait and don't do the selector expansion ourselves. */
13707 if (vmode != V8QImode && vmode != V16QImode)
13708 return false;
13710 for (i = 0; i < nelt; ++i)
13712 int nunits = GET_MODE_NUNITS (vmode);
13714 /* If big-endian and two vectors we end up with a weird mixed-endian
13715 mode on NEON. Reverse the index within each word but not the word
13716 itself. */
13717 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13718 : d->perm[i]);
13720 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13721 sel = force_reg (vmode, sel);
13723 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13724 return true;
13727 static bool
13728 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13730 /* The pattern matching functions above are written to look for a small
13731 number to begin the sequence (0, 1, N/2). If we begin with an index
13732 from the second operand, we can swap the operands. */
13733 if (d->perm[0] >= d->nelt)
13735 unsigned i, nelt = d->nelt;
13737 gcc_assert (nelt == (nelt & -nelt));
13738 for (i = 0; i < nelt; ++i)
13739 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13741 std::swap (d->op0, d->op1);
13744 if (TARGET_SIMD)
13746 if (aarch64_evpc_rev (d))
13747 return true;
13748 else if (aarch64_evpc_ext (d))
13749 return true;
13750 else if (aarch64_evpc_dup (d))
13751 return true;
13752 else if (aarch64_evpc_zip (d))
13753 return true;
13754 else if (aarch64_evpc_uzp (d))
13755 return true;
13756 else if (aarch64_evpc_trn (d))
13757 return true;
13758 return aarch64_evpc_tbl (d);
13760 return false;
13763 /* Expand a vec_perm_const pattern. */
13765 bool
13766 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13768 struct expand_vec_perm_d d;
13769 int i, nelt, which;
13771 d.target = target;
13772 d.op0 = op0;
13773 d.op1 = op1;
13775 d.vmode = GET_MODE (target);
13776 gcc_assert (VECTOR_MODE_P (d.vmode));
13777 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13778 d.testing_p = false;
13780 for (i = which = 0; i < nelt; ++i)
13782 rtx e = XVECEXP (sel, 0, i);
13783 int ei = INTVAL (e) & (2 * nelt - 1);
13784 which |= (ei < nelt ? 1 : 2);
13785 d.perm[i] = ei;
13788 switch (which)
13790 default:
13791 gcc_unreachable ();
13793 case 3:
13794 d.one_vector_p = false;
13795 if (!rtx_equal_p (op0, op1))
13796 break;
13798 /* The elements of PERM do not suggest that only the first operand
13799 is used, but both operands are identical. Allow easier matching
13800 of the permutation by folding the permutation into the single
13801 input vector. */
13802 /* Fall Through. */
13803 case 2:
13804 for (i = 0; i < nelt; ++i)
13805 d.perm[i] &= nelt - 1;
13806 d.op0 = op1;
13807 d.one_vector_p = true;
13808 break;
13810 case 1:
13811 d.op1 = op0;
13812 d.one_vector_p = true;
13813 break;
13816 return aarch64_expand_vec_perm_const_1 (&d);
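/* Illustrative sketch (not part of the original source): how the WHICH
   bitmask above classifies a selector.  Each index is first wrapped into
   [0, 2*nelt) (nelt must be a power of two, as asserted earlier), then
   bit 0 is set when the index picks from the first operand and bit 1 when
   it picks from the second, so 1/2/3 mean "op0 only"/"op1 only"/"both".
   classify_selector is a hypothetical name.  */
static unsigned
classify_selector (const unsigned *sel, unsigned nelt)
{
  unsigned which = 0;
  for (unsigned i = 0; i < nelt; i++)
    {
      unsigned ei = sel[i] & (2 * nelt - 1);  /* Wrap into [0, 2*nelt).  */
      which |= (ei < nelt ? 1 : 2);           /* 1: op0 only, 2: op1 only.  */
    }
  return which;
}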
13819 static bool
13820 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13821 const unsigned char *sel)
13823 struct expand_vec_perm_d d;
13824 unsigned int i, nelt, which;
13825 bool ret;
13827 d.vmode = vmode;
13828 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13829 d.testing_p = true;
13830 memcpy (d.perm, sel, nelt);
13832 /* Calculate whether all elements are in one vector. */
13833 for (i = which = 0; i < nelt; ++i)
13835 unsigned char e = d.perm[i];
13836 gcc_assert (e < 2 * nelt);
13837 which |= (e < nelt ? 1 : 2);
13840 /* If all elements are from the second vector, reindex as if from the
13841 first vector. */
13842 if (which == 2)
13843 for (i = 0; i < nelt; ++i)
13844 d.perm[i] -= nelt;
13846 /* Check whether the mask can be applied to a single vector. */
13847 d.one_vector_p = (which != 3);
13849 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13850 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13851 if (!d.one_vector_p)
13852 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13854 start_sequence ();
13855 ret = aarch64_expand_vec_perm_const_1 (&d);
13856 end_sequence ();
13858 return ret;
13862 aarch64_reverse_mask (machine_mode mode)
13864 /* We have to reverse each vector because we don't have
13865 a permuted load that can reverse-load according to ABI rules. */
13866 rtx mask;
13867 rtvec v = rtvec_alloc (16);
13868 int i, j;
13869 int nunits = GET_MODE_NUNITS (mode);
13870 int usize = GET_MODE_UNIT_SIZE (mode);
13872 gcc_assert (BYTES_BIG_ENDIAN);
13873 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13875 for (i = 0; i < nunits; i++)
13876 for (j = 0; j < usize; j++)
13877 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13878 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13879 return force_reg (V16QImode, mask);
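/* Illustrative sketch (not part of the original source): the byte indices
   produced by the loop above for a 128-bit vector.  For V4SImode
   (nunits = 4, unit size = 4) the mask selects bytes
   3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12; each element keeps its
   position but has its bytes reversed.  build_reverse_mask is a
   hypothetical stand-alone version of the same index computation.  */
static void
build_reverse_mask (unsigned char *mask, int nunits, int usize)
{
  for (int i = 0; i < nunits; i++)
    for (int j = 0; j < usize; j++)
      mask[i * usize + j] = (i + 1) * usize - 1 - j;  /* Reverse within element.  */
}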
13882 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13883 true. However, due to issues with register allocation it is preferable
13884 to avoid tying integer scalar and FP scalar modes. Executing integer
13885 operations in general registers is better than treating them as scalar
13886 vector operations. This reduces latency and avoids redundant int<->FP
13887 moves. So tie modes if they are either the same class, or vector modes
13888 with other vector modes, vector structs or any scalar mode. */
13890 static bool
13891 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13893 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13894 return true;
13896 /* We specifically want to allow elements of "structure" modes to
13897 be tieable to the structure. This more general condition allows
13898 other rarer situations too. */
13899 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13900 return true;
13902 /* Also allow any scalar modes with vectors. */
13903 if (aarch64_vector_mode_supported_p (mode1)
13904 || aarch64_vector_mode_supported_p (mode2))
13905 return true;
13907 return false;
13910 /* Return a new RTX holding the result of moving POINTER forward by
13911 AMOUNT bytes. */
13913 static rtx
13914 aarch64_move_pointer (rtx pointer, int amount)
13916 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13918 return adjust_automodify_address (pointer, GET_MODE (pointer),
13919 next, amount);
13922 /* Return a new RTX holding the result of moving POINTER forward by the
13923 size of the mode it points to. */
13925 static rtx
13926 aarch64_progress_pointer (rtx pointer)
13928 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13930 return aarch64_move_pointer (pointer, amount);
13933 /* Copy one MODE-sized block from SRC to DST, then advance SRC and DST by
13934 the size of MODE. */
13936 static void
13937 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13938 machine_mode mode)
13940 rtx reg = gen_reg_rtx (mode);
13942 /* "Cast" the pointers to the correct mode. */
13943 *src = adjust_address (*src, mode, 0);
13944 *dst = adjust_address (*dst, mode, 0);
13945 /* Emit the memcpy. */
13946 emit_move_insn (reg, *src);
13947 emit_move_insn (*dst, reg);
13948 /* Move the pointers forward. */
13949 *src = aarch64_progress_pointer (*src);
13950 *dst = aarch64_progress_pointer (*dst);
13953 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13954 we succeed, otherwise return false. */
13956 bool
13957 aarch64_expand_movmem (rtx *operands)
13959 unsigned int n;
13960 rtx dst = operands[0];
13961 rtx src = operands[1];
13962 rtx base;
13963 bool speed_p = !optimize_function_for_size_p (cfun);
13965 /* When optimizing for size, give a better estimate of the length of a
13966 memcpy call, but use the default otherwise. */
13967 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13969 /* We can't do anything smart if the amount to copy is not constant. */
13970 if (!CONST_INT_P (operands[2]))
13971 return false;
13973 n = UINTVAL (operands[2]);
13975 /* Try to keep the number of instructions low. For cases below 16 bytes we
13976 need to make at most two moves. For cases above 16 bytes it will be one
13977 move for each 16 byte chunk, then at most two additional moves. */
13978 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13979 return false;
13981 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13982 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13984 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13985 src = adjust_automodify_address (src, VOIDmode, base, 0);
13987 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13988 1-byte chunk. */
13989 if (n < 4)
13991 if (n >= 2)
13993 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13994 n -= 2;
13997 if (n == 1)
13998 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14000 return true;
14003 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
14004 4-byte chunk, partially overlapping with the previously copied chunk. */
14005 if (n < 8)
14007 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14008 n -= 4;
14009 if (n > 0)
14011 int move = n - 4;
14013 src = aarch64_move_pointer (src, move);
14014 dst = aarch64_move_pointer (dst, move);
14015 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14017 return true;
14020 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14021 them, then (if applicable) an 8-byte chunk. */
14022 while (n >= 8)
14024 if (n / 16)
14026 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14027 n -= 16;
14029 else
14031 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14032 n -= 8;
14036 /* Finish the final bytes of the copy. We can always do this in one
14037 instruction. We either copy the exact amount we need, or partially
14038 overlap with the previous chunk we copied and copy 8 bytes. */
14039 if (n == 0)
14040 return true;
14041 else if (n == 1)
14042 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14043 else if (n == 2)
14044 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14045 else if (n == 4)
14046 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14047 else
14049 if (n == 3)
14051 src = aarch64_move_pointer (src, -1);
14052 dst = aarch64_move_pointer (dst, -1);
14053 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14055 else
14057 int move = n - 8;
14059 src = aarch64_move_pointer (src, move);
14060 dst = aarch64_move_pointer (dst, move);
14061 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14065 return true;
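/* Illustrative sketch (not part of the original source): the expansion above
   builds the copy from power-of-two chunks and handles an awkward tail by
   stepping the pointers back and re-copying a few bytes.  For a 13-byte copy
   this gives an 8-byte chunk at offset 0 followed by an 8-byte chunk at
   offset 5 (overlapping by 3 bytes).  The plain-C equivalent below shows why
   the overlap is harmless for memcpy semantics; copy13 is a hypothetical
   name.  */
#include <string.h>

static void
copy13 (char *dst, const char *src)
{
  memcpy (dst, src, 8);          /* The main loop emits one DImode chunk.  */
  memcpy (dst + 5, src + 5, 8);  /* Tail: back up 3 bytes and copy 8 more.  */
}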
14068 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14069 SImode stores. Handle the case when the constant has identical
14070 bottom and top halves. This is beneficial when the two stores can be
14071 merged into an STP and we avoid synthesising potentially expensive
14072 immediates twice. Return true if such a split is possible. */
14074 bool
14075 aarch64_split_dimode_const_store (rtx dst, rtx src)
14077 rtx lo = gen_lowpart (SImode, src);
14078 rtx hi = gen_highpart_mode (SImode, DImode, src);
14080 bool size_p = optimize_function_for_size_p (cfun);
14082 if (!rtx_equal_p (lo, hi))
14083 return false;
14085 unsigned int orig_cost
14086 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14087 unsigned int lo_cost
14088 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14090 /* We want to transform:
14091 MOV x1, 49370
14092 MOVK x1, 0x140, lsl 16
14093 MOVK x1, 0xc0da, lsl 32
14094 MOVK x1, 0x140, lsl 48
14095 STR x1, [x0]
14096 into:
14097 MOV w1, 49370
14098 MOVK w1, 0x140, lsl 16
14099 STP w1, w1, [x0]
14100 So we want to perform this only when we save two instructions
14101 or more. When optimizing for size, however, accept any code size
14102 savings we can. */
14103 if (size_p && orig_cost <= lo_cost)
14104 return false;
14106 if (!size_p
14107 && (orig_cost <= lo_cost + 1))
14108 return false;
14110 rtx mem_lo = adjust_address (dst, SImode, 0);
14111 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14112 return false;
14114 rtx tmp_reg = gen_reg_rtx (SImode);
14115 aarch64_expand_mov_immediate (tmp_reg, lo);
14116 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14117 /* Don't emit an explicit store pair as this may not be always profitable.
14118 Let the sched-fusion logic decide whether to merge them. */
14119 emit_move_insn (mem_lo, tmp_reg);
14120 emit_move_insn (mem_hi, tmp_reg);
14122 return true;
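/* Illustrative sketch (not part of the original source): the decision above
   reduces to a comparison of immediate-synthesis costs.  ORIG_COST is the
   number of instructions needed for the full DImode constant and LO_COST the
   number for the repeated SImode half; the split pays off when it removes at
   least two instructions, or removes any at all when optimizing for size.
   worth_splitting_p is a hypothetical name.  */
#include <stdbool.h>

static bool
worth_splitting_p (unsigned orig_cost, unsigned lo_cost, bool size_p)
{
  if (size_p)
    return orig_cost > lo_cost;      /* Any saving is enough for -Os.  */
  return orig_cost > lo_cost + 1;    /* Otherwise require saving >= 2 insns.  */
}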
14125 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14127 static unsigned HOST_WIDE_INT
14128 aarch64_asan_shadow_offset (void)
14130 return (HOST_WIDE_INT_1 << 36);
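/* Illustrative sketch (not part of the original source): AddressSanitizer
   maps application memory to shadow memory as
   shadow = (addr >> scale) + offset, and the hook above supplies the offset
   (1 << 36) for this target.  Assuming the default scale of 3 (one shadow
   byte per 8 application bytes), the mapping looks like the helper below;
   asan_shadow_addr is a hypothetical name.  */
#include <stdint.h>

static uintptr_t
asan_shadow_addr (uintptr_t addr)
{
  const uintptr_t shadow_offset = (uintptr_t) 1 << 36;  /* Matches the hook.  */
  return (addr >> 3) + shadow_offset;                   /* Default scale = 3.  */
}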
14133 static bool
14134 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14135 unsigned int align,
14136 enum by_pieces_operation op,
14137 bool speed_p)
14139 /* STORE_BY_PIECES can be used when copying a constant string, but
14140 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14141 For now we always fail this and let the move_by_pieces code copy
14142 the string from read-only memory. */
14143 if (op == STORE_BY_PIECES)
14144 return false;
14146 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14149 static rtx
14150 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14151 int code, tree treeop0, tree treeop1)
14153 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14154 rtx op0, op1;
14155 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14156 insn_code icode;
14157 struct expand_operand ops[4];
14159 start_sequence ();
14160 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14162 op_mode = GET_MODE (op0);
14163 if (op_mode == VOIDmode)
14164 op_mode = GET_MODE (op1);
14166 switch (op_mode)
14168 case E_QImode:
14169 case E_HImode:
14170 case E_SImode:
14171 cmp_mode = SImode;
14172 icode = CODE_FOR_cmpsi;
14173 break;
14175 case E_DImode:
14176 cmp_mode = DImode;
14177 icode = CODE_FOR_cmpdi;
14178 break;
14180 case E_SFmode:
14181 cmp_mode = SFmode;
14182 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14183 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14184 break;
14186 case E_DFmode:
14187 cmp_mode = DFmode;
14188 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14189 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14190 break;
14192 default:
14193 end_sequence ();
14194 return NULL_RTX;
14197 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14198 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14199 if (!op0 || !op1)
14201 end_sequence ();
14202 return NULL_RTX;
14204 *prep_seq = get_insns ();
14205 end_sequence ();
14207 create_fixed_operand (&ops[0], op0);
14208 create_fixed_operand (&ops[1], op1);
14210 start_sequence ();
14211 if (!maybe_expand_insn (icode, 2, ops))
14213 end_sequence ();
14214 return NULL_RTX;
14216 *gen_seq = get_insns ();
14217 end_sequence ();
14219 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14220 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14223 static rtx
14224 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14225 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14227 rtx op0, op1, target;
14228 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14229 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14230 insn_code icode;
14231 struct expand_operand ops[6];
14232 int aarch64_cond;
14234 push_to_sequence (*prep_seq);
14235 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14237 op_mode = GET_MODE (op0);
14238 if (op_mode == VOIDmode)
14239 op_mode = GET_MODE (op1);
14241 switch (op_mode)
14243 case E_QImode:
14244 case E_HImode:
14245 case E_SImode:
14246 cmp_mode = SImode;
14247 icode = CODE_FOR_ccmpsi;
14248 break;
14250 case E_DImode:
14251 cmp_mode = DImode;
14252 icode = CODE_FOR_ccmpdi;
14253 break;
14255 case E_SFmode:
14256 cmp_mode = SFmode;
14257 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14258 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14259 break;
14261 case E_DFmode:
14262 cmp_mode = DFmode;
14263 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14264 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14265 break;
14267 default:
14268 end_sequence ();
14269 return NULL_RTX;
14272 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14273 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14274 if (!op0 || !op1)
14276 end_sequence ();
14277 return NULL_RTX;
14279 *prep_seq = get_insns ();
14280 end_sequence ();
14282 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14283 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14285 if (bit_code != AND)
14287 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14288 GET_MODE (XEXP (prev, 0))),
14289 VOIDmode, XEXP (prev, 0), const0_rtx);
14290 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14293 create_fixed_operand (&ops[0], XEXP (prev, 0));
14294 create_fixed_operand (&ops[1], target);
14295 create_fixed_operand (&ops[2], op0);
14296 create_fixed_operand (&ops[3], op1);
14297 create_fixed_operand (&ops[4], prev);
14298 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14300 push_to_sequence (*gen_seq);
14301 if (!maybe_expand_insn (icode, 6, ops))
14303 end_sequence ();
14304 return NULL_RTX;
14307 *gen_seq = get_insns ();
14308 end_sequence ();
14310 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14313 #undef TARGET_GEN_CCMP_FIRST
14314 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14316 #undef TARGET_GEN_CCMP_NEXT
14317 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14319 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14320 instruction fusion of some sort. */
14322 static bool
14323 aarch64_macro_fusion_p (void)
14325 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14329 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14330 should be kept together during scheduling. */
14332 static bool
14333 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14335 rtx set_dest;
14336 rtx prev_set = single_set (prev);
14337 rtx curr_set = single_set (curr);
14338 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14339 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14341 if (!aarch64_macro_fusion_p ())
14342 return false;
14344 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14346 /* We are trying to match:
14347 prev (mov) == (set (reg r0) (const_int imm16))
14348 curr (movk) == (set (zero_extract (reg r0)
14349 (const_int 16)
14350 (const_int 16))
14351 (const_int imm16_1)) */
14353 set_dest = SET_DEST (curr_set);
14355 if (GET_CODE (set_dest) == ZERO_EXTRACT
14356 && CONST_INT_P (SET_SRC (curr_set))
14357 && CONST_INT_P (SET_SRC (prev_set))
14358 && CONST_INT_P (XEXP (set_dest, 2))
14359 && INTVAL (XEXP (set_dest, 2)) == 16
14360 && REG_P (XEXP (set_dest, 0))
14361 && REG_P (SET_DEST (prev_set))
14362 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14364 return true;
14368 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14371 /* We're trying to match:
14372 prev (adrp) == (set (reg r1)
14373 (high (symbol_ref ("SYM"))))
14374 curr (add) == (set (reg r0)
14375 (lo_sum (reg r1)
14376 (symbol_ref ("SYM"))))
14377 Note that r0 need not necessarily be the same as r1, especially
14378 during pre-regalloc scheduling. */
14380 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14381 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14383 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14384 && REG_P (XEXP (SET_SRC (curr_set), 0))
14385 && REGNO (XEXP (SET_SRC (curr_set), 0))
14386 == REGNO (SET_DEST (prev_set))
14387 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14388 XEXP (SET_SRC (curr_set), 1)))
14389 return true;
14393 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14396 /* We're trying to match:
14397 prev (movk) == (set (zero_extract (reg r0)
14398 (const_int 16)
14399 (const_int 32))
14400 (const_int imm16_1))
14401 curr (movk) == (set (zero_extract (reg r0)
14402 (const_int 16)
14403 (const_int 48))
14404 (const_int imm16_2)) */
14406 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14407 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14408 && REG_P (XEXP (SET_DEST (prev_set), 0))
14409 && REG_P (XEXP (SET_DEST (curr_set), 0))
14410 && REGNO (XEXP (SET_DEST (prev_set), 0))
14411 == REGNO (XEXP (SET_DEST (curr_set), 0))
14412 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14413 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14414 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14415 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14416 && CONST_INT_P (SET_SRC (prev_set))
14417 && CONST_INT_P (SET_SRC (curr_set)))
14418 return true;
14421 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14423 /* We're trying to match:
14424 prev (adrp) == (set (reg r0)
14425 (high (symbol_ref ("SYM"))))
14426 curr (ldr) == (set (reg r1)
14427 (mem (lo_sum (reg r0)
14428 (symbol_ref ("SYM")))))
14430 curr (ldr) == (set (reg r1)
14431 (zero_extend (mem
14432 (lo_sum (reg r0)
14433 (symbol_ref ("SYM")))))) */
14434 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14435 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14437 rtx curr_src = SET_SRC (curr_set);
14439 if (GET_CODE (curr_src) == ZERO_EXTEND)
14440 curr_src = XEXP (curr_src, 0);
14442 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14443 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14444 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14445 == REGNO (SET_DEST (prev_set))
14446 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14447 XEXP (SET_SRC (prev_set), 0)))
14448 return true;
14452 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14453 && aarch_crypto_can_dual_issue (prev, curr))
14454 return true;
14456 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14457 && any_condjump_p (curr))
14459 enum attr_type prev_type = get_attr_type (prev);
14461 unsigned int condreg1, condreg2;
14462 rtx cc_reg_1;
14463 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14464 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14466 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14467 && prev
14468 && modified_in_p (cc_reg_1, prev))
14470 /* FIXME: this misses some instructions that are considered simple
14471 arithmetic instructions for ThunderX. Simple shifts are missed here. */
14472 if (prev_type == TYPE_ALUS_SREG
14473 || prev_type == TYPE_ALUS_IMM
14474 || prev_type == TYPE_LOGICS_REG
14475 || prev_type == TYPE_LOGICS_IMM)
14476 return true;
14480 if (prev_set
14481 && curr_set
14482 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14483 && any_condjump_p (curr))
14485 /* We're trying to match:
14486 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14487 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14488 (const_int 0))
14489 (label_ref ("SYM"))
14490 (pc)) */
14491 if (SET_DEST (curr_set) == (pc_rtx)
14492 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14493 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14494 && REG_P (SET_DEST (prev_set))
14495 && REGNO (SET_DEST (prev_set))
14496 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14498 /* Fuse ALU operations followed by conditional branch instruction. */
14499 switch (get_attr_type (prev))
14501 case TYPE_ALU_IMM:
14502 case TYPE_ALU_SREG:
14503 case TYPE_ADC_REG:
14504 case TYPE_ADC_IMM:
14505 case TYPE_ADCS_REG:
14506 case TYPE_ADCS_IMM:
14507 case TYPE_LOGIC_REG:
14508 case TYPE_LOGIC_IMM:
14509 case TYPE_CSEL:
14510 case TYPE_ADR:
14511 case TYPE_MOV_IMM:
14512 case TYPE_SHIFT_REG:
14513 case TYPE_SHIFT_IMM:
14514 case TYPE_BFM:
14515 case TYPE_RBIT:
14516 case TYPE_REV:
14517 case TYPE_EXTEND:
14518 return true;
14520 default:;
14525 return false;
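/* Illustrative sketch (not part of the original source): the MOV/MOVK case
   above matches the two instructions that materialize a 32-bit constant in
   16-bit pieces, e.g. 0x56781234 becomes
     mov  w0, #0x1234
     movk w0, #0x5678, lsl 16
   and keeping them adjacent lets cores that fuse the pair execute them as
   one.  split_imm16 below just extracts those two fields; the name is
   hypothetical.  */
#include <stdint.h>

static void
split_imm16 (uint32_t value, uint16_t *mov_imm, uint16_t *movk_imm)
{
  *mov_imm = value & 0xffff;           /* Low half, carried by MOV.  */
  *movk_imm = (value >> 16) & 0xffff;  /* High half, inserted by MOVK.  */
}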
14528 /* Return true iff the instruction fusion described by OP is enabled. */
14530 bool
14531 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14533 return (aarch64_tune_params.fusible_ops & op) != 0;
14536 /* If MEM is in the form of [base+offset], extract the two parts
14537 of the address and store them in BASE and OFFSET; otherwise return false
14538 after clearing BASE and OFFSET. */
14540 bool
14541 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14543 rtx addr;
14545 gcc_assert (MEM_P (mem));
14547 addr = XEXP (mem, 0);
14549 if (REG_P (addr))
14551 *base = addr;
14552 *offset = const0_rtx;
14553 return true;
14556 if (GET_CODE (addr) == PLUS
14557 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14559 *base = XEXP (addr, 0);
14560 *offset = XEXP (addr, 1);
14561 return true;
14564 *base = NULL_RTX;
14565 *offset = NULL_RTX;
14567 return false;
14570 /* Types for scheduling fusion. */
14571 enum sched_fusion_type
14573 SCHED_FUSION_NONE = 0,
14574 SCHED_FUSION_LD_SIGN_EXTEND,
14575 SCHED_FUSION_LD_ZERO_EXTEND,
14576 SCHED_FUSION_LD,
14577 SCHED_FUSION_ST,
14578 SCHED_FUSION_NUM
14581 /* If INSN is a load or store with an address in the form of [base+offset],
14582 extract the two parts into BASE and OFFSET. Return the scheduling
14583 fusion type of this INSN. */
14585 static enum sched_fusion_type
14586 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14588 rtx x, dest, src;
14589 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14591 gcc_assert (INSN_P (insn));
14592 x = PATTERN (insn);
14593 if (GET_CODE (x) != SET)
14594 return SCHED_FUSION_NONE;
14596 src = SET_SRC (x);
14597 dest = SET_DEST (x);
14599 machine_mode dest_mode = GET_MODE (dest);
14601 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14602 return SCHED_FUSION_NONE;
14604 if (GET_CODE (src) == SIGN_EXTEND)
14606 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14607 src = XEXP (src, 0);
14608 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14609 return SCHED_FUSION_NONE;
14611 else if (GET_CODE (src) == ZERO_EXTEND)
14613 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14614 src = XEXP (src, 0);
14615 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14616 return SCHED_FUSION_NONE;
14619 if (GET_CODE (src) == MEM && REG_P (dest))
14620 extract_base_offset_in_addr (src, base, offset);
14621 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14623 fusion = SCHED_FUSION_ST;
14624 extract_base_offset_in_addr (dest, base, offset);
14626 else
14627 return SCHED_FUSION_NONE;
14629 if (*base == NULL_RTX || *offset == NULL_RTX)
14630 fusion = SCHED_FUSION_NONE;
14632 return fusion;
14635 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14637 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14638 and PRI are only calculated for these instructions. For other instructions,
14639 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14640 types of instruction fusion can be added by returning different priorities.
14642 It's important that irrelevant instructions get the largest FUSION_PRI. */
14644 static void
14645 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14646 int *fusion_pri, int *pri)
14648 int tmp, off_val;
14649 rtx base, offset;
14650 enum sched_fusion_type fusion;
14652 gcc_assert (INSN_P (insn));
14654 tmp = max_pri - 1;
14655 fusion = fusion_load_store (insn, &base, &offset);
14656 if (fusion == SCHED_FUSION_NONE)
14658 *pri = tmp;
14659 *fusion_pri = tmp;
14660 return;
14663 /* Set FUSION_PRI according to fusion type and base register. */
14664 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14666 /* Calculate PRI. */
14667 tmp /= 2;
14669 /* INSN with smaller offset goes first. */
14670 off_val = (int)(INTVAL (offset));
14671 if (off_val >= 0)
14672 tmp -= (off_val & 0xfffff);
14673 else
14674 tmp += ((- off_val) & 0xfffff);
14676 *pri = tmp;
14677 return;
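/* Illustrative sketch (not part of the original source): the two priorities
   computed above, written as a stand-alone function.  Loads and stores that
   share a fusion type and base register get the same FUSION_PRI, and within
   such a group the smaller offset gets the larger PRI, matching the
   "smaller offset goes first" comment.  FIRST_PSEUDO_REG and the argument
   names are placeholders for the compiler-internal values.  */
static void
fusion_priorities (int max_pri, int fusion_type, int base_regno,
                   long offset, int *fusion_pri, int *pri)
{
  const int FIRST_PSEUDO_REG = 1 << 8;       /* Placeholder constant.  */
  int tmp = max_pri - 1;
  *fusion_pri = tmp - fusion_type * FIRST_PSEUDO_REG - base_regno;

  tmp /= 2;                                  /* PRI lives in the lower half.  */
  if (offset >= 0)
    tmp -= (int) (offset & 0xfffff);         /* Smaller offset => larger PRI.  */
  else
    tmp += (int) ((-offset) & 0xfffff);
  *pri = tmp;
}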
14680 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14681 Adjust priority of sha1h instructions so they are scheduled before
14682 other SHA1 instructions. */
14684 static int
14685 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14687 rtx x = PATTERN (insn);
14689 if (GET_CODE (x) == SET)
14691 x = SET_SRC (x);
14693 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14694 return priority + 10;
14697 return priority;
14700 /* Given OPERANDS of consecutive load/store, check if we can merge
14701 them into ldp/stp. LOAD is true if they are load instructions.
14702 MODE is the mode of memory operands. */
14704 bool
14705 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14706 machine_mode mode)
14708 HOST_WIDE_INT offval_1, offval_2, msize;
14709 enum reg_class rclass_1, rclass_2;
14710 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14712 if (load)
14714 mem_1 = operands[1];
14715 mem_2 = operands[3];
14716 reg_1 = operands[0];
14717 reg_2 = operands[2];
14718 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14719 if (REGNO (reg_1) == REGNO (reg_2))
14720 return false;
14722 else
14724 mem_1 = operands[0];
14725 mem_2 = operands[2];
14726 reg_1 = operands[1];
14727 reg_2 = operands[3];
14730 /* The mems cannot be volatile. */
14731 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14732 return false;
14734 /* If we have SImode and slow unaligned ldp,
14735 check that the alignment is at least 8 bytes. */
14736 if (mode == SImode
14737 && (aarch64_tune_params.extra_tuning_flags
14738 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14739 && !optimize_size
14740 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14741 return false;
14743 /* Check if the addresses are in the form of [base+offset]. */
14744 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14745 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14746 return false;
14747 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14748 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14749 return false;
14751 /* Check if the bases are the same. */
14752 if (!rtx_equal_p (base_1, base_2))
14753 return false;
14755 offval_1 = INTVAL (offset_1);
14756 offval_2 = INTVAL (offset_2);
14757 msize = GET_MODE_SIZE (mode);
14758 /* Check if the offsets are consecutive. */
14759 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14760 return false;
14762 /* Check if the addresses are clobbered by load. */
14763 if (load)
14765 if (reg_mentioned_p (reg_1, mem_1))
14766 return false;
14768 /* In increasing order, the last load can clobber the address. */
14769 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14770 return false;
14773 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14774 rclass_1 = FP_REGS;
14775 else
14776 rclass_1 = GENERAL_REGS;
14778 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14779 rclass_2 = FP_REGS;
14780 else
14781 rclass_2 = GENERAL_REGS;
14783 /* Check if the registers are of the same class. */
14784 if (rclass_1 != rclass_2)
14785 return false;
14787 return true;
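/* Illustrative sketch (not part of the original source): the address test
   above accepts the pair only when both accesses use the same base register
   and their byte offsets differ by exactly one element, in either order.
   offsets_pairable_p is a hypothetical name; msize is the element size in
   bytes.  */
#include <stdbool.h>

static bool
offsets_pairable_p (long offval_1, long offval_2, long msize)
{
  return offval_1 == offval_2 + msize      /* mem_2 comes first ...  */
         || offval_2 == offval_1 + msize;  /* ... or mem_1 comes first.  */
}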
14790 /* Given OPERANDS of consecutive load/store, check if we can merge
14791 them into ldp/stp by adjusting the offset. LOAD is true if they
14792 are load instructions. MODE is the mode of memory operands.
14794 Given the following consecutive stores:
14796 str w1, [xb, 0x100]
14797 str w1, [xb, 0x104]
14798 str w1, [xb, 0x108]
14799 str w1, [xb, 0x10c]
14801 Though the offsets are out of the range supported by stp, we can
14802 still pair them after adjusting the offset, like:
14804 add scratch, xb, 0x100
14805 stp w1, w1, [scratch]
14806 stp w1, w1, [scratch, 0x8]
14808 The peephole patterns detecting this opportunity should guarantee
14809 the scratch register is available. */
14811 bool
14812 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14813 scalar_mode mode)
14815 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14816 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14817 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14818 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14820 if (load)
14822 reg_1 = operands[0];
14823 mem_1 = operands[1];
14824 reg_2 = operands[2];
14825 mem_2 = operands[3];
14826 reg_3 = operands[4];
14827 mem_3 = operands[5];
14828 reg_4 = operands[6];
14829 mem_4 = operands[7];
14830 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14831 && REG_P (reg_3) && REG_P (reg_4));
14832 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14833 return false;
14835 else
14837 mem_1 = operands[0];
14838 reg_1 = operands[1];
14839 mem_2 = operands[2];
14840 reg_2 = operands[3];
14841 mem_3 = operands[4];
14842 reg_3 = operands[5];
14843 mem_4 = operands[6];
14844 reg_4 = operands[7];
14846 /* Skip if the memory operand is by itself valid for ldp/stp. */
14847 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14848 return false;
14850 /* The mems cannot be volatile. */
14851 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14852 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14853 return false;
14855 /* Check if the addresses are in the form of [base+offset]. */
14856 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14857 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14858 return false;
14859 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14860 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14861 return false;
14862 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14863 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14864 return false;
14865 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14866 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14867 return false;
14869 /* Check if the bases are the same. */
14870 if (!rtx_equal_p (base_1, base_2)
14871 || !rtx_equal_p (base_2, base_3)
14872 || !rtx_equal_p (base_3, base_4))
14873 return false;
14875 offval_1 = INTVAL (offset_1);
14876 offval_2 = INTVAL (offset_2);
14877 offval_3 = INTVAL (offset_3);
14878 offval_4 = INTVAL (offset_4);
14879 msize = GET_MODE_SIZE (mode);
14880 /* Check if the offsets are consecutive. */
14881 if ((offval_1 != (offval_2 + msize)
14882 || offval_1 != (offval_3 + msize * 2)
14883 || offval_1 != (offval_4 + msize * 3))
14884 && (offval_4 != (offval_3 + msize)
14885 || offval_4 != (offval_2 + msize * 2)
14886 || offval_4 != (offval_1 + msize * 3)))
14887 return false;
14889 /* Check if the addresses are clobbered by load. */
14890 if (load)
14892 if (reg_mentioned_p (reg_1, mem_1)
14893 || reg_mentioned_p (reg_2, mem_2)
14894 || reg_mentioned_p (reg_3, mem_3))
14895 return false;
14897 /* In increasing order, the last load can clobber the address. */
14898 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14899 return false;
14902 /* If we have SImode and slow unaligned ldp,
14903 check that the alignment is at least 8 bytes. */
14904 if (mode == SImode
14905 && (aarch64_tune_params.extra_tuning_flags
14906 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14907 && !optimize_size
14908 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14909 return false;
14911 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14912 rclass_1 = FP_REGS;
14913 else
14914 rclass_1 = GENERAL_REGS;
14916 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14917 rclass_2 = FP_REGS;
14918 else
14919 rclass_2 = GENERAL_REGS;
14921 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14922 rclass_3 = FP_REGS;
14923 else
14924 rclass_3 = GENERAL_REGS;
14926 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14927 rclass_4 = FP_REGS;
14928 else
14929 rclass_4 = GENERAL_REGS;
14931 /* Check if the registers are of the same class. */
14932 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14933 return false;
14935 return true;
14938 /* Given OPERANDS of consecutive load/store, this function pairs them
14939 into ldp/stp after adjusting the offset. It depends on the fact
14940 that addresses of load/store instructions are in increasing order.
14941 MODE is the mode of memory operands. CODE is the rtl operator
14942 which should be applied to all memory operands; it is SIGN_EXTEND,
14943 ZERO_EXTEND or UNKNOWN. */
14945 bool
14946 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14947 scalar_mode mode, RTX_CODE code)
14949 rtx base, offset, t1, t2;
14950 rtx mem_1, mem_2, mem_3, mem_4;
14951 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14953 if (load)
14955 mem_1 = operands[1];
14956 mem_2 = operands[3];
14957 mem_3 = operands[5];
14958 mem_4 = operands[7];
14960 else
14962 mem_1 = operands[0];
14963 mem_2 = operands[2];
14964 mem_3 = operands[4];
14965 mem_4 = operands[6];
14966 gcc_assert (code == UNKNOWN);
14969 extract_base_offset_in_addr (mem_1, &base, &offset);
14970 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14972 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14973 msize = GET_MODE_SIZE (mode);
14974 stp_off_limit = msize * 0x40;
14975 off_val = INTVAL (offset);
14976 abs_off = (off_val < 0) ? -off_val : off_val;
14977 new_off = abs_off % stp_off_limit;
14978 adj_off = abs_off - new_off;
14980 /* Further adjust to make sure all offsets are OK. */
14981 if ((new_off + msize * 2) >= stp_off_limit)
14983 adj_off += stp_off_limit;
14984 new_off -= stp_off_limit;
14987 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14988 if (adj_off >= 0x1000)
14989 return false;
14991 if (off_val < 0)
14993 adj_off = -adj_off;
14994 new_off = -new_off;
14997 /* Create new memory references. */
14998 mem_1 = change_address (mem_1, VOIDmode,
14999 plus_constant (DImode, operands[8], new_off));
15001 /* Check if the adjusted address is OK for ldp/stp. */
15002 if (!aarch64_mem_pair_operand (mem_1, mode))
15003 return false;
15005 msize = GET_MODE_SIZE (mode);
15006 mem_2 = change_address (mem_2, VOIDmode,
15007 plus_constant (DImode,
15008 operands[8],
15009 new_off + msize));
15010 mem_3 = change_address (mem_3, VOIDmode,
15011 plus_constant (DImode,
15012 operands[8],
15013 new_off + msize * 2));
15014 mem_4 = change_address (mem_4, VOIDmode,
15015 plus_constant (DImode,
15016 operands[8],
15017 new_off + msize * 3));
15019 if (code == ZERO_EXTEND)
15021 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15022 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15023 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15024 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15026 else if (code == SIGN_EXTEND)
15028 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15029 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15030 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15031 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15034 if (load)
15036 operands[1] = mem_1;
15037 operands[3] = mem_2;
15038 operands[5] = mem_3;
15039 operands[7] = mem_4;
15041 else
15043 operands[0] = mem_1;
15044 operands[2] = mem_2;
15045 operands[4] = mem_3;
15046 operands[6] = mem_4;
15049 /* Emit adjusting instruction. */
15050 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15051 /* Emit ldp/stp instructions. */
15052 t1 = gen_rtx_SET (operands[0], operands[1]);
15053 t2 = gen_rtx_SET (operands[2], operands[3]);
15054 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15055 t1 = gen_rtx_SET (operands[4], operands[5]);
15056 t2 = gen_rtx_SET (operands[6], operands[7]);
15057 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15058 return true;
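/* Illustrative sketch (not part of the original source): how the offset is
   split above into a base adjustment (ADJ_OFF) that an ADD/SUB can
   materialize and a small remainder (NEW_OFF) that fits the ldp/stp
   immediate range.  For SImode (msize = 4) the limit is 4 * 0x40 = 256, so
   an original offset of 0x100 becomes adj_off = 0x100 and new_off = 0,
   which matches the add + stp sequence shown in the comment further up.
   split_ldpstp_offset is a hypothetical name.  */
#include <stdbool.h>

static bool
split_ldpstp_offset (long off_val, long msize, long *adj_off, long *new_off)
{
  long limit = msize * 0x40;                 /* ldp/stp immediate range.  */
  long abs_off = off_val < 0 ? -off_val : off_val;

  *new_off = abs_off % limit;
  *adj_off = abs_off - *new_off;
  if (*new_off + msize * 2 >= limit)         /* Keep all four offsets in range.  */
    {
      *adj_off += limit;
      *new_off -= limit;
    }
  if (*adj_off >= 0x1000)                    /* Must fit a 12-bit ADD/SUB.  */
    return false;
  if (off_val < 0)
    {
      *adj_off = -*adj_off;
      *new_off = -*new_off;
    }
  return true;
}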
15061 /* Return true if a pseudo register should be created and used to hold
15062 the GOT address for PIC code. */
15064 bool
15065 aarch64_use_pseudo_pic_reg (void)
15067 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15070 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15072 static int
15073 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15075 switch (XINT (x, 1))
15077 case UNSPEC_GOTSMALLPIC:
15078 case UNSPEC_GOTSMALLPIC28K:
15079 case UNSPEC_GOTTINYPIC:
15080 return 0;
15081 default:
15082 break;
15085 return default_unspec_may_trap_p (x, flags);
15089 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15090 return the log2 of that value. Otherwise return -1. */
15092 int
15093 aarch64_fpconst_pow_of_2 (rtx x)
15095 const REAL_VALUE_TYPE *r;
15097 if (!CONST_DOUBLE_P (x))
15098 return -1;
15100 r = CONST_DOUBLE_REAL_VALUE (x);
15102 if (REAL_VALUE_NEGATIVE (*r)
15103 || REAL_VALUE_ISNAN (*r)
15104 || REAL_VALUE_ISINF (*r)
15105 || !real_isinteger (r, DFmode))
15106 return -1;
15108 return exact_log2 (real_to_integer (r));
15111 /* If X is a vector of equal CONST_DOUBLE values and that value is
15112 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15114 int
15115 aarch64_vec_fpconst_pow_of_2 (rtx x)
15117 if (GET_CODE (x) != CONST_VECTOR)
15118 return -1;
15120 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15121 return -1;
15123 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15124 if (firstval <= 0)
15125 return -1;
15127 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15128 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15129 return -1;
15131 return firstval;
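/* Illustrative sketch (not part of the original source): the same test on a
   host double, for intuition.  8.0 yields 3 and 1.0 yields 0, while 0.5,
   3.0 and -4.0 all yield -1 because they are not positive integral powers
   of two.  fp_log2_if_pow2 is a hypothetical name.  */
#include <math.h>

static int
fp_log2_if_pow2 (double x)
{
  if (!(x > 0) || isinf (x) || isnan (x) || x != floor (x))
    return -1;                        /* Must be a positive integral value.  */
  if (x >= ldexp (1.0, 62))
    return -1;                        /* Keep the conversion below safe.  */
  long long v = (long long) x;
  if ((v & (v - 1)) != 0)
    return -1;                        /* Not a power of two.  */
  int log = 0;
  while (v > 1)
    {
      v >>= 1;
      log++;
    }
  return log;
}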
15134 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15135 to float.
15137 __fp16 always promotes through this hook.
15138 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15139 through the generic excess precision logic rather than here. */
15141 static tree
15142 aarch64_promoted_type (const_tree t)
15144 if (SCALAR_FLOAT_TYPE_P (t)
15145 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15146 return float_type_node;
15148 return NULL_TREE;
15151 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15153 static bool
15154 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15155 optimization_type opt_type)
15157 switch (op)
15159 case rsqrt_optab:
15160 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15162 default:
15163 return true;
15167 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15168 if MODE is HFmode, and punt to the generic implementation otherwise. */
15170 static bool
15171 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15173 return (mode == HFmode
15174 ? true
15175 : default_libgcc_floating_mode_supported_p (mode));
15178 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15179 if MODE is HFmode, and punt to the generic implementation otherwise. */
15181 static bool
15182 aarch64_scalar_mode_supported_p (scalar_mode mode)
15184 return (mode == HFmode
15185 ? true
15186 : default_scalar_mode_supported_p (mode));
15189 /* Set the value of FLT_EVAL_METHOD.
15190 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15192 0: evaluate all operations and constants, whose semantic type has at
15193 most the range and precision of type float, to the range and
15194 precision of float; evaluate all other operations and constants to
15195 the range and precision of the semantic type;
15197 N, where _FloatN is a supported interchange floating type
15198 evaluate all operations and constants, whose semantic type has at
15199 most the range and precision of _FloatN type, to the range and
15200 precision of the _FloatN type; evaluate all other operations and
15201 constants to the range and precision of the semantic type;
15203 If we have the ARMv8.2-A extensions then we support _Float16 in native
15204 precision, so we should set this to 16. Otherwise, we support the type,
15205 but want to evaluate expressions in float precision, so set this to
15206 0. */
15208 static enum flt_eval_method
15209 aarch64_excess_precision (enum excess_precision_type type)
15211 switch (type)
15213 case EXCESS_PRECISION_TYPE_FAST:
15214 case EXCESS_PRECISION_TYPE_STANDARD:
15215 /* We can calculate either in 16-bit range and precision or
15216 32-bit range and precision. Make that decision based on whether
15217 we have native support for the ARMv8.2-A 16-bit floating-point
15218 instructions or not. */
15219 return (TARGET_FP_F16INST
15220 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15221 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15222 case EXCESS_PRECISION_TYPE_IMPLICIT:
15223 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15224 default:
15225 gcc_unreachable ();
15227 return FLT_EVAL_METHOD_UNPREDICTABLE;
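/* Illustrative sketch (not part of the original source): what the choice
   above means for user code.  With the ARMv8.2-A 16-bit FP instructions
   (FLT_EVAL_METHOD == 16) the multiply below is performed directly in
   _Float16; without them (FLT_EVAL_METHOD == 0) both operands are promoted,
   the work is done in float, and only the final result is narrowed back.
   Requires a compiler and target with _Float16 support.  */
_Float16
scale_half (_Float16 a, _Float16 b)
{
  return a * b;   /* Evaluation precision depends on FLT_EVAL_METHOD.  */
}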
15230 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15231 scheduled for speculative execution. Reject the long-running division
15232 and square-root instructions. */
15234 static bool
15235 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15237 switch (get_attr_type (insn))
15239 case TYPE_SDIV:
15240 case TYPE_UDIV:
15241 case TYPE_FDIVS:
15242 case TYPE_FDIVD:
15243 case TYPE_FSQRTS:
15244 case TYPE_FSQRTD:
15245 case TYPE_NEON_FP_SQRT_S:
15246 case TYPE_NEON_FP_SQRT_D:
15247 case TYPE_NEON_FP_SQRT_S_Q:
15248 case TYPE_NEON_FP_SQRT_D_Q:
15249 case TYPE_NEON_FP_DIV_S:
15250 case TYPE_NEON_FP_DIV_D:
15251 case TYPE_NEON_FP_DIV_S_Q:
15252 case TYPE_NEON_FP_DIV_D_Q:
15253 return false;
15254 default:
15255 return true;
15259 /* Target-specific selftests. */
15261 #if CHECKING_P
15263 namespace selftest {
15265 /* Selftest for the RTL loader.
15266 Verify that the RTL loader copes with a dump from
15267 print_rtx_function. This is essentially just a test that class
15268 function_reader can handle a real dump, but it also verifies
15269 that lookup_reg_by_dump_name correctly handles hard regs.
15270 The presence of hard reg names in the dump means that the test is
15271 target-specific, hence it is in this file. */
15273 static void
15274 aarch64_test_loading_full_dump ()
15276 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15278 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15280 rtx_insn *insn_1 = get_insn_by_uid (1);
15281 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15283 rtx_insn *insn_15 = get_insn_by_uid (15);
15284 ASSERT_EQ (INSN, GET_CODE (insn_15));
15285 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15287 /* Verify crtl->return_rtx. */
15288 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15289 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15290 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15293 /* Run all target-specific selftests. */
15295 static void
15296 aarch64_run_selftests (void)
15298 aarch64_test_loading_full_dump ();
15301 } // namespace selftest
15303 #endif /* #if CHECKING_P */
15305 #undef TARGET_ADDRESS_COST
15306 #define TARGET_ADDRESS_COST aarch64_address_cost
15308 /* This hook determines whether unnamed bitfields affect the alignment
15309 of the containing structure. The hook returns true if the structure
15310 should inherit the alignment requirements of an unnamed bitfield's
15311 type. */
15312 #undef TARGET_ALIGN_ANON_BITFIELD
15313 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15315 #undef TARGET_ASM_ALIGNED_DI_OP
15316 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15318 #undef TARGET_ASM_ALIGNED_HI_OP
15319 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15321 #undef TARGET_ASM_ALIGNED_SI_OP
15322 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15324 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15325 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15326 hook_bool_const_tree_hwi_hwi_const_tree_true
15328 #undef TARGET_ASM_FILE_START
15329 #define TARGET_ASM_FILE_START aarch64_start_file
15331 #undef TARGET_ASM_OUTPUT_MI_THUNK
15332 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15334 #undef TARGET_ASM_SELECT_RTX_SECTION
15335 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15337 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15338 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15340 #undef TARGET_BUILD_BUILTIN_VA_LIST
15341 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15343 #undef TARGET_CALLEE_COPIES
15344 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15346 #undef TARGET_CAN_ELIMINATE
15347 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15349 #undef TARGET_CAN_INLINE_P
15350 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15352 #undef TARGET_CANNOT_FORCE_CONST_MEM
15353 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15355 #undef TARGET_CASE_VALUES_THRESHOLD
15356 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15358 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15359 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15361 /* Only the least significant bit is used for initialization guard
15362 variables. */
15363 #undef TARGET_CXX_GUARD_MASK_BIT
15364 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15366 #undef TARGET_C_MODE_FOR_SUFFIX
15367 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15369 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15370 #undef TARGET_DEFAULT_TARGET_FLAGS
15371 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15372 #endif
15374 #undef TARGET_CLASS_MAX_NREGS
15375 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15377 #undef TARGET_BUILTIN_DECL
15378 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15380 #undef TARGET_BUILTIN_RECIPROCAL
15381 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15383 #undef TARGET_C_EXCESS_PRECISION
15384 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15386 #undef TARGET_EXPAND_BUILTIN
15387 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15389 #undef TARGET_EXPAND_BUILTIN_VA_START
15390 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15392 #undef TARGET_FOLD_BUILTIN
15393 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15395 #undef TARGET_FUNCTION_ARG
15396 #define TARGET_FUNCTION_ARG aarch64_function_arg
15398 #undef TARGET_FUNCTION_ARG_ADVANCE
15399 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15401 #undef TARGET_FUNCTION_ARG_BOUNDARY
15402 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15404 #undef TARGET_FUNCTION_ARG_PADDING
15405 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15407 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15408 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15410 #undef TARGET_FUNCTION_VALUE
15411 #define TARGET_FUNCTION_VALUE aarch64_function_value
15413 #undef TARGET_FUNCTION_VALUE_REGNO_P
15414 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15416 #undef TARGET_FRAME_POINTER_REQUIRED
15417 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15419 #undef TARGET_GIMPLE_FOLD_BUILTIN
15420 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15422 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15423 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15425 #undef TARGET_INIT_BUILTINS
15426 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15428 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15429 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15430 aarch64_ira_change_pseudo_allocno_class
15432 #undef TARGET_LEGITIMATE_ADDRESS_P
15433 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15435 #undef TARGET_LEGITIMATE_CONSTANT_P
15436 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15438 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15439 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15440 aarch64_legitimize_address_displacement
15442 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15443 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15445 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15446 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15447 aarch64_libgcc_floating_mode_supported_p
15449 #undef TARGET_MANGLE_TYPE
15450 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15452 #undef TARGET_MEMORY_MOVE_COST
15453 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15455 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15456 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15458 #undef TARGET_MUST_PASS_IN_STACK
15459 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15461 /* This target hook should return true if accesses to volatile bitfields
15462 should use the narrowest mode possible. It should return false if these
15463 accesses should use the bitfield container type. */
15464 #undef TARGET_NARROW_VOLATILE_BITFIELD
15465 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15467 #undef TARGET_OPTION_OVERRIDE
15468 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15470 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15471 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15472 aarch64_override_options_after_change
15474 #undef TARGET_OPTION_SAVE
15475 #define TARGET_OPTION_SAVE aarch64_option_save
15477 #undef TARGET_OPTION_RESTORE
15478 #define TARGET_OPTION_RESTORE aarch64_option_restore
15480 #undef TARGET_OPTION_PRINT
15481 #define TARGET_OPTION_PRINT aarch64_option_print
15483 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15484 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15486 #undef TARGET_SET_CURRENT_FUNCTION
15487 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15489 #undef TARGET_PASS_BY_REFERENCE
15490 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15492 #undef TARGET_PREFERRED_RELOAD_CLASS
15493 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15495 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15496 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15498 #undef TARGET_PROMOTED_TYPE
15499 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15501 #undef TARGET_SECONDARY_RELOAD
15502 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15504 #undef TARGET_SHIFT_TRUNCATION_MASK
15505 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15507 #undef TARGET_SETUP_INCOMING_VARARGS
15508 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15510 #undef TARGET_STRUCT_VALUE_RTX
15511 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15513 #undef TARGET_REGISTER_MOVE_COST
15514 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15516 #undef TARGET_RETURN_IN_MEMORY
15517 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15519 #undef TARGET_RETURN_IN_MSB
15520 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15522 #undef TARGET_RTX_COSTS
15523 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15525 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15526 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15528 #undef TARGET_SCHED_ISSUE_RATE
15529 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15531 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15532 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15533 aarch64_sched_first_cycle_multipass_dfa_lookahead
15535 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15536 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15537 aarch64_first_cycle_multipass_dfa_lookahead_guard
15539 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15540 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15541 aarch64_get_separate_components
15543 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15544 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15545 aarch64_components_for_bb
15547 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15548 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15549 aarch64_disqualify_components
15551 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15552 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15553 aarch64_emit_prologue_components
15555 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15556 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15557 aarch64_emit_epilogue_components
15559 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15560 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15561 aarch64_set_handled_components
15563 #undef TARGET_TRAMPOLINE_INIT
15564 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15566 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15567 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15569 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15570 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15572 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15573 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15574 aarch64_builtin_support_vector_misalignment
15576 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15577 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15579 #undef TARGET_VECTORIZE_ADD_STMT_COST
15580 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15582 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15583 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15584 aarch64_builtin_vectorization_cost
15586 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15587 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15589 #undef TARGET_VECTORIZE_BUILTINS
15590 #define TARGET_VECTORIZE_BUILTINS
15592 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15593 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15594 aarch64_builtin_vectorized_function
15596 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15597 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15598 aarch64_autovectorize_vector_sizes
15600 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15601 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15602 aarch64_atomic_assign_expand_fenv
15604 /* Section anchor support. */
15606 #undef TARGET_MIN_ANCHOR_OFFSET
15607 #define TARGET_MIN_ANCHOR_OFFSET -256
15609 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15610 byte offset; we can do much more for larger data types, but have no way
15611 to determine the size of the access. We assume accesses are aligned. */
15612 #undef TARGET_MAX_ANCHOR_OFFSET
15613 #define TARGET_MAX_ANCHOR_OFFSET 4095
15615 #undef TARGET_VECTOR_ALIGNMENT
15616 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15618 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15619 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15620 aarch64_simd_vector_alignment_reachable
15622 /* vec_perm support. */
15624 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15625 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15626 aarch64_vectorize_vec_perm_const_ok
15628 #undef TARGET_INIT_LIBFUNCS
15629 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15631 #undef TARGET_FIXED_CONDITION_CODE_REGS
15632 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15634 #undef TARGET_FLAGS_REGNUM
15635 #define TARGET_FLAGS_REGNUM CC_REGNUM
15637 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15638 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15640 #undef TARGET_ASAN_SHADOW_OFFSET
15641 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15643 #undef TARGET_LEGITIMIZE_ADDRESS
15644 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15646 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15647 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15648 aarch64_use_by_pieces_infrastructure_p
15650 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15651 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15653 #undef TARGET_CAN_USE_DOLOOP_P
15654 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15656 #undef TARGET_SCHED_ADJUST_PRIORITY
15657 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15659 #undef TARGET_SCHED_MACRO_FUSION_P
15660 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15662 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15663 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15665 #undef TARGET_SCHED_FUSION_PRIORITY
15666 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15668 #undef TARGET_UNSPEC_MAY_TRAP_P
15669 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15671 #undef TARGET_USE_PSEUDO_PIC_REG
15672 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15674 #undef TARGET_PRINT_OPERAND
15675 #define TARGET_PRINT_OPERAND aarch64_print_operand
15677 #undef TARGET_PRINT_OPERAND_ADDRESS
15678 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15680 #undef TARGET_OPTAB_SUPPORTED_P
15681 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15683 #undef TARGET_OMIT_STRUCT_RETURN_REG
15684 #define TARGET_OMIT_STRUCT_RETURN_REG true
15686 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15687 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15688 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15690 #undef TARGET_HARD_REGNO_MODE_OK
15691 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15693 #undef TARGET_MODES_TIEABLE_P
15694 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15696 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15697 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15698 aarch64_hard_regno_call_part_clobbered
15700 #if CHECKING_P
15701 #undef TARGET_RUN_TARGET_SELFTESTS
15702 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15703 #endif /* #if CHECKING_P */
15705 struct gcc_target targetm = TARGET_INITIALIZER;
15707 #include "gt-aarch64.h"