[2/77] Add an E_ prefix to case statements
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob ced6f9bf82948b3c3da022f360f03fd2268551d6
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
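/* An illustrative (non-normative) mapping of these classes to typical
   AArch64 assembly address syntax, assuming x0/w1 as base/index registers:

     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!   or   [x0], #16
     ADDRESS_REG_REG    [x0, x1]   or   [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC   a pc-relative literal access such as  ldr x0, .Lpool  */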
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actually 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
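/* For illustration only: these entries are what allow an override string
   such as -moverride=tune=... or -moverride=fuse=... to be dispatched to
   aarch64_parse_tune_string / aarch64_parse_fuse_string, which in turn look
   the individual flag names up in the aarch64_tuning_flags and
   aarch64_fusible_pairs tables above ("none" and "all" being always
   accepted).  The concrete flag names come from the .def files included
   there.  */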
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespective of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
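/* Worked examples, assuming the standard AArch64 DWARF numbering
   (x0-x30 -> 0-30, sp -> 31, v0-v31 -> 64-95): x5 maps to 5, sp to 31,
   v3 to 64 + 3 = 67; any other register (e.g. the CC register) yields
   DWARF_FRAME_REGISTERS, i.e. "no DWARF equivalent".  */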
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
1070 /* Implement HARD_REGNO_NREGS. */
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
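/* A worked sketch of the arithmetic above (assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16): a TImode value (16 bytes) occupies
   (16 + 8 - 1) / 8 = 2 general registers but (16 + 16 - 1) / 16 = 1 FP/SIMD
   register, while an OImode value (32 bytes) occupies 2 FP/SIMD
   registers.  */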
1086 /* Implement HARD_REGNO_MODE_OK. */
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return 1;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return
1110 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111 else
1112 return 1;
1115 return 0;
1118 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1119 machine_mode
1120 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1121 machine_mode mode)
1123 /* Handle modes that fit within single registers. */
1124 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1126 if (GET_MODE_SIZE (mode) >= 4)
1127 return mode;
1128 else
1129 return SImode;
1131 /* Fall back to generic for multi-reg and very large modes. */
1132 else
1133 return choose_hard_reg_mode (regno, nregs, false);
1136 /* Return true if calls to DECL should be treated as
1137 long-calls (i.e. called via a register). */
1138 static bool
1139 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1141 return false;
1144 /* Return true if calls to symbol-ref SYM should be treated as
1145 long-calls (i.e. called via a register). */
1146 bool
1147 aarch64_is_long_call_p (rtx sym)
1149 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1152 /* Return true if calls to symbol-ref SYM should not go through
1153 plt stubs. */
1155 bool
1156 aarch64_is_noplt_call_p (rtx sym)
1158 const_tree decl = SYMBOL_REF_DECL (sym);
1160 if (flag_pic
1161 && decl
1162 && (!flag_plt
1163 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1164 && !targetm.binds_local_p (decl))
1165 return true;
1167 return false;
1170 /* Return true if the offsets to a zero/sign-extract operation
1171 represent an expression that matches an extend operation. The
1172 operands represent the parameters from
1174 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1175 bool
1176 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1177 rtx extract_imm)
1179 HOST_WIDE_INT mult_val, extract_val;
1181 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1182 return false;
1184 mult_val = INTVAL (mult_imm);
1185 extract_val = INTVAL (extract_imm);
1187 if (extract_val > 8
1188 && extract_val < GET_MODE_BITSIZE (mode)
1189 && exact_log2 (extract_val & ~7) > 0
1190 && (extract_val & 7) <= 4
1191 && mult_val == (1 << (extract_val & 7)))
1192 return true;
1194 return false;
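/* Worked example (illustrative values): for
     (zero_extract:DI (mult (reg) (const_int 4)) (const_int 34) (const_int 0))
   EXTRACT_IMM is 34: 34 > 8, 34 < 64, exact_log2 (34 & ~7) = exact_log2 (32)
   = 5 > 0, 34 & 7 = 2 <= 4, and MULT_IMM == 1 << 2 == 4, so the expression
   is accepted -- it is a 32-bit value zero-extended and shifted left by 2,
   i.e. a "uxtw #2" style extend.  */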
1197 /* Emit an insn that's a simple single-set. Both the operands must be
1198 known to be valid. */
1199 inline static rtx_insn *
1200 emit_set_insn (rtx x, rtx y)
1202 return emit_insn (gen_rtx_SET (x, y));
1205 /* X and Y are two things to compare using CODE. Emit the compare insn and
1206 return the rtx for register 0 in the proper mode. */
1208 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1210 machine_mode mode = SELECT_CC_MODE (code, x, y);
1211 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1213 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1214 return cc_reg;
1217 /* Build the SYMBOL_REF for __tls_get_addr. */
1219 static GTY(()) rtx tls_get_addr_libfunc;
1222 aarch64_tls_get_addr (void)
1224 if (!tls_get_addr_libfunc)
1225 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1226 return tls_get_addr_libfunc;
1229 /* Return the TLS model to use for ADDR. */
1231 static enum tls_model
1232 tls_symbolic_operand_type (rtx addr)
1234 enum tls_model tls_kind = TLS_MODEL_NONE;
1235 rtx sym, addend;
1237 if (GET_CODE (addr) == CONST)
1239 split_const (addr, &sym, &addend);
1240 if (GET_CODE (sym) == SYMBOL_REF)
1241 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1243 else if (GET_CODE (addr) == SYMBOL_REF)
1244 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1246 return tls_kind;
1249 /* We allow lo_sum's in our legitimate addresses
1250 so that combine will take care of combining addresses where
1251 necessary, but for generation purposes, we generate the address
1252 as:
1253 RTL Absolute
1254 tmp = hi (symbol_ref); adrp x1, foo
1255 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1258 PIC TLS
1259 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1260 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1261 bl __tls_get_addr
1264 Load TLS symbol, depending on TLS mechanism and TLS access model.
1266 Global Dynamic - Traditional TLS:
1267 adrp tmp, :tlsgd:imm
1268 add dest, tmp, #:tlsgd_lo12:imm
1269 bl __tls_get_addr
1271 Global Dynamic - TLS Descriptors:
1272 adrp dest, :tlsdesc:imm
1273 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1274 add dest, dest, #:tlsdesc_lo12:imm
1275 blr tmp
1276 mrs tp, tpidr_el0
1277 add dest, dest, tp
1279 Initial Exec:
1280 mrs tp, tpidr_el0
1281 adrp tmp, :gottprel:imm
1282 ldr dest, [tmp, #:gottprel_lo12:imm]
1283 add dest, dest, tp
1285 Local Exec:
1286 mrs tp, tpidr_el0
1287 add t0, tp, #:tprel_hi12:imm, lsl #12
1288 add t0, t0, #:tprel_lo12_nc:imm
1291 static void
1292 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1293 enum aarch64_symbol_type type)
1295 switch (type)
1297 case SYMBOL_SMALL_ABSOLUTE:
1299 /* In ILP32, the mode of dest can be either SImode or DImode. */
1300 rtx tmp_reg = dest;
1301 machine_mode mode = GET_MODE (dest);
1303 gcc_assert (mode == Pmode || mode == ptr_mode);
1305 if (can_create_pseudo_p ())
1306 tmp_reg = gen_reg_rtx (mode);
1308 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1309 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1310 return;
1313 case SYMBOL_TINY_ABSOLUTE:
1314 emit_insn (gen_rtx_SET (dest, imm));
1315 return;
1317 case SYMBOL_SMALL_GOT_28K:
1319 machine_mode mode = GET_MODE (dest);
1320 rtx gp_rtx = pic_offset_table_rtx;
1321 rtx insn;
1322 rtx mem;
1324 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1325 here before RTL expansion. Tree IVOPTS will generate an RTL
1326 pattern to decide rtx costs, in which case pic_offset_table_rtx is
1327 not initialized. In that case there is no need to generate the
1328 first adrp instruction, as the final cost for global variable
1329 access is one instruction. */
1330 if (gp_rtx != NULL)
1332 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
1333 we use the page base as the GOT base, the first page may be
1334 wasted; in the worst case there is only 28K of space for the GOT).
1336 The generated instruction sequence for accessing a global variable is:
1339 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1341 Only one instruction is needed. But we must initialize
1342 pic_offset_table_rtx properly. We generate an initialization insn
1343 for every global access, and allow CSE to remove all redundant copies.
1345 The final instruction sequence will look like the following
1346 for multiple global variable accesses.
1348 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1350 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1351 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1352 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1353 ... */
1355 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1356 crtl->uses_pic_offset_table = 1;
1357 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1359 if (mode != GET_MODE (gp_rtx))
1360 gp_rtx = gen_lowpart (mode, gp_rtx);
1364 if (mode == ptr_mode)
1366 if (mode == DImode)
1367 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1368 else
1369 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1371 mem = XVECEXP (SET_SRC (insn), 0, 0);
1373 else
1375 gcc_assert (mode == Pmode);
1377 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1378 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1381 /* The operand is expected to be a MEM. Whenever the related insn
1382 pattern changes, the code above which calculates MEM should be
1383 updated. */
1384 gcc_assert (GET_CODE (mem) == MEM);
1385 MEM_READONLY_P (mem) = 1;
1386 MEM_NOTRAP_P (mem) = 1;
1387 emit_insn (insn);
1388 return;
1391 case SYMBOL_SMALL_GOT_4G:
1393 /* In ILP32, the mode of dest can be either SImode or DImode,
1394 while the got entry is always of SImode size. The mode of
1395 dest depends on how dest is used: if dest is assigned to a
1396 pointer (e.g. in the memory), it has SImode; it may have
1397 DImode if dest is dereferenced to access the memory.
1398 This is why we have to handle three different ldr_got_small
1399 patterns here (two patterns for ILP32). */
1401 rtx insn;
1402 rtx mem;
1403 rtx tmp_reg = dest;
1404 machine_mode mode = GET_MODE (dest);
1406 if (can_create_pseudo_p ())
1407 tmp_reg = gen_reg_rtx (mode);
1409 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1410 if (mode == ptr_mode)
1412 if (mode == DImode)
1413 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1414 else
1415 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1417 mem = XVECEXP (SET_SRC (insn), 0, 0);
1419 else
1421 gcc_assert (mode == Pmode);
1423 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1424 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1427 gcc_assert (GET_CODE (mem) == MEM);
1428 MEM_READONLY_P (mem) = 1;
1429 MEM_NOTRAP_P (mem) = 1;
1430 emit_insn (insn);
1431 return;
1434 case SYMBOL_SMALL_TLSGD:
1436 rtx_insn *insns;
1437 machine_mode mode = GET_MODE (dest);
1438 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1440 start_sequence ();
1441 if (TARGET_ILP32)
1442 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1443 else
1444 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1445 insns = get_insns ();
1446 end_sequence ();
1448 RTL_CONST_CALL_P (insns) = 1;
1449 emit_libcall_block (insns, dest, result, imm);
1450 return;
1453 case SYMBOL_SMALL_TLSDESC:
1455 machine_mode mode = GET_MODE (dest);
1456 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1457 rtx tp;
1459 gcc_assert (mode == Pmode || mode == ptr_mode);
1461 /* In ILP32, the got entry is always of SImode size. Unlike
1462 small GOT, the dest is fixed at reg 0. */
1463 if (TARGET_ILP32)
1464 emit_insn (gen_tlsdesc_small_si (imm));
1465 else
1466 emit_insn (gen_tlsdesc_small_di (imm));
1467 tp = aarch64_load_tp (NULL);
1469 if (mode != Pmode)
1470 tp = gen_lowpart (mode, tp);
1472 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1473 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1474 return;
1477 case SYMBOL_SMALL_TLSIE:
1479 /* In ILP32, the mode of dest can be either SImode or DImode,
1480 while the got entry is always of SImode size. The mode of
1481 dest depends on how dest is used: if dest is assigned to a
1482 pointer (e.g. in the memory), it has SImode; it may have
1483 DImode if dest is dereferenced to access the memory.
1484 This is why we have to handle three different tlsie_small
1485 patterns here (two patterns for ILP32). */
1486 machine_mode mode = GET_MODE (dest);
1487 rtx tmp_reg = gen_reg_rtx (mode);
1488 rtx tp = aarch64_load_tp (NULL);
1490 if (mode == ptr_mode)
1492 if (mode == DImode)
1493 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1494 else
1496 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1497 tp = gen_lowpart (mode, tp);
1500 else
1502 gcc_assert (mode == Pmode);
1503 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1506 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1507 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508 return;
1511 case SYMBOL_TLSLE12:
1512 case SYMBOL_TLSLE24:
1513 case SYMBOL_TLSLE32:
1514 case SYMBOL_TLSLE48:
1516 machine_mode mode = GET_MODE (dest);
1517 rtx tp = aarch64_load_tp (NULL);
1519 if (mode != Pmode)
1520 tp = gen_lowpart (mode, tp);
1522 switch (type)
1524 case SYMBOL_TLSLE12:
1525 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1526 (dest, tp, imm));
1527 break;
1528 case SYMBOL_TLSLE24:
1529 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1530 (dest, tp, imm));
1531 break;
1532 case SYMBOL_TLSLE32:
1533 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1534 (dest, imm));
1535 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1536 (dest, dest, tp));
1537 break;
1538 case SYMBOL_TLSLE48:
1539 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1540 (dest, imm));
1541 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1542 (dest, dest, tp));
1543 break;
1544 default:
1545 gcc_unreachable ();
1548 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1549 return;
1552 case SYMBOL_TINY_GOT:
1553 emit_insn (gen_ldr_got_tiny (dest, imm));
1554 return;
1556 case SYMBOL_TINY_TLSIE:
1558 machine_mode mode = GET_MODE (dest);
1559 rtx tp = aarch64_load_tp (NULL);
1561 if (mode == ptr_mode)
1563 if (mode == DImode)
1564 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1565 else
1567 tp = gen_lowpart (mode, tp);
1568 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1571 else
1573 gcc_assert (mode == Pmode);
1574 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1577 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1578 return;
1581 default:
1582 gcc_unreachable ();
1586 /* Emit a move from SRC to DEST. Assume that the move expanders can
1587 handle all moves if !can_create_pseudo_p (). The distinction is
1588 important because, unlike emit_move_insn, the move expanders know
1589 how to force Pmode objects into the constant pool even when the
1590 constant pool address is not itself legitimate. */
1591 static rtx
1592 aarch64_emit_move (rtx dest, rtx src)
1594 return (can_create_pseudo_p ()
1595 ? emit_move_insn (dest, src)
1596 : emit_move_insn_1 (dest, src));
1599 /* Split a 128-bit move operation into two 64-bit move operations,
1600 taking care to handle partial overlap of register to register
1601 copies. Special cases are needed when moving between GP regs and
1602 FP regs. SRC can be a register, constant or memory; DST a register
1603 or memory. If either operand is memory it must not have any side
1604 effects. */
1605 void
1606 aarch64_split_128bit_move (rtx dst, rtx src)
1608 rtx dst_lo, dst_hi;
1609 rtx src_lo, src_hi;
1611 machine_mode mode = GET_MODE (dst);
1613 gcc_assert (mode == TImode || mode == TFmode);
1614 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1615 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1617 if (REG_P (dst) && REG_P (src))
1619 int src_regno = REGNO (src);
1620 int dst_regno = REGNO (dst);
1622 /* Handle FP <-> GP regs. */
1623 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1625 src_lo = gen_lowpart (word_mode, src);
1626 src_hi = gen_highpart (word_mode, src);
1628 if (mode == TImode)
1630 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1631 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1633 else
1635 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1636 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1638 return;
1640 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1642 dst_lo = gen_lowpart (word_mode, dst);
1643 dst_hi = gen_highpart (word_mode, dst);
1645 if (mode == TImode)
1647 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1648 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1650 else
1652 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1653 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1655 return;
1659 dst_lo = gen_lowpart (word_mode, dst);
1660 dst_hi = gen_highpart (word_mode, dst);
1661 src_lo = gen_lowpart (word_mode, src);
1662 src_hi = gen_highpart_mode (word_mode, mode, src);
1664 /* At most one pairing may overlap. */
1665 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1667 aarch64_emit_move (dst_hi, src_hi);
1668 aarch64_emit_move (dst_lo, src_lo);
1670 else
1672 aarch64_emit_move (dst_lo, src_lo);
1673 aarch64_emit_move (dst_hi, src_hi);
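/* Worked example of the overlap handling above (register numbers are
   illustrative, assuming the usual little-endian lowpart ordering):
   copying a TImode value from the pair x1:x2 into x2:x3 gives
   dst_lo == x2 and src_hi == x2, so the high halves are moved first
   (x3 <- x2, then x2 <- x1); in the other direction there is no such
   overlap and the default low-then-high order is safe.  */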
1677 bool
1678 aarch64_split_128bit_move_p (rtx dst, rtx src)
1680 return (! REG_P (src)
1681 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1684 /* Split a complex SIMD combine. */
1686 void
1687 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1689 machine_mode src_mode = GET_MODE (src1);
1690 machine_mode dst_mode = GET_MODE (dst);
1692 gcc_assert (VECTOR_MODE_P (dst_mode));
1693 gcc_assert (register_operand (dst, dst_mode)
1694 && register_operand (src1, src_mode)
1695 && register_operand (src2, src_mode));
1697 rtx (*gen) (rtx, rtx, rtx);
1699 switch (src_mode)
1701 case E_V8QImode:
1702 gen = gen_aarch64_simd_combinev8qi;
1703 break;
1704 case E_V4HImode:
1705 gen = gen_aarch64_simd_combinev4hi;
1706 break;
1707 case E_V2SImode:
1708 gen = gen_aarch64_simd_combinev2si;
1709 break;
1710 case E_V4HFmode:
1711 gen = gen_aarch64_simd_combinev4hf;
1712 break;
1713 case E_V2SFmode:
1714 gen = gen_aarch64_simd_combinev2sf;
1715 break;
1716 case E_DImode:
1717 gen = gen_aarch64_simd_combinedi;
1718 break;
1719 case E_DFmode:
1720 gen = gen_aarch64_simd_combinedf;
1721 break;
1722 default:
1723 gcc_unreachable ();
1726 emit_insn (gen (dst, src1, src2));
1727 return;
1730 /* Split a complex SIMD move. */
1732 void
1733 aarch64_split_simd_move (rtx dst, rtx src)
1735 machine_mode src_mode = GET_MODE (src);
1736 machine_mode dst_mode = GET_MODE (dst);
1738 gcc_assert (VECTOR_MODE_P (dst_mode));
1740 if (REG_P (dst) && REG_P (src))
1742 rtx (*gen) (rtx, rtx);
1744 gcc_assert (VECTOR_MODE_P (src_mode));
1746 switch (src_mode)
1748 case E_V16QImode:
1749 gen = gen_aarch64_split_simd_movv16qi;
1750 break;
1751 case E_V8HImode:
1752 gen = gen_aarch64_split_simd_movv8hi;
1753 break;
1754 case E_V4SImode:
1755 gen = gen_aarch64_split_simd_movv4si;
1756 break;
1757 case E_V2DImode:
1758 gen = gen_aarch64_split_simd_movv2di;
1759 break;
1760 case E_V8HFmode:
1761 gen = gen_aarch64_split_simd_movv8hf;
1762 break;
1763 case E_V4SFmode:
1764 gen = gen_aarch64_split_simd_movv4sf;
1765 break;
1766 case E_V2DFmode:
1767 gen = gen_aarch64_split_simd_movv2df;
1768 break;
1769 default:
1770 gcc_unreachable ();
1773 emit_insn (gen (dst, src));
1774 return;
1778 bool
1779 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1780 machine_mode ymode, rtx y)
1782 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1783 gcc_assert (r != NULL);
1784 return rtx_equal_p (x, r);
1788 static rtx
1789 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1791 if (can_create_pseudo_p ())
1792 return force_reg (mode, value);
1793 else
1795 x = aarch64_emit_move (x, value);
1796 return x;
1801 static rtx
1802 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1804 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1806 rtx high;
1807 /* Load the full offset into a register. This
1808 might be improvable in the future. */
1809 high = GEN_INT (offset);
1810 offset = 0;
1811 high = aarch64_force_temporary (mode, temp, high);
1812 reg = aarch64_force_temporary (mode, temp,
1813 gen_rtx_PLUS (mode, high, reg));
1815 return plus_constant (mode, reg, offset);
1818 static int
1819 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1820 machine_mode mode)
1822 int i;
1823 unsigned HOST_WIDE_INT val, val2, mask;
1824 int one_match, zero_match;
1825 int num_insns;
1827 val = INTVAL (imm);
1829 if (aarch64_move_imm (val, mode))
1831 if (generate)
1832 emit_insn (gen_rtx_SET (dest, imm));
1833 return 1;
1836 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1837 (with XXXX non-zero). In that case check to see if the move can be done in
1838 a smaller mode. */
1839 val2 = val & 0xffffffff;
1840 if (mode == DImode
1841 && aarch64_move_imm (val2, SImode)
1842 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1844 if (generate)
1845 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1847 /* Check if we have to emit a second instruction by checking to see
1848 if any of the upper 32 bits of the original DI mode value is set. */
1849 if (val == val2)
1850 return 1;
1852 i = (val >> 48) ? 48 : 32;
1854 if (generate)
1855 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1856 GEN_INT ((val >> i) & 0xffff)));
1858 return 2;
1861 if ((val >> 32) == 0 || mode == SImode)
1863 if (generate)
1865 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1866 if (mode == SImode)
1867 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1868 GEN_INT ((val >> 16) & 0xffff)));
1869 else
1870 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1871 GEN_INT ((val >> 16) & 0xffff)));
1873 return 2;
1876 /* Remaining cases are all for DImode. */
1878 mask = 0xffff;
1879 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1880 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1881 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1882 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1884 if (zero_match != 2 && one_match != 2)
1886 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1887 For a 64-bit bitmask try whether changing 16 bits to all ones or
1888 zeroes creates a valid bitmask. To check any repeated bitmask,
1889 try using 16 bits from the other 32-bit half of val. */
1891 for (i = 0; i < 64; i += 16, mask <<= 16)
1893 val2 = val & ~mask;
1894 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1895 break;
1896 val2 = val | mask;
1897 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1898 break;
1899 val2 = val2 & ~mask;
1900 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1901 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1902 break;
1904 if (i != 64)
1906 if (generate)
1908 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1909 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1910 GEN_INT ((val >> i) & 0xffff)));
1912 return 2;
1916 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1917 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1918 otherwise skip zero bits. */
1920 num_insns = 1;
1921 mask = 0xffff;
1922 val2 = one_match > zero_match ? ~val : val;
1923 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1925 if (generate)
1926 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1927 ? (val | ~(mask << i))
1928 : (val & (mask << i)))));
1929 for (i += 16; i < 64; i += 16)
1931 if ((val2 & (mask << i)) == 0)
1932 continue;
1933 if (generate)
1934 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1935 GEN_INT ((val >> i) & 0xffff)));
1936 num_insns ++;
1939 return num_insns;
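/* Worked example (assuming DImode): for the constant 0x0000123456789abc
   no single MOV/MOVN or bitmask immediate matches, zero_match is 1 and
   one_match is 0, so the sequence produced is

       mov   dest, #0x9abc
       movk  dest, #0x5678, lsl #16
       movk  dest, #0x1234, lsl #32

   and 3 is returned; the all-zero top 16 bits are skipped.  */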
1943 void
1944 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1946 machine_mode mode = GET_MODE (dest);
1948 gcc_assert (mode == SImode || mode == DImode);
1950 /* Check on what type of symbol it is. */
1951 if (GET_CODE (imm) == SYMBOL_REF
1952 || GET_CODE (imm) == LABEL_REF
1953 || GET_CODE (imm) == CONST)
1955 rtx mem, base, offset;
1956 enum aarch64_symbol_type sty;
1958 /* If we have (const (plus symbol offset)), separate out the offset
1959 before we start classifying the symbol. */
1960 split_const (imm, &base, &offset);
1962 sty = aarch64_classify_symbol (base, offset);
1963 switch (sty)
1965 case SYMBOL_FORCE_TO_MEM:
1966 if (offset != const0_rtx
1967 && targetm.cannot_force_const_mem (mode, imm))
1969 gcc_assert (can_create_pseudo_p ());
1970 base = aarch64_force_temporary (mode, dest, base);
1971 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1972 aarch64_emit_move (dest, base);
1973 return;
1976 mem = force_const_mem (ptr_mode, imm);
1977 gcc_assert (mem);
1979 /* If we aren't generating PC relative literals, then
1980 we need to expand the literal pool access carefully.
1981 This is something that needs to be done in a number
1982 of places, so could well live as a separate function. */
1983 if (!aarch64_pcrelative_literal_loads)
1985 gcc_assert (can_create_pseudo_p ());
1986 base = gen_reg_rtx (ptr_mode);
1987 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1988 if (ptr_mode != Pmode)
1989 base = convert_memory_address (Pmode, base);
1990 mem = gen_rtx_MEM (ptr_mode, base);
1993 if (mode != ptr_mode)
1994 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1996 emit_insn (gen_rtx_SET (dest, mem));
1998 return;
2000 case SYMBOL_SMALL_TLSGD:
2001 case SYMBOL_SMALL_TLSDESC:
2002 case SYMBOL_SMALL_TLSIE:
2003 case SYMBOL_SMALL_GOT_28K:
2004 case SYMBOL_SMALL_GOT_4G:
2005 case SYMBOL_TINY_GOT:
2006 case SYMBOL_TINY_TLSIE:
2007 if (offset != const0_rtx)
2009 gcc_assert(can_create_pseudo_p ());
2010 base = aarch64_force_temporary (mode, dest, base);
2011 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2012 aarch64_emit_move (dest, base);
2013 return;
2015 /* FALLTHRU */
2017 case SYMBOL_SMALL_ABSOLUTE:
2018 case SYMBOL_TINY_ABSOLUTE:
2019 case SYMBOL_TLSLE12:
2020 case SYMBOL_TLSLE24:
2021 case SYMBOL_TLSLE32:
2022 case SYMBOL_TLSLE48:
2023 aarch64_load_symref_appropriately (dest, imm, sty);
2024 return;
2026 default:
2027 gcc_unreachable ();
2031 if (!CONST_INT_P (imm))
2033 if (GET_CODE (imm) == HIGH)
2034 emit_insn (gen_rtx_SET (dest, imm));
2035 else
2037 rtx mem = force_const_mem (mode, imm);
2038 gcc_assert (mem);
2039 emit_insn (gen_rtx_SET (dest, mem));
2042 return;
2045 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2048 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2049 temporary value if necessary. FRAME_RELATED_P should be true if
2050 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2051 to the generated instructions. If SCRATCHREG is known to hold
2052 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2053 immediate again.
2055 Since this function may be used to adjust the stack pointer, we must
2056 ensure that it cannot cause transient stack deallocation (for example
2057 by first incrementing SP and then decrementing when adjusting by a
2058 large immediate). */
2060 static void
2061 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2062 HOST_WIDE_INT delta, bool frame_related_p,
2063 bool emit_move_imm)
2065 HOST_WIDE_INT mdelta = abs_hwi (delta);
2066 rtx this_rtx = gen_rtx_REG (mode, regnum);
2067 rtx_insn *insn;
2069 if (!mdelta)
2070 return;
2072 /* Single instruction adjustment. */
2073 if (aarch64_uimm12_shift (mdelta))
2075 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2076 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2077 return;
2080 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2081 Only do this if mdelta is not a valid move immediate, as adjusting
2082 using a move followed by an add/sub is better. */
2083 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2085 HOST_WIDE_INT low_off = mdelta & 0xfff;
2087 low_off = delta < 0 ? -low_off : low_off;
2088 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2089 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2090 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2091 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2092 return;
2095 /* Emit a move immediate if required and an addition/subtraction. */
2096 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2097 if (emit_move_imm)
2098 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2099 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2100 : gen_add2_insn (this_rtx, scratch_rtx));
2101 if (frame_related_p)
2103 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2104 rtx adj = plus_constant (mode, this_rtx, delta);
2105 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
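/* Illustrative example, not part of the original source: a delta of
   0x123456 is below 2^24 and not a valid move immediate, so the
   function emits two immediate additions,
	add  reg, reg, 0x456
	add  reg, reg, 0x123000
   (the second immediate being a 12-bit value shifted left by 12),
   whereas a delta of 0x1234567 is first built in SCRATCHREG and then
   applied with a single register add/sub.  */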
2109 static inline void
2110 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2111 HOST_WIDE_INT delta)
2113 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2116 static inline void
2117 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2119 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2120 true, emit_move_imm);
2123 static inline void
2124 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2126 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2127 frame_related_p, true);
2130 static bool
2131 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2132 tree exp ATTRIBUTE_UNUSED)
2134 /* Currently, always true. */
2135 return true;
2138 /* Implement TARGET_PASS_BY_REFERENCE. */
2140 static bool
2141 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2142 machine_mode mode,
2143 const_tree type,
2144 bool named ATTRIBUTE_UNUSED)
2146 HOST_WIDE_INT size;
2147 machine_mode dummymode;
2148 int nregs;
2150 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2151 size = (mode == BLKmode && type)
2152 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2154 /* Aggregates are passed by reference based on their size. */
2155 if (type && AGGREGATE_TYPE_P (type))
2157 size = int_size_in_bytes (type);
2160 /* Variable sized arguments are always passed by reference. */
2161 if (size < 0)
2162 return true;
2164 /* Can this be a candidate to be passed in fp/simd register(s)? */
2165 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2166 &dummymode, &nregs,
2167 NULL))
2168 return false;
2170 /* Arguments which are variable sized or larger than 2 registers are
2171 passed by reference unless they are a homogeneous floating-point
2172 aggregate. */
2173 return size > 2 * UNITS_PER_WORD;
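/* Illustrative examples, not part of the original source: a structure
   of four doubles (32 bytes) is a homogeneous floating-point aggregate
   and is passed by value in SIMD/FP registers, while a structure of
   five ints (20 bytes) needs more than two GP registers and is
   therefore passed by reference.  */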
2176 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2177 static bool
2178 aarch64_return_in_msb (const_tree valtype)
2180 machine_mode dummy_mode;
2181 int dummy_int;
2183 /* Never happens in little-endian mode. */
2184 if (!BYTES_BIG_ENDIAN)
2185 return false;
2187 /* Only composite types smaller than or equal to 16 bytes can
2188 be potentially returned in registers. */
2189 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2190 || int_size_in_bytes (valtype) <= 0
2191 || int_size_in_bytes (valtype) > 16)
2192 return false;
2194 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2195 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2196 is always passed/returned in the least significant bits of fp/simd
2197 register(s). */
2198 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2199 &dummy_mode, &dummy_int, NULL))
2200 return false;
2202 return true;
2205 /* Implement TARGET_FUNCTION_VALUE.
2206 Define how to find the value returned by a function. */
2208 static rtx
2209 aarch64_function_value (const_tree type, const_tree func,
2210 bool outgoing ATTRIBUTE_UNUSED)
2212 machine_mode mode;
2213 int unsignedp;
2214 int count;
2215 machine_mode ag_mode;
2217 mode = TYPE_MODE (type);
2218 if (INTEGRAL_TYPE_P (type))
2219 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2221 if (aarch64_return_in_msb (type))
2223 HOST_WIDE_INT size = int_size_in_bytes (type);
2225 if (size % UNITS_PER_WORD != 0)
2227 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2228 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2232 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2233 &ag_mode, &count, NULL))
2235 if (!aarch64_composite_type_p (type, mode))
2237 gcc_assert (count == 1 && mode == ag_mode);
2238 return gen_rtx_REG (mode, V0_REGNUM);
2240 else
2242 int i;
2243 rtx par;
2245 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2246 for (i = 0; i < count; i++)
2248 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2249 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2250 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2251 XVECEXP (par, 0, i) = tmp;
2253 return par;
2256 else
2257 return gen_rtx_REG (mode, R0_REGNUM);
2260 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2261 Return true if REGNO is the number of a hard register in which the values
2262 of called function may come back. */
2264 static bool
2265 aarch64_function_value_regno_p (const unsigned int regno)
2267 /* Maximum of 16 bytes can be returned in the general registers. Examples
2268 of 16-byte return values are: 128-bit integers and 16-byte small
2269 structures (excluding homogeneous floating-point aggregates). */
2270 if (regno == R0_REGNUM || regno == R1_REGNUM)
2271 return true;
2273 /* Up to four fp/simd registers can return a function value, e.g. a
2274 homogeneous floating-point aggregate having four members. */
2275 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2276 return TARGET_FLOAT;
2278 return false;
2281 /* Implement TARGET_RETURN_IN_MEMORY.
2283 If the type T of the result of a function is such that
2284 void func (T arg)
2285 would require that arg be passed as a value in a register (or set of
2286 registers) according to the parameter passing rules, then the result
2287 is returned in the same registers as would be used for such an
2288 argument. */
2290 static bool
2291 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2293 HOST_WIDE_INT size;
2294 machine_mode ag_mode;
2295 int count;
2297 if (!AGGREGATE_TYPE_P (type)
2298 && TREE_CODE (type) != COMPLEX_TYPE
2299 && TREE_CODE (type) != VECTOR_TYPE)
2300 /* Simple scalar types are always returned in registers. */
2301 return false;
2303 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2304 type,
2305 &ag_mode,
2306 &count,
2307 NULL))
2308 return false;
2310 /* Types larger than 2 registers are returned in memory. */
2311 size = int_size_in_bytes (type);
2312 return (size < 0 || size > 2 * UNITS_PER_WORD);
2315 static bool
2316 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2317 const_tree type, int *nregs)
2319 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2320 return aarch64_vfp_is_call_or_return_candidate (mode,
2321 type,
2322 &pcum->aapcs_vfp_rmode,
2323 nregs,
2324 NULL);
2327 /* Given MODE and TYPE of a function argument, return the alignment in
2328 bits. The idea is to suppress any stronger alignment requested by
2329 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2330 This is a helper function for local use only. */
2332 static unsigned int
2333 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2335 if (!type)
2336 return GET_MODE_ALIGNMENT (mode);
2338 if (integer_zerop (TYPE_SIZE (type)))
2339 return 0;
2341 gcc_assert (TYPE_MODE (type) == mode);
2343 if (!AGGREGATE_TYPE_P (type))
2344 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2346 if (TREE_CODE (type) == ARRAY_TYPE)
2347 return TYPE_ALIGN (TREE_TYPE (type));
2349 unsigned int alignment = 0;
2350 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2351 if (TREE_CODE (field) == FIELD_DECL)
2352 alignment = std::max (alignment, DECL_ALIGN (field));
2354 return alignment;
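/* Illustrative example, not part of the original source: for
   struct s { double d; char c; } the maximum field alignment is
   64 bits, so 64 is returned even if the user declared the struct
   with a larger aligned attribute; only the natural alignment is
   used for argument layout.  */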
2357 /* Lay out a function argument according to the AAPCS64 rules. The rule
2358 numbers below refer to the rule numbers in the AAPCS64. */
2360 static void
2361 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2362 const_tree type,
2363 bool named ATTRIBUTE_UNUSED)
2365 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2366 int ncrn, nvrn, nregs;
2367 bool allocate_ncrn, allocate_nvrn;
2368 HOST_WIDE_INT size;
2370 /* We need to do this once per argument. */
2371 if (pcum->aapcs_arg_processed)
2372 return;
2374 pcum->aapcs_arg_processed = true;
2376 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2377 size
2378 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2379 UNITS_PER_WORD);
2381 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2382 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2383 mode,
2384 type,
2385 &nregs);
2387 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2388 The following code thus handles passing by SIMD/FP registers first. */
2390 nvrn = pcum->aapcs_nvrn;
2392 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2393 and homogeneous short-vector aggregates (HVA). */
2394 if (allocate_nvrn)
2396 if (!TARGET_FLOAT)
2397 aarch64_err_no_fpadvsimd (mode, "argument");
2399 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2401 pcum->aapcs_nextnvrn = nvrn + nregs;
2402 if (!aarch64_composite_type_p (type, mode))
2404 gcc_assert (nregs == 1);
2405 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2407 else
2409 rtx par;
2410 int i;
2411 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2412 for (i = 0; i < nregs; i++)
2414 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2415 V0_REGNUM + nvrn + i);
2416 tmp = gen_rtx_EXPR_LIST
2417 (VOIDmode, tmp,
2418 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2419 XVECEXP (par, 0, i) = tmp;
2421 pcum->aapcs_reg = par;
2423 return;
2425 else
2427 /* C.3 NSRN is set to 8. */
2428 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2429 goto on_stack;
2433 ncrn = pcum->aapcs_ncrn;
2434 nregs = size / UNITS_PER_WORD;
2436 /* C6 - C9, though the sign and zero extension semantics are
2437 handled elsewhere. This is the case where the argument fits
2438 entirely in general registers. */
2439 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2442 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2444 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2445 rounded up to the next even number. */
2446 if (nregs == 2
2447 && ncrn % 2
2448 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2449 comparison is there because for > 16 * BITS_PER_UNIT
2450 alignment nregs should be > 2 and therefore it should be
2451 passed by reference rather than value. */
2452 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2454 ++ncrn;
2455 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2458 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2459 A reg is still generated for it, but the caller should be smart
2460 enough not to use it. */
2461 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2462 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2463 else
2465 rtx par;
2466 int i;
2468 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2469 for (i = 0; i < nregs; i++)
2471 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2472 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2473 GEN_INT (i * UNITS_PER_WORD));
2474 XVECEXP (par, 0, i) = tmp;
2476 pcum->aapcs_reg = par;
2479 pcum->aapcs_nextncrn = ncrn + nregs;
2480 return;
2483 /* C.11 */
2484 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2486 /* The argument is passed on the stack; record the needed number of words for
2487 this argument and align the total size if necessary. */
2488 on_stack:
2489 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2491 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2492 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2493 16 / UNITS_PER_WORD);
2494 return;
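/* Illustrative example, not part of the original source: passing an
   __int128 argument when the next GP register number is 1 triggers
   rule C.8 above; the 16-byte alignment rounds the register number up
   to 2, so the value goes in x2/x3 and x1 is left unused.  */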
2497 /* Implement TARGET_FUNCTION_ARG. */
2499 static rtx
2500 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2501 const_tree type, bool named)
2503 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2504 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2506 if (mode == VOIDmode)
2507 return NULL_RTX;
2509 aarch64_layout_arg (pcum_v, mode, type, named);
2510 return pcum->aapcs_reg;
2513 void
2514 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2515 const_tree fntype ATTRIBUTE_UNUSED,
2516 rtx libname ATTRIBUTE_UNUSED,
2517 const_tree fndecl ATTRIBUTE_UNUSED,
2518 unsigned n_named ATTRIBUTE_UNUSED)
2520 pcum->aapcs_ncrn = 0;
2521 pcum->aapcs_nvrn = 0;
2522 pcum->aapcs_nextncrn = 0;
2523 pcum->aapcs_nextnvrn = 0;
2524 pcum->pcs_variant = ARM_PCS_AAPCS64;
2525 pcum->aapcs_reg = NULL_RTX;
2526 pcum->aapcs_arg_processed = false;
2527 pcum->aapcs_stack_words = 0;
2528 pcum->aapcs_stack_size = 0;
2530 if (!TARGET_FLOAT
2531 && fndecl && TREE_PUBLIC (fndecl)
2532 && fntype && fntype != error_mark_node)
2534 const_tree type = TREE_TYPE (fntype);
2535 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2536 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2537 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2538 &mode, &nregs, NULL))
2539 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2541 return;
2544 static void
2545 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2546 machine_mode mode,
2547 const_tree type,
2548 bool named)
2550 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2551 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2553 aarch64_layout_arg (pcum_v, mode, type, named);
2554 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2555 != (pcum->aapcs_stack_words != 0));
2556 pcum->aapcs_arg_processed = false;
2557 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2558 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2559 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2560 pcum->aapcs_stack_words = 0;
2561 pcum->aapcs_reg = NULL_RTX;
2565 bool
2566 aarch64_function_arg_regno_p (unsigned regno)
2568 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2569 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2572 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2573 PARM_BOUNDARY bits of alignment, but will be given anything up
2574 to STACK_BOUNDARY bits if the type requires it. This makes sure
2575 that both before and after the layout of each argument, the Next
2576 Stacked Argument Address (NSAA) will have a minimum alignment of
2577 8 bytes. */
2579 static unsigned int
2580 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2582 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2583 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2586 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2588 Return true if an argument passed on the stack should be padded upwards,
2589 i.e. if the least-significant byte of the stack slot has useful data.
2591 Small aggregate types are placed at the lowest memory address.
2593 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2595 bool
2596 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2598 /* On little-endian targets, the least significant byte of every stack
2599 argument is passed at the lowest byte address of the stack slot. */
2600 if (!BYTES_BIG_ENDIAN)
2601 return true;
2603 /* Otherwise, integral, floating-point and pointer types are padded downward:
2604 the least significant byte of a stack argument is passed at the highest
2605 byte address of the stack slot. */
2606 if (type
2607 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2608 || POINTER_TYPE_P (type))
2609 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2610 return false;
2612 /* Everything else is padded upward, i.e. data in the first byte of the stack slot. */
2613 return true;
2616 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2618 It specifies padding for the last (may also be the only)
2619 element of a block move between registers and memory. Assuming
2620 the block is in memory, padding upward means that the last
2621 element is padded after its most significant byte, while in
2622 downward padding the last element is padded at its least
2623 significant byte side.
2625 Small aggregates and small complex types are always padded
2626 upwards.
2628 We don't need to worry about homogeneous floating-point or
2629 short-vector aggregates; their move is not affected by the
2630 padding direction determined here. Regardless of endianness,
2631 each element of such an aggregate is put in the least
2632 significant bits of a fp/simd register.
2634 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2635 register has useful data, and return the opposite if the most
2636 significant byte does. */
2638 bool
2639 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2640 bool first ATTRIBUTE_UNUSED)
2643 /* Small composite types are always padded upward. */
2644 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2646 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2647 : GET_MODE_SIZE (mode));
2648 if (size < 2 * UNITS_PER_WORD)
2649 return true;
2652 /* Otherwise, use the default padding. */
2653 return !BYTES_BIG_ENDIAN;
2656 static machine_mode
2657 aarch64_libgcc_cmp_return_mode (void)
2659 return SImode;
2662 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2664 /* We use the 12-bit shifted immediate arithmetic instructions so values
2665 must be multiple of (1 << 12), i.e. 4096. */
2666 #define ARITH_FACTOR 4096
2668 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2669 #error Cannot use simple address calculation for stack probing
2670 #endif
2672 /* The pair of scratch registers used for stack probing. */
2673 #define PROBE_STACK_FIRST_REG 9
2674 #define PROBE_STACK_SECOND_REG 10
2676 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2677 inclusive. These are offsets from the current stack pointer. */
2679 static void
2680 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2682 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2684 /* See the same assertion on PROBE_INTERVAL above. */
2685 gcc_assert ((first % ARITH_FACTOR) == 0);
2687 /* See if we have a constant small number of probes to generate. If so,
2688 that's the easy case. */
2689 if (size <= PROBE_INTERVAL)
2691 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2693 emit_set_insn (reg1,
2694 plus_constant (Pmode,
2695 stack_pointer_rtx, -(first + base)));
2696 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2699 /* The run-time loop is made up of 8 insns in the generic case while the
2700 compile-time loop is made up of 4 + 2*(n-2) insns for n intervals. */
2701 else if (size <= 4 * PROBE_INTERVAL)
2703 HOST_WIDE_INT i, rem;
2705 emit_set_insn (reg1,
2706 plus_constant (Pmode,
2707 stack_pointer_rtx,
2708 -(first + PROBE_INTERVAL)));
2709 emit_stack_probe (reg1);
2711 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2712 it exceeds SIZE. If only two probes are needed, this will not
2713 generate any code. Then probe at FIRST + SIZE. */
2714 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2716 emit_set_insn (reg1,
2717 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2718 emit_stack_probe (reg1);
2721 rem = size - (i - PROBE_INTERVAL);
2722 if (rem > 256)
2724 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2726 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2727 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2729 else
2730 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2733 /* Otherwise, do the same as above, but in a loop. Note that we must be
2734 extra careful with variables wrapping around because we might be at
2735 the very top (or the very bottom) of the address space and we have
2736 to be able to handle this case properly; in particular, we use an
2737 equality test for the loop condition. */
2738 else
2740 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2742 /* Step 1: round SIZE to the previous multiple of the interval. */
2744 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2747 /* Step 2: compute initial and final value of the loop counter. */
2749 /* TEST_ADDR = SP + FIRST. */
2750 emit_set_insn (reg1,
2751 plus_constant (Pmode, stack_pointer_rtx, -first));
2753 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2754 HOST_WIDE_INT adjustment = - (first + rounded_size);
2755 if (! aarch64_uimm12_shift (adjustment))
2757 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2758 true, Pmode);
2759 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2761 else
2763 emit_set_insn (reg2,
2764 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2767 /* Step 3: the loop
2771 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2772 probe at TEST_ADDR
2774 while (TEST_ADDR != LAST_ADDR)
2776 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2777 until it is equal to ROUNDED_SIZE. */
2779 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2782 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2783 that SIZE is equal to ROUNDED_SIZE. */
2785 if (size != rounded_size)
2787 HOST_WIDE_INT rem = size - rounded_size;
2789 if (rem > 256)
2791 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2793 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2794 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2796 else
2797 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2801 /* Make sure nothing is scheduled before we are done. */
2802 emit_insn (gen_blockage ());
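/* Illustrative example, not part of the original source: assuming the
   default 4 KiB probe interval, probing a 12 KiB range takes the
   second branch above and emits probes at FIRST + 4 KiB, FIRST + 8 KiB
   and FIRST + 12 KiB below the incoming stack pointer.  */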
2805 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2806 absolute addresses. */
2808 const char *
2809 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2811 static int labelno = 0;
2812 char loop_lab[32];
2813 rtx xops[2];
2815 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2817 /* Loop. */
2818 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2820 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2821 xops[0] = reg1;
2822 xops[1] = GEN_INT (PROBE_INTERVAL);
2823 output_asm_insn ("sub\t%0, %0, %1", xops);
2825 /* Probe at TEST_ADDR. */
2826 output_asm_insn ("str\txzr, [%0]", xops);
2828 /* Test if TEST_ADDR == LAST_ADDR. */
2829 xops[1] = reg2;
2830 output_asm_insn ("cmp\t%0, %1", xops);
2832 /* Branch. */
2833 fputs ("\tb.ne\t", asm_out_file);
2834 assemble_name_raw (asm_out_file, loop_lab);
2835 fputc ('\n', asm_out_file);
2837 return "";
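/* Illustrative output, not part of the original source: assuming the
   default 4 KiB probe interval and the x9/x10 probe registers chosen
   above, the emitted loop has the shape
	.LPSRL0:
	sub  x9, x9, 4096
	str  xzr, [x9]
	cmp  x9, x10
	b.ne .LPSRL0  */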
2840 static bool
2841 aarch64_frame_pointer_required (void)
2843 /* In aarch64_override_options_after_change
2844 flag_omit_leaf_frame_pointer turns off the frame pointer by
2845 default. Turn it back on now if we've not got a leaf
2846 function. */
2847 if (flag_omit_leaf_frame_pointer
2848 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2849 return true;
2851 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2852 if (crtl->calls_eh_return)
2853 return true;
2855 return false;
2858 /* Mark the registers that need to be saved by the callee and calculate
2859 the size of the callee-saved registers area and frame record (both FP
2860 and LR may be omitted). */
2861 static void
2862 aarch64_layout_frame (void)
2864 HOST_WIDE_INT offset = 0;
2865 int regno, last_fp_reg = INVALID_REGNUM;
2867 if (reload_completed && cfun->machine->frame.laid_out)
2868 return;
2870 #define SLOT_NOT_REQUIRED (-2)
2871 #define SLOT_REQUIRED (-1)
2873 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2874 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2876 /* First mark all the registers that really need to be saved... */
2877 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2878 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2880 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2881 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2883 /* ... that includes the eh data registers (if needed)... */
2884 if (crtl->calls_eh_return)
2885 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2886 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2887 = SLOT_REQUIRED;
2889 /* ... and any callee saved register that dataflow says is live. */
2890 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2891 if (df_regs_ever_live_p (regno)
2892 && (regno == R30_REGNUM
2893 || !call_used_regs[regno]))
2894 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2896 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2897 if (df_regs_ever_live_p (regno)
2898 && !call_used_regs[regno])
2900 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2901 last_fp_reg = regno;
2904 if (frame_pointer_needed)
2906 /* FP and LR are placed in the linkage record. */
2907 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2908 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2909 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2910 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2911 offset += 2 * UNITS_PER_WORD;
2914 /* Now assign stack slots for them. */
2915 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2916 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2918 cfun->machine->frame.reg_offset[regno] = offset;
2919 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2920 cfun->machine->frame.wb_candidate1 = regno;
2921 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2922 cfun->machine->frame.wb_candidate2 = regno;
2923 offset += UNITS_PER_WORD;
2926 HOST_WIDE_INT max_int_offset = offset;
2927 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2928 bool has_align_gap = offset != max_int_offset;
2930 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2931 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2933 /* If there is an alignment gap between integer and fp callee-saves,
2934 allocate the last fp register to it if possible. */
2935 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2937 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2938 break;
2941 cfun->machine->frame.reg_offset[regno] = offset;
2942 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2943 cfun->machine->frame.wb_candidate1 = regno;
2944 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2945 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2946 cfun->machine->frame.wb_candidate2 = regno;
2947 offset += UNITS_PER_WORD;
2950 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2952 cfun->machine->frame.saved_regs_size = offset;
2954 HOST_WIDE_INT varargs_and_saved_regs_size
2955 = offset + cfun->machine->frame.saved_varargs_size;
2957 cfun->machine->frame.hard_fp_offset
2958 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2959 STACK_BOUNDARY / BITS_PER_UNIT);
2961 cfun->machine->frame.frame_size
2962 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2963 + crtl->outgoing_args_size,
2964 STACK_BOUNDARY / BITS_PER_UNIT);
2966 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2968 cfun->machine->frame.initial_adjust = 0;
2969 cfun->machine->frame.final_adjust = 0;
2970 cfun->machine->frame.callee_adjust = 0;
2971 cfun->machine->frame.callee_offset = 0;
2973 HOST_WIDE_INT max_push_offset = 0;
2974 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2975 max_push_offset = 512;
2976 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2977 max_push_offset = 256;
2979 if (cfun->machine->frame.frame_size < max_push_offset
2980 && crtl->outgoing_args_size == 0)
2982 /* Simple, small frame with no outgoing arguments:
2983 stp reg1, reg2, [sp, -frame_size]!
2984 stp reg3, reg4, [sp, 16] */
2985 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2987 else if ((crtl->outgoing_args_size
2988 + cfun->machine->frame.saved_regs_size < 512)
2989 && !(cfun->calls_alloca
2990 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2992 /* Frame with small outgoing arguments:
2993 sub sp, sp, frame_size
2994 stp reg1, reg2, [sp, outgoing_args_size]
2995 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2996 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2997 cfun->machine->frame.callee_offset
2998 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3000 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3002 /* Frame with large outgoing arguments but a small local area:
3003 stp reg1, reg2, [sp, -hard_fp_offset]!
3004 stp reg3, reg4, [sp, 16]
3005 sub sp, sp, outgoing_args_size */
3006 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3007 cfun->machine->frame.final_adjust
3008 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3010 else if (!frame_pointer_needed
3011 && varargs_and_saved_regs_size < max_push_offset)
3013 /* Frame with large local area and outgoing arguments (this pushes the
3014 callee-saves first, followed by the locals and outgoing area):
3015 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3016 stp reg3, reg4, [sp, 16]
3017 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3018 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3019 cfun->machine->frame.final_adjust
3020 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3021 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3022 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3024 else
3026 /* Frame with large local area and outgoing arguments using frame pointer:
3027 sub sp, sp, hard_fp_offset
3028 stp x29, x30, [sp, 0]
3029 add x29, sp, 0
3030 stp reg3, reg4, [sp, 16]
3031 sub sp, sp, outgoing_args_size */
3032 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3033 cfun->machine->frame.final_adjust
3034 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3037 cfun->machine->frame.laid_out = true;
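/* Illustrative example, not part of the original source: a function
   that needs a frame pointer, saves only x29/x30, uses 16 bytes of
   locals and has no outgoing arguments gets frame_size = 32, which is
   below max_push_offset, so callee_adjust = 32 and the frame is opened
   with a single "stp x29, x30, [sp, -32]!".  */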
3040 /* Return true if the register REGNO is saved on entry to
3041 the current function. */
3043 static bool
3044 aarch64_register_saved_on_entry (int regno)
3046 return cfun->machine->frame.reg_offset[regno] >= 0;
3049 /* Return the next register up from REGNO up to LIMIT for the callee
3050 to save. */
3052 static unsigned
3053 aarch64_next_callee_save (unsigned regno, unsigned limit)
3055 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3056 regno ++;
3057 return regno;
3060 /* Push the register number REGNO of mode MODE to the stack with write-back
3061 adjusting the stack by ADJUSTMENT. */
3063 static void
3064 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3065 HOST_WIDE_INT adjustment)
3067 rtx base_rtx = stack_pointer_rtx;
3068 rtx insn, reg, mem;
3070 reg = gen_rtx_REG (mode, regno);
3071 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3072 plus_constant (Pmode, base_rtx, -adjustment));
3073 mem = gen_frame_mem (mode, mem);
3075 insn = emit_move_insn (mem, reg);
3076 RTX_FRAME_RELATED_P (insn) = 1;
3079 /* Generate and return an instruction to store the pair of registers
3080 REG and REG2 of mode MODE to location BASE with write-back adjusting
3081 the stack location BASE by ADJUSTMENT. */
3083 static rtx
3084 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3085 HOST_WIDE_INT adjustment)
3087 switch (mode)
3089 case E_DImode:
3090 return gen_storewb_pairdi_di (base, base, reg, reg2,
3091 GEN_INT (-adjustment),
3092 GEN_INT (UNITS_PER_WORD - adjustment));
3093 case E_DFmode:
3094 return gen_storewb_pairdf_di (base, base, reg, reg2,
3095 GEN_INT (-adjustment),
3096 GEN_INT (UNITS_PER_WORD - adjustment));
3097 default:
3098 gcc_unreachable ();
3102 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3103 stack pointer by ADJUSTMENT. */
3105 static void
3106 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3108 rtx_insn *insn;
3109 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3111 if (regno2 == INVALID_REGNUM)
3112 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3114 rtx reg1 = gen_rtx_REG (mode, regno1);
3115 rtx reg2 = gen_rtx_REG (mode, regno2);
3117 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3118 reg2, adjustment));
3119 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3120 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3121 RTX_FRAME_RELATED_P (insn) = 1;
3124 /* Load the pair of registers REG and REG2 of mode MODE from stack location
3125 BASE, adjusting it by ADJUSTMENT afterwards. */
3127 static rtx
3128 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3129 HOST_WIDE_INT adjustment)
3131 switch (mode)
3133 case E_DImode:
3134 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3135 GEN_INT (UNITS_PER_WORD));
3136 case E_DFmode:
3137 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3138 GEN_INT (UNITS_PER_WORD));
3139 default:
3140 gcc_unreachable ();
3144 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3145 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3146 into CFI_OPS. */
3148 static void
3149 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3150 rtx *cfi_ops)
3152 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3153 rtx reg1 = gen_rtx_REG (mode, regno1);
3155 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3157 if (regno2 == INVALID_REGNUM)
3159 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3160 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3161 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3163 else
3165 rtx reg2 = gen_rtx_REG (mode, regno2);
3166 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3167 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3168 reg2, adjustment));
3172 /* Generate and return a store pair instruction of mode MODE to store
3173 register REG1 to MEM1 and register REG2 to MEM2. */
3175 static rtx
3176 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3177 rtx reg2)
3179 switch (mode)
3181 case E_DImode:
3182 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3184 case E_DFmode:
3185 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3187 default:
3188 gcc_unreachable ();
3192 /* Generate and return a load pair instruction of mode MODE to load register
3193 REG1 from MEM1 and register REG2 from MEM2. */
3195 static rtx
3196 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3197 rtx mem2)
3199 switch (mode)
3201 case E_DImode:
3202 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3204 case E_DFmode:
3205 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3207 default:
3208 gcc_unreachable ();
3212 /* Return TRUE if return address signing should be enabled for the current
3213 function, otherwise return FALSE. */
3215 bool
3216 aarch64_return_address_signing_enabled (void)
3218 /* This function should only be called after the frame is laid out. */
3219 gcc_assert (cfun->machine->frame.laid_out);
3221 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3222 if its LR is pushed onto the stack. */
3223 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3224 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3225 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3228 /* Emit code to save the callee-saved registers from register number START
3229 to LIMIT to the stack at the location starting at offset START_OFFSET,
3230 skipping any write-back candidates if SKIP_WB is true. */
3232 static void
3233 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3234 unsigned start, unsigned limit, bool skip_wb)
3236 rtx_insn *insn;
3237 unsigned regno;
3238 unsigned regno2;
3240 for (regno = aarch64_next_callee_save (start, limit);
3241 regno <= limit;
3242 regno = aarch64_next_callee_save (regno + 1, limit))
3244 rtx reg, mem;
3245 HOST_WIDE_INT offset;
3247 if (skip_wb
3248 && (regno == cfun->machine->frame.wb_candidate1
3249 || regno == cfun->machine->frame.wb_candidate2))
3250 continue;
3252 if (cfun->machine->reg_is_wrapped_separately[regno])
3253 continue;
3255 reg = gen_rtx_REG (mode, regno);
3256 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3257 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3258 offset));
3260 regno2 = aarch64_next_callee_save (regno + 1, limit);
3262 if (regno2 <= limit
3263 && !cfun->machine->reg_is_wrapped_separately[regno2]
3264 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3265 == cfun->machine->frame.reg_offset[regno2]))
3268 rtx reg2 = gen_rtx_REG (mode, regno2);
3269 rtx mem2;
3271 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3272 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3273 offset));
3274 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3275 reg2));
3277 /* The first part of a frame-related parallel insn is
3278 always assumed to be relevant to the frame
3279 calculations; subsequent parts are only
3280 frame-related if explicitly marked. */
3281 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3282 regno = regno2;
3284 else
3285 insn = emit_move_insn (mem, reg);
3287 RTX_FRAME_RELATED_P (insn) = 1;
3291 /* Emit code to restore the callee registers of mode MODE from register
3292 number START up to and including LIMIT. Restore from the stack offset
3293 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3294 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3296 static void
3297 aarch64_restore_callee_saves (machine_mode mode,
3298 HOST_WIDE_INT start_offset, unsigned start,
3299 unsigned limit, bool skip_wb, rtx *cfi_ops)
3301 rtx base_rtx = stack_pointer_rtx;
3302 unsigned regno;
3303 unsigned regno2;
3304 HOST_WIDE_INT offset;
3306 for (regno = aarch64_next_callee_save (start, limit);
3307 regno <= limit;
3308 regno = aarch64_next_callee_save (regno + 1, limit))
3310 if (cfun->machine->reg_is_wrapped_separately[regno])
3311 continue;
3313 rtx reg, mem;
3315 if (skip_wb
3316 && (regno == cfun->machine->frame.wb_candidate1
3317 || regno == cfun->machine->frame.wb_candidate2))
3318 continue;
3320 reg = gen_rtx_REG (mode, regno);
3321 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3322 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3324 regno2 = aarch64_next_callee_save (regno + 1, limit);
3326 if (regno2 <= limit
3327 && !cfun->machine->reg_is_wrapped_separately[regno2]
3328 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3329 == cfun->machine->frame.reg_offset[regno2]))
3331 rtx reg2 = gen_rtx_REG (mode, regno2);
3332 rtx mem2;
3334 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3335 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3336 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3338 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3339 regno = regno2;
3341 else
3342 emit_move_insn (reg, mem);
3343 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3347 static inline bool
3348 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3349 HOST_WIDE_INT offset)
3351 return offset >= -256 && offset < 256;
3354 static inline bool
3355 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3357 return (offset >= 0
3358 && offset < 4096 * GET_MODE_SIZE (mode)
3359 && offset % GET_MODE_SIZE (mode) == 0);
3362 bool
3363 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3365 return (offset >= -64 * GET_MODE_SIZE (mode)
3366 && offset < 64 * GET_MODE_SIZE (mode)
3367 && offset % GET_MODE_SIZE (mode) == 0);
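/* Illustrative ranges, not part of the original source, for DImode
   (8-byte) accesses: the 7-bit signed scaled form covers offsets -512
   to 504 in steps of 8 (LDP/STP), the 9-bit signed unscaled form
   covers -256 to 255 (LDUR/STUR), and the 12-bit unsigned scaled form
   covers 0 to 32760 in steps of 8 (LDR/STR).  */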
3370 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3372 static sbitmap
3373 aarch64_get_separate_components (void)
3375 aarch64_layout_frame ();
3377 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3378 bitmap_clear (components);
3380 /* The registers we need saved to the frame. */
3381 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3382 if (aarch64_register_saved_on_entry (regno))
3384 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3385 if (!frame_pointer_needed)
3386 offset += cfun->machine->frame.frame_size
3387 - cfun->machine->frame.hard_fp_offset;
3388 /* Check that we can access the stack slot of the register with one
3389 direct load with no adjustments needed. */
3390 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3391 bitmap_set_bit (components, regno);
3394 /* Don't mess with the hard frame pointer. */
3395 if (frame_pointer_needed)
3396 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3398 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3399 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3400 /* If aarch64_layout_frame has chosen registers to store/restore with
3401 writeback, don't interfere with them to avoid having to output explicit
3402 stack adjustment instructions. */
3403 if (reg2 != INVALID_REGNUM)
3404 bitmap_clear_bit (components, reg2);
3405 if (reg1 != INVALID_REGNUM)
3406 bitmap_clear_bit (components, reg1);
3408 bitmap_clear_bit (components, LR_REGNUM);
3409 bitmap_clear_bit (components, SP_REGNUM);
3411 return components;
3414 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3416 static sbitmap
3417 aarch64_components_for_bb (basic_block bb)
3419 bitmap in = DF_LIVE_IN (bb);
3420 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3421 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3423 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3424 bitmap_clear (components);
3426 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3427 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3428 if ((!call_used_regs[regno])
3429 && (bitmap_bit_p (in, regno)
3430 || bitmap_bit_p (gen, regno)
3431 || bitmap_bit_p (kill, regno)))
3432 bitmap_set_bit (components, regno);
3434 return components;
3437 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3438 Nothing to do for aarch64. */
3440 static void
3441 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3445 /* Return the next set bit in BMP from START onwards. Return the total number
3446 of bits in BMP if no set bit is found at or after START. */
3448 static unsigned int
3449 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3451 unsigned int nbits = SBITMAP_SIZE (bmp);
3452 if (start == nbits)
3453 return start;
3455 gcc_assert (start < nbits);
3456 for (unsigned int i = start; i < nbits; i++)
3457 if (bitmap_bit_p (bmp, i))
3458 return i;
3460 return nbits;
3463 /* Do the work for aarch64_emit_prologue_components and
3464 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3465 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3466 for these components or the epilogue sequence. That is, it determines
3467 whether we should emit stores or loads and what kind of CFA notes to attach
3468 to the insns. Otherwise the logic for the two sequences is very
3469 similar. */
3471 static void
3472 aarch64_process_components (sbitmap components, bool prologue_p)
3474 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3475 ? HARD_FRAME_POINTER_REGNUM
3476 : STACK_POINTER_REGNUM);
3478 unsigned last_regno = SBITMAP_SIZE (components);
3479 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3480 rtx_insn *insn = NULL;
3482 while (regno != last_regno)
3484 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3485 so DFmode for the vector registers is enough. */
3486 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3487 rtx reg = gen_rtx_REG (mode, regno);
3488 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3489 if (!frame_pointer_needed)
3490 offset += cfun->machine->frame.frame_size
3491 - cfun->machine->frame.hard_fp_offset;
3492 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3493 rtx mem = gen_frame_mem (mode, addr);
3495 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3496 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3497 /* No more registers to handle after REGNO.
3498 Emit a single save/restore and exit. */
3499 if (regno2 == last_regno)
3501 insn = emit_insn (set);
3502 RTX_FRAME_RELATED_P (insn) = 1;
3503 if (prologue_p)
3504 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3505 else
3506 add_reg_note (insn, REG_CFA_RESTORE, reg);
3507 break;
3510 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3511 /* The next register is not of the same class or its offset is not
3512 mergeable with the current one into a pair. */
3513 if (!satisfies_constraint_Ump (mem)
3514 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3515 || (offset2 - cfun->machine->frame.reg_offset[regno])
3516 != GET_MODE_SIZE (mode))
3518 insn = emit_insn (set);
3519 RTX_FRAME_RELATED_P (insn) = 1;
3520 if (prologue_p)
3521 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3522 else
3523 add_reg_note (insn, REG_CFA_RESTORE, reg);
3525 regno = regno2;
3526 continue;
3529 /* REGNO2 can be saved/restored in a pair with REGNO. */
3530 rtx reg2 = gen_rtx_REG (mode, regno2);
3531 if (!frame_pointer_needed)
3532 offset2 += cfun->machine->frame.frame_size
3533 - cfun->machine->frame.hard_fp_offset;
3534 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3535 rtx mem2 = gen_frame_mem (mode, addr2);
3536 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3537 : gen_rtx_SET (reg2, mem2);
3539 if (prologue_p)
3540 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3541 else
3542 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3544 RTX_FRAME_RELATED_P (insn) = 1;
3545 if (prologue_p)
3547 add_reg_note (insn, REG_CFA_OFFSET, set);
3548 add_reg_note (insn, REG_CFA_OFFSET, set2);
3550 else
3552 add_reg_note (insn, REG_CFA_RESTORE, reg);
3553 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3556 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3560 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3562 static void
3563 aarch64_emit_prologue_components (sbitmap components)
3565 aarch64_process_components (components, true);
3568 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3570 static void
3571 aarch64_emit_epilogue_components (sbitmap components)
3573 aarch64_process_components (components, false);
3576 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3578 static void
3579 aarch64_set_handled_components (sbitmap components)
3581 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3582 if (bitmap_bit_p (components, regno))
3583 cfun->machine->reg_is_wrapped_separately[regno] = true;
3586 /* AArch64 stack frames generated by this compiler look like:
3588 +-------------------------------+
3590 | incoming stack arguments |
3592 +-------------------------------+
3593 | | <-- incoming stack pointer (aligned)
3594 | callee-allocated save area |
3595 | for register varargs |
3597 +-------------------------------+
3598 | local variables | <-- frame_pointer_rtx
3600 +-------------------------------+
3601 | padding0 | \
3602 +-------------------------------+ |
3603 | callee-saved registers | | frame.saved_regs_size
3604 +-------------------------------+ |
3605 | LR' | |
3606 +-------------------------------+ |
3607 | FP' | / <- hard_frame_pointer_rtx (aligned)
3608 +-------------------------------+
3609 | dynamic allocation |
3610 +-------------------------------+
3611 | padding |
3612 +-------------------------------+
3613 | outgoing stack arguments | <-- arg_pointer
3615 +-------------------------------+
3616 | | <-- stack_pointer_rtx (aligned)
3618 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3619 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3620 unchanged. */
3622 /* Generate the prologue instructions for entry into a function.
3623 Establish the stack frame by decreasing the stack pointer with a
3624 properly calculated size and, if necessary, create a frame record
3625 filled with the values of LR and previous frame pointer. The
3626 current FP is also set up if it is in use. */
3628 void
3629 aarch64_expand_prologue (void)
3631 aarch64_layout_frame ();
3633 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3634 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3635 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3636 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3637 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3638 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3639 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3640 rtx_insn *insn;
3642 /* Sign return address for functions. */
3643 if (aarch64_return_address_signing_enabled ())
3645 insn = emit_insn (gen_pacisp ());
3646 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3647 RTX_FRAME_RELATED_P (insn) = 1;
3650 if (flag_stack_usage_info)
3651 current_function_static_stack_size = frame_size;
3653 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3655 if (crtl->is_leaf && !cfun->calls_alloca)
3657 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3658 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3659 frame_size - STACK_CHECK_PROTECT);
3661 else if (frame_size > 0)
3662 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3665 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3667 if (callee_adjust != 0)
3668 aarch64_push_regs (reg1, reg2, callee_adjust);
3670 if (frame_pointer_needed)
3672 if (callee_adjust == 0)
3673 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3674 R30_REGNUM, false);
3675 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3676 stack_pointer_rtx,
3677 GEN_INT (callee_offset)));
3678 RTX_FRAME_RELATED_P (insn) = 1;
3679 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3682 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3683 callee_adjust != 0 || frame_pointer_needed);
3684 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3685 callee_adjust != 0 || frame_pointer_needed);
3686 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3689 /* Return TRUE if we can use a simple_return insn.
3691 This function checks whether the callee saved stack is empty, which
3692 means no restore actions are needed. The pro_and_epilogue pass uses
3693 this to check whether the shrink-wrapping optimization is feasible. */
3695 bool
3696 aarch64_use_return_insn_p (void)
3698 if (!reload_completed)
3699 return false;
3701 if (crtl->profile)
3702 return false;
3704 aarch64_layout_frame ();
3706 return cfun->machine->frame.frame_size == 0;
3709 /* Generate the epilogue instructions for returning from a function.
3710 This is almost exactly the reverse of the prologue sequence, except
3711 that we need to insert barriers to avoid scheduling loads that read
3712 from a deallocated stack, and we optimize the unwind records by
3713 emitting them all together if possible. */
3714 void
3715 aarch64_expand_epilogue (bool for_sibcall)
3717 aarch64_layout_frame ();
3719 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3720 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3721 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3722 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3723 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3724 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3725 rtx cfi_ops = NULL;
3726 rtx_insn *insn;
3728 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3729 bool need_barrier_p = (get_frame_size ()
3730 + cfun->machine->frame.saved_varargs_size) != 0;
3732 /* Emit a barrier to prevent loads from a deallocated stack. */
3733 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3734 || crtl->calls_eh_return)
3736 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3737 need_barrier_p = false;
3740 /* Restore the stack pointer from the frame pointer if it may not
3741 be the same as the stack pointer. */
3742 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3744 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3745 hard_frame_pointer_rtx,
3746 GEN_INT (-callee_offset)));
3747 /* If writeback is used when restoring callee-saves, the CFA
3748 is restored on the instruction doing the writeback. */
3749 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3751 else
3752 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3754 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3755 callee_adjust != 0, &cfi_ops);
3756 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3757 callee_adjust != 0, &cfi_ops);
3759 if (need_barrier_p)
3760 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3762 if (callee_adjust != 0)
3763 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3765 if (callee_adjust != 0 || initial_adjust > 65536)
3767 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3768 insn = get_last_insn ();
3769 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3770 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3771 RTX_FRAME_RELATED_P (insn) = 1;
3772 cfi_ops = NULL;
3775 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3777 if (cfi_ops)
3779 /* Emit delayed restores and reset the CFA to be SP. */
3780 insn = get_last_insn ();
3781 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3782 REG_NOTES (insn) = cfi_ops;
3783 RTX_FRAME_RELATED_P (insn) = 1;
3786 /* We prefer to emit the combined return/authenticate instruction RETAA;
3787 however, there are three cases in which we must instead emit an explicit
3788 authentication instruction.
3790 1) Sibcalls don't return in a normal way, so if we're about to call one
3791 we must authenticate.
3793 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3794 generating code for !TARGET_ARMV8_3 we can't use it and must
3795 explicitly authenticate.
3797 3) On an eh_return path we make extra stack adjustments to update the
3798 canonical frame address to be the exception handler's CFA. We want
3799 to authenticate using the CFA of the function which calls eh_return. */
3801 if (aarch64_return_address_signing_enabled ()
3802 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3804 insn = emit_insn (gen_autisp ());
3805 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3806 RTX_FRAME_RELATED_P (insn) = 1;
3809 /* Stack adjustment for exception handler. */
3810 if (crtl->calls_eh_return)
3812 /* We need to unwind the stack by the offset computed by
3813 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3814 to be SP; letting the CFA move during this adjustment
3815 is just as correct as retaining the CFA from the body
3816 of the function. Therefore, do nothing special. */
3817 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3820 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3821 if (!for_sibcall)
3822 emit_jump_insn (ret_rtx);
3825 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3826 normally or return to a previous frame after unwinding.
3828 An EH return uses a single shared return sequence. The epilogue is
3829 exactly like a normal epilogue except that it has an extra input
3830 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3831 that must be applied after the frame has been destroyed. An extra label
3832 is inserted before the epilogue which initializes this register to zero,
3833 and this is the entry point for a normal return.
3835 An actual EH return updates the return address, initializes the stack
3836 adjustment and jumps directly into the epilogue (bypassing the zeroing
3837 of the adjustment). Since the return address is typically saved on the
3838 stack when a function makes a call, the saved LR must be updated outside
3839 the epilogue.
3841 This poses problems as the store is generated well before the epilogue,
3842 so the offset of LR is not known yet. Also optimizations will remove the
3843 store as it appears dead, even after the epilogue is generated (as the
3844 base or offset for loading LR is different in many cases).
3846 To avoid these problems this implementation forces the frame pointer
3847 in eh_return functions so that the location of LR is fixed and known early.
3848 It also marks the store volatile, so no optimization is permitted to
3849 remove the store. */
3851 aarch64_eh_return_handler_rtx (void)
3853 rtx tmp = gen_frame_mem (Pmode,
3854 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3856 /* Mark the store volatile, so no optimization is permitted to remove it. */
3857 MEM_VOLATILE_P (tmp) = true;
3858 return tmp;
3861 /* Output code to add DELTA to the first argument, and then jump
3862 to FUNCTION. Used for C++ multiple inheritance. */
3863 static void
3864 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3865 HOST_WIDE_INT delta,
3866 HOST_WIDE_INT vcall_offset,
3867 tree function)
3869 /* The this pointer is always in x0. Note that this differs from
3870 Arm where the this pointer may be bumped to r1 if r0 is required
3871 to return a pointer to an aggregate. On AArch64 a result value
3872 pointer will be in x8. */
3873 int this_regno = R0_REGNUM;
3874 rtx this_rtx, temp0, temp1, addr, funexp;
3875 rtx_insn *insn;
3877 reload_completed = 1;
3878 emit_note (NOTE_INSN_PROLOGUE_END);
3880 if (vcall_offset == 0)
3881 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3882 else
3884 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3886 this_rtx = gen_rtx_REG (Pmode, this_regno);
3887 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3888 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3890 addr = this_rtx;
3891 if (delta != 0)
3893 if (delta >= -256 && delta < 256)
3894 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3895 plus_constant (Pmode, this_rtx, delta));
3896 else
3897 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3900 if (Pmode == ptr_mode)
3901 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3902 else
3903 aarch64_emit_move (temp0,
3904 gen_rtx_ZERO_EXTEND (Pmode,
3905 gen_rtx_MEM (ptr_mode, addr)));
3907 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3908 addr = plus_constant (Pmode, temp0, vcall_offset);
3909 else
3911 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3912 Pmode);
3913 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3916 if (Pmode == ptr_mode)
3917 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3918 else
3919 aarch64_emit_move (temp1,
3920 gen_rtx_SIGN_EXTEND (Pmode,
3921 gen_rtx_MEM (ptr_mode, addr)));
3923 emit_insn (gen_add2_insn (this_rtx, temp1));
3926 /* Generate a tail call to the target function. */
3927 if (!TREE_USED (function))
3929 assemble_external (function);
3930 TREE_USED (function) = 1;
3932 funexp = XEXP (DECL_RTL (function), 0);
3933 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3934 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3935 SIBLING_CALL_P (insn) = 1;
3937 insn = get_insns ();
3938 shorten_branches (insn);
3939 final_start_function (insn, file, 1);
3940 final (insn, file, 1);
3941 final_end_function ();
3943 /* Stop pretending to be a post-reload pass. */
3944 reload_completed = 0;
3947 static bool
3948 aarch64_tls_referenced_p (rtx x)
3950 if (!TARGET_HAVE_TLS)
3951 return false;
3952 subrtx_iterator::array_type array;
3953 FOR_EACH_SUBRTX (iter, array, x, ALL)
3955 const_rtx x = *iter;
3956 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3957 return true;
3958 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3959 TLS offsets, not real symbol references. */
3960 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3961 iter.skip_subrtxes ();
3963 return false;
3967 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3968 a left shift of 0 or 12 bits. */
3969 bool
3970 aarch64_uimm12_shift (HOST_WIDE_INT val)
3972 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3973 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
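/* For example, 0xabc (shift 0) and 0xabc000 (shift 12) both satisfy this
   check, while 0xabc001 does not, since its set bits do not fit within a
   single shifted 12-bit field.  */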
3978 /* Return true if val is an immediate that can be loaded into a
3979 register by a MOVZ instruction. */
3980 static bool
3981 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3983 if (GET_MODE_SIZE (mode) > 4)
3985 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3986 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3987 return 1;
3989 else
3991 /* Ignore sign extension. */
3992 val &= (HOST_WIDE_INT) 0xffffffff;
3994 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3995 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
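/* For example, 0xbeef0000 is accepted (a 16-bit value at bit 16), and for
   64-bit modes so is 0xdead00000000 (a 16-bit value at bit 32), whereas
   0x10001 is rejected because its set bits straddle two 16-bit fields.  */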
3998 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4000 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4002 0x0000000100000001ull,
4003 0x0001000100010001ull,
4004 0x0101010101010101ull,
4005 0x1111111111111111ull,
4006 0x5555555555555555ull,
4010 /* Return true if val is a valid bitmask immediate. */
4012 bool
4013 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4015 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4016 int bits;
4018 /* Check for a single sequence of one bits and return quickly if so.
4019 The special cases of all ones and all zeroes return false. */
4020 val = (unsigned HOST_WIDE_INT) val_in;
4021 tmp = val + (val & -val);
4023 if (tmp == (tmp & -tmp))
4024 return (val + 1) > 1;
4026 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4027 if (mode == SImode)
4028 val = (val << 32) | (val & 0xffffffff);
4030 /* Invert if the immediate doesn't start with a zero bit - this means we
4031 only need to search for sequences of one bits. */
4032 if (val & 1)
4033 val = ~val;
4035 /* Find the first set bit and set tmp to val with the first sequence of one
4036 bits removed. Return success if there is a single sequence of ones. */
4037 first_one = val & -val;
4038 tmp = val & (val + first_one);
4040 if (tmp == 0)
4041 return true;
4043 /* Find the next set bit and compute the difference in bit position. */
4044 next_one = tmp & -tmp;
4045 bits = clz_hwi (first_one) - clz_hwi (next_one);
4046 mask = val ^ tmp;
4048 /* Check the bit position difference is a power of 2, and that the first
4049 sequence of one bits fits within 'bits' bits. */
4050 if ((mask >> bits) != 0 || bits != (bits & -bits))
4051 return false;
4053 /* Check the sequence of one bits is repeated 64/bits times. */
4054 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
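/* Worked example: 0x00ff00ff00ff00ff is a valid DImode bitmask immediate
   (a run of 8 ones repeated every 16 bits), whereas 0x1234 is not (its set
   bits are not contiguous), and 0 and ~0 are rejected by the quick check
   above.  */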
4057 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4058 Assumed precondition: VAL_IN is not zero. */
4060 unsigned HOST_WIDE_INT
4061 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4063 int lowest_bit_set = ctz_hwi (val_in);
4064 int highest_bit_set = floor_log2 (val_in);
4065 gcc_assert (val_in != 0);
4067 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4068 (HOST_WIDE_INT_1U << lowest_bit_set));
4071 /* Create a constant where all bits outside the range from the lowest set bit
4072 to the highest set bit of VAL_IN are set to 1. */
4074 unsigned HOST_WIDE_INT
4075 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4077 return val_in | ~aarch64_and_split_imm1 (val_in);
4080 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4082 bool
4083 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4085 if (aarch64_bitmask_imm (val_in, mode))
4086 return false;
4088 if (aarch64_move_imm (val_in, mode))
4089 return false;
4091 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4093 return aarch64_bitmask_imm (imm2, mode);
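/* Rough worked example: VAL_IN == 0xff00ff is neither a bitmask immediate
   nor a MOV immediate, but aarch64_and_split_imm1 gives 0xffffff and
   aarch64_and_split_imm2 gives 0xffffffffffff00ff, both of which are valid
   bitmask immediates, so an AND with 0xff00ff can be split into two AND
   instructions.  */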
4096 /* Return true if val is an immediate that can be loaded into a
4097 register in a single instruction. */
4098 bool
4099 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4101 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4102 return 1;
4103 return aarch64_bitmask_imm (val, mode);
4106 static bool
4107 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4109 rtx base, offset;
4111 if (GET_CODE (x) == HIGH)
4112 return true;
4114 split_const (x, &base, &offset);
4115 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4117 if (aarch64_classify_symbol (base, offset)
4118 != SYMBOL_FORCE_TO_MEM)
4119 return true;
4120 else
4121 /* Avoid generating a 64-bit relocation in ILP32; leave it
4122 to aarch64_expand_mov_immediate to handle properly. */
4123 return mode != ptr_mode;
4126 return aarch64_tls_referenced_p (x);
4129 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4130 The expansion for a table switch is quite expensive due to the number
4131 of instructions, the table lookup and the hard-to-predict indirect jump.
4132 When optimizing for speed at -O3 or higher, use the per-core tuning if
4133 set; otherwise use tables for more than 16 cases as a tradeoff between size and
4134 performance. When optimizing for size, use the default setting. */
4136 static unsigned int
4137 aarch64_case_values_threshold (void)
4139 /* Use the specified limit for the number of cases before using jump
4140 tables at higher optimization levels. */
4141 if (optimize > 2
4142 && selected_cpu->tune->max_case_values != 0)
4143 return selected_cpu->tune->max_case_values;
4144 else
4145 return optimize_size ? default_case_values_threshold () : 17;
4148 /* Return true if register REGNO is a valid index register.
4149 STRICT_P is true if REG_OK_STRICT is in effect. */
4151 bool
4152 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4154 if (!HARD_REGISTER_NUM_P (regno))
4156 if (!strict_p)
4157 return true;
4159 if (!reg_renumber)
4160 return false;
4162 regno = reg_renumber[regno];
4164 return GP_REGNUM_P (regno);
4167 /* Return true if register REGNO is a valid base register.
4168 STRICT_P is true if REG_OK_STRICT is in effect. */
4170 bool
4171 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4173 if (!HARD_REGISTER_NUM_P (regno))
4175 if (!strict_p)
4176 return true;
4178 if (!reg_renumber)
4179 return false;
4181 regno = reg_renumber[regno];
4184 /* The fake registers will be eliminated to either the stack or
4185 hard frame pointer, both of which are usually valid base registers.
4186 Reload deals with the cases where the eliminated form isn't valid. */
4187 return (GP_REGNUM_P (regno)
4188 || regno == SP_REGNUM
4189 || regno == FRAME_POINTER_REGNUM
4190 || regno == ARG_POINTER_REGNUM);
4193 /* Return true if X is a valid base register.
4194 STRICT_P is true if REG_OK_STRICT is in effect. */
4196 static bool
4197 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4199 if (!strict_p && GET_CODE (x) == SUBREG)
4200 x = SUBREG_REG (x);
4202 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4205 /* Return true if address offset is a valid index. If it is, fill in INFO
4206 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4208 static bool
4209 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4210 machine_mode mode, bool strict_p)
4212 enum aarch64_address_type type;
4213 rtx index;
4214 int shift;
4216 /* (reg:P) */
4217 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4218 && GET_MODE (x) == Pmode)
4220 type = ADDRESS_REG_REG;
4221 index = x;
4222 shift = 0;
4224 /* (sign_extend:DI (reg:SI)) */
4225 else if ((GET_CODE (x) == SIGN_EXTEND
4226 || GET_CODE (x) == ZERO_EXTEND)
4227 && GET_MODE (x) == DImode
4228 && GET_MODE (XEXP (x, 0)) == SImode)
4230 type = (GET_CODE (x) == SIGN_EXTEND)
4231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4232 index = XEXP (x, 0);
4233 shift = 0;
4235 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4236 else if (GET_CODE (x) == MULT
4237 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4238 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4239 && GET_MODE (XEXP (x, 0)) == DImode
4240 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4241 && CONST_INT_P (XEXP (x, 1)))
4243 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4244 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4245 index = XEXP (XEXP (x, 0), 0);
4246 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4248 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4249 else if (GET_CODE (x) == ASHIFT
4250 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4251 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4252 && GET_MODE (XEXP (x, 0)) == DImode
4253 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4254 && CONST_INT_P (XEXP (x, 1)))
4256 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4257 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4258 index = XEXP (XEXP (x, 0), 0);
4259 shift = INTVAL (XEXP (x, 1));
4261 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4262 else if ((GET_CODE (x) == SIGN_EXTRACT
4263 || GET_CODE (x) == ZERO_EXTRACT)
4264 && GET_MODE (x) == DImode
4265 && GET_CODE (XEXP (x, 0)) == MULT
4266 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4267 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4269 type = (GET_CODE (x) == SIGN_EXTRACT)
4270 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4271 index = XEXP (XEXP (x, 0), 0);
4272 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4273 if (INTVAL (XEXP (x, 1)) != 32 + shift
4274 || INTVAL (XEXP (x, 2)) != 0)
4275 shift = -1;
4277 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4278 (const_int 0xffffffff<<shift)) */
4279 else if (GET_CODE (x) == AND
4280 && GET_MODE (x) == DImode
4281 && GET_CODE (XEXP (x, 0)) == MULT
4282 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4283 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4284 && CONST_INT_P (XEXP (x, 1)))
4286 type = ADDRESS_REG_UXTW;
4287 index = XEXP (XEXP (x, 0), 0);
4288 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4289 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4290 shift = -1;
4292 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4293 else if ((GET_CODE (x) == SIGN_EXTRACT
4294 || GET_CODE (x) == ZERO_EXTRACT)
4295 && GET_MODE (x) == DImode
4296 && GET_CODE (XEXP (x, 0)) == ASHIFT
4297 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4298 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4300 type = (GET_CODE (x) == SIGN_EXTRACT)
4301 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4302 index = XEXP (XEXP (x, 0), 0);
4303 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4304 if (INTVAL (XEXP (x, 1)) != 32 + shift
4305 || INTVAL (XEXP (x, 2)) != 0)
4306 shift = -1;
4308 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4309 (const_int 0xffffffff<<shift)) */
4310 else if (GET_CODE (x) == AND
4311 && GET_MODE (x) == DImode
4312 && GET_CODE (XEXP (x, 0)) == ASHIFT
4313 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4314 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4315 && CONST_INT_P (XEXP (x, 1)))
4317 type = ADDRESS_REG_UXTW;
4318 index = XEXP (XEXP (x, 0), 0);
4319 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4320 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4321 shift = -1;
4323 /* (mult:P (reg:P) (const_int scale)) */
4324 else if (GET_CODE (x) == MULT
4325 && GET_MODE (x) == Pmode
4326 && GET_MODE (XEXP (x, 0)) == Pmode
4327 && CONST_INT_P (XEXP (x, 1)))
4329 type = ADDRESS_REG_REG;
4330 index = XEXP (x, 0);
4331 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4333 /* (ashift:P (reg:P) (const_int shift)) */
4334 else if (GET_CODE (x) == ASHIFT
4335 && GET_MODE (x) == Pmode
4336 && GET_MODE (XEXP (x, 0)) == Pmode
4337 && CONST_INT_P (XEXP (x, 1)))
4339 type = ADDRESS_REG_REG;
4340 index = XEXP (x, 0);
4341 shift = INTVAL (XEXP (x, 1));
4343 else
4344 return false;
4346 if (GET_CODE (index) == SUBREG)
4347 index = SUBREG_REG (index);
4349 if ((shift == 0 ||
4350 (shift > 0 && shift <= 3
4351 && (1 << shift) == GET_MODE_SIZE (mode)))
4352 && REG_P (index)
4353 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4355 info->type = type;
4356 info->offset = index;
4357 info->shift = shift;
4358 return true;
4361 return false;
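/* For instance, for an SImode access the index expression
   (ashift:DI (sign_extend:DI (reg:SI x1)) (const_int 2)) is accepted as
   ADDRESS_REG_SXTW with shift 2, since (1 << 2) matches the access size;
   it ends up printed as something like [x0, w1, sxtw 2].  */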
4364 /* Return true if MODE is one of the modes for which we
4365 support LDP/STP operations. */
4367 static bool
4368 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4370 return mode == SImode || mode == DImode
4371 || mode == SFmode || mode == DFmode
4372 || (aarch64_vector_mode_supported_p (mode)
4373 && GET_MODE_SIZE (mode) == 8);
4376 /* Return true if REGNO is a virtual pointer register, or an eliminable
4377 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4378 include stack_pointer or hard_frame_pointer. */
4379 static bool
4380 virt_or_elim_regno_p (unsigned regno)
4382 return ((regno >= FIRST_VIRTUAL_REGISTER
4383 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4384 || regno == FRAME_POINTER_REGNUM
4385 || regno == ARG_POINTER_REGNUM);
4388 /* Return true if X is a valid address for machine mode MODE. If it is,
4389 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4390 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4392 static bool
4393 aarch64_classify_address (struct aarch64_address_info *info,
4394 rtx x, machine_mode mode,
4395 RTX_CODE outer_code, bool strict_p)
4397 enum rtx_code code = GET_CODE (x);
4398 rtx op0, op1;
4400 /* On BE, we use load/store pair for all large int mode load/stores.
4401 TI/TFmode may also use a load/store pair. */
4402 bool load_store_pair_p = (outer_code == PARALLEL
4403 || mode == TImode
4404 || mode == TFmode
4405 || (BYTES_BIG_ENDIAN
4406 && aarch64_vect_struct_mode_p (mode)));
4408 bool allow_reg_index_p =
4409 !load_store_pair_p
4410 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4411 && !aarch64_vect_struct_mode_p (mode);
4413 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4414 REG addressing. */
4415 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4416 && (code != POST_INC && code != REG))
4417 return false;
4419 switch (code)
4421 case REG:
4422 case SUBREG:
4423 info->type = ADDRESS_REG_IMM;
4424 info->base = x;
4425 info->offset = const0_rtx;
4426 return aarch64_base_register_rtx_p (x, strict_p);
4428 case PLUS:
4429 op0 = XEXP (x, 0);
4430 op1 = XEXP (x, 1);
4432 if (! strict_p
4433 && REG_P (op0)
4434 && virt_or_elim_regno_p (REGNO (op0))
4435 && CONST_INT_P (op1))
4437 info->type = ADDRESS_REG_IMM;
4438 info->base = op0;
4439 info->offset = op1;
4441 return true;
4444 if (GET_MODE_SIZE (mode) != 0
4445 && CONST_INT_P (op1)
4446 && aarch64_base_register_rtx_p (op0, strict_p))
4448 HOST_WIDE_INT offset = INTVAL (op1);
4450 info->type = ADDRESS_REG_IMM;
4451 info->base = op0;
4452 info->offset = op1;
4454 /* TImode and TFmode values are allowed in both pairs of X
4455 registers and individual Q registers. The available
4456 address modes are:
4457 X,X: 7-bit signed scaled offset
4458 Q: 9-bit signed offset
4459 We conservatively require an offset representable in either mode.
4460 When performing the check for pairs of X registers i.e. LDP/STP
4461 pass down DImode since that is the natural size of the LDP/STP
4462 instruction memory accesses. */
4463 if (mode == TImode || mode == TFmode)
4464 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4465 && (offset_9bit_signed_unscaled_p (mode, offset)
4466 || offset_12bit_unsigned_scaled_p (mode, offset)));
4468 /* A 7-bit offset check because OImode will emit an ldp/stp
4469 instruction (only big endian will get here).
4470 For ldp/stp instructions, the offset is scaled for the size of a
4471 single element of the pair. */
4472 if (mode == OImode)
4473 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4475 /* Three 9/12-bit offset checks because CImode will emit three
4476 ldr/str instructions (only big endian will get here). */
4477 if (mode == CImode)
4478 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4479 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4480 || offset_12bit_unsigned_scaled_p (V16QImode,
4481 offset + 32)));
4483 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4484 instructions (only big endian will get here). */
4485 if (mode == XImode)
4486 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4487 && aarch64_offset_7bit_signed_scaled_p (TImode,
4488 offset + 32));
4490 if (load_store_pair_p)
4491 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4492 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4493 else
4494 return (offset_9bit_signed_unscaled_p (mode, offset)
4495 || offset_12bit_unsigned_scaled_p (mode, offset));
4498 if (allow_reg_index_p)
4500 /* Look for base + (scaled/extended) index register. */
4501 if (aarch64_base_register_rtx_p (op0, strict_p)
4502 && aarch64_classify_index (info, op1, mode, strict_p))
4504 info->base = op0;
4505 return true;
4507 if (aarch64_base_register_rtx_p (op1, strict_p)
4508 && aarch64_classify_index (info, op0, mode, strict_p))
4510 info->base = op1;
4511 return true;
4515 return false;
4517 case POST_INC:
4518 case POST_DEC:
4519 case PRE_INC:
4520 case PRE_DEC:
4521 info->type = ADDRESS_REG_WB;
4522 info->base = XEXP (x, 0);
4523 info->offset = NULL_RTX;
4524 return aarch64_base_register_rtx_p (info->base, strict_p);
4526 case POST_MODIFY:
4527 case PRE_MODIFY:
4528 info->type = ADDRESS_REG_WB;
4529 info->base = XEXP (x, 0);
4530 if (GET_CODE (XEXP (x, 1)) == PLUS
4531 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4532 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4533 && aarch64_base_register_rtx_p (info->base, strict_p))
4535 HOST_WIDE_INT offset;
4536 info->offset = XEXP (XEXP (x, 1), 1);
4537 offset = INTVAL (info->offset);
4539 /* TImode and TFmode values are allowed in both pairs of X
4540 registers and individual Q registers. The available
4541 address modes are:
4542 X,X: 7-bit signed scaled offset
4543 Q: 9-bit signed offset
4544 We conservatively require an offset representable in either mode. */
4546 if (mode == TImode || mode == TFmode)
4547 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4548 && offset_9bit_signed_unscaled_p (mode, offset));
4550 if (load_store_pair_p)
4551 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4552 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4553 else
4554 return offset_9bit_signed_unscaled_p (mode, offset);
4556 return false;
4558 case CONST:
4559 case SYMBOL_REF:
4560 case LABEL_REF:
4561 /* load literal: pc-relative constant pool entry. Only supported
4562 for SI mode or larger. */
4563 info->type = ADDRESS_SYMBOLIC;
4565 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4567 rtx sym, addend;
4569 split_const (x, &sym, &addend);
4570 return ((GET_CODE (sym) == LABEL_REF
4571 || (GET_CODE (sym) == SYMBOL_REF
4572 && CONSTANT_POOL_ADDRESS_P (sym)
4573 && aarch64_pcrelative_literal_loads)));
4575 return false;
4577 case LO_SUM:
4578 info->type = ADDRESS_LO_SUM;
4579 info->base = XEXP (x, 0);
4580 info->offset = XEXP (x, 1);
4581 if (allow_reg_index_p
4582 && aarch64_base_register_rtx_p (info->base, strict_p))
4584 rtx sym, offs;
4585 split_const (info->offset, &sym, &offs);
4586 if (GET_CODE (sym) == SYMBOL_REF
4587 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4589 /* The symbol and offset must be aligned to the access size. */
4590 unsigned int align;
4591 unsigned int ref_size;
4593 if (CONSTANT_POOL_ADDRESS_P (sym))
4594 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4595 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4597 tree exp = SYMBOL_REF_DECL (sym);
4598 align = TYPE_ALIGN (TREE_TYPE (exp));
4599 align = CONSTANT_ALIGNMENT (exp, align);
4601 else if (SYMBOL_REF_DECL (sym))
4602 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4603 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4604 && SYMBOL_REF_BLOCK (sym) != NULL)
4605 align = SYMBOL_REF_BLOCK (sym)->alignment;
4606 else
4607 align = BITS_PER_UNIT;
4609 ref_size = GET_MODE_SIZE (mode);
4610 if (ref_size == 0)
4611 ref_size = GET_MODE_SIZE (DImode);
4613 return ((INTVAL (offs) & (ref_size - 1)) == 0
4614 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4617 return false;
4619 default:
4620 return false;
4624 /* Return true if the address X is valid for a PRFM instruction.
4625 STRICT_P is true if we should do strict checking with
4626 aarch64_classify_address. */
4628 bool
4629 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4631 struct aarch64_address_info addr;
4633 /* PRFM accepts the same addresses as DImode... */
4634 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4635 if (!res)
4636 return false;
4638 /* ... except writeback forms. */
4639 return addr.type != ADDRESS_REG_WB;
4642 bool
4643 aarch64_symbolic_address_p (rtx x)
4645 rtx offset;
4647 split_const (x, &x, &offset);
4648 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4651 /* Classify the base of symbolic expression X. */
4653 enum aarch64_symbol_type
4654 aarch64_classify_symbolic_expression (rtx x)
4656 rtx offset;
4658 split_const (x, &x, &offset);
4659 return aarch64_classify_symbol (x, offset);
4663 /* Return TRUE if X is a legitimate address for accessing memory in
4664 mode MODE. */
4665 static bool
4666 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4668 struct aarch64_address_info addr;
4670 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4673 /* Return TRUE if X is a legitimate address for accessing memory in
4674 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4675 pair operation. */
4676 bool
4677 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4678 RTX_CODE outer_code, bool strict_p)
4680 struct aarch64_address_info addr;
4682 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4685 /* Split an out-of-range address displacement into a base and offset.
4686 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4687 to increase opportunities for sharing a base address between accesses of different sizes.
4688 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4689 static bool
4690 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4692 HOST_WIDE_INT offset = INTVAL (*disp);
4693 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4695 if (mode == TImode || mode == TFmode
4696 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4697 base = (offset + 0x100) & ~0x1ff;
4699 *off = GEN_INT (base);
4700 *disp = GEN_INT (offset - base);
4701 return true;
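/* For example, an SImode access at offset 0x12344 is split into a base
   part of 0x10000 and a residual displacement of 0x2344, which fits the
   scaled 12-bit immediate range, so several nearby accesses can share the
   anchor at base + 0x10000.  */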
4704 /* Return the binary representation of floating point constant VALUE in INTVAL.
4705 If the value cannot be converted, return false without setting INTVAL.
4706 The conversion is done in the given MODE. */
4707 bool
4708 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4711 /* We make a general exception for 0. */
4712 if (aarch64_float_const_zero_rtx_p (value))
4714 *intval = 0;
4715 return true;
4718 machine_mode mode = GET_MODE (value);
4719 if (GET_CODE (value) != CONST_DOUBLE
4720 || !SCALAR_FLOAT_MODE_P (mode)
4721 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4722 /* Only support up to DF mode. */
4723 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4724 return false;
4726 unsigned HOST_WIDE_INT ival = 0;
4728 long res[2];
4729 real_to_target (res,
4730 CONST_DOUBLE_REAL_VALUE (value),
4731 REAL_MODE_FORMAT (mode));
4733 if (mode == DFmode)
4735 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4736 ival = zext_hwi (res[order], 32);
4737 ival |= (zext_hwi (res[1 - order], 32) << 32);
4739 else
4740 ival = zext_hwi (res[0], 32);
4742 *intval = ival;
4743 return true;
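/* For example, a DFmode CONST_DOUBLE holding 1.0 yields
   0x3ff0000000000000, and an SFmode 1.0 yields 0x3f800000.  */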
4746 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4747 single MOV(+MOVK) followed by an FMOV. */
4748 bool
4749 aarch64_float_const_rtx_p (rtx x)
4751 machine_mode mode = GET_MODE (x);
4752 if (mode == VOIDmode)
4753 return false;
4755 /* Determine whether it's cheaper to write float constants as
4756 mov/movk pairs over ldr/adrp pairs. */
4757 unsigned HOST_WIDE_INT ival;
4759 if (GET_CODE (x) == CONST_DOUBLE
4760 && SCALAR_FLOAT_MODE_P (mode)
4761 && aarch64_reinterpret_float_as_int (x, &ival))
4763 machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
4764 int num_instr = aarch64_internal_mov_immediate
4765 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4766 return num_instr < 3;
4769 return false;
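/* For instance, DFmode 1.0 reinterprets to 0x3ff0000000000000, which is
   0x3ff0 shifted left by 48 and so needs only a single MOVZ; it is then
   considered cheaper to materialise in a GP register and FMOV across than
   to load from the literal pool.  */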
4772 /* Return TRUE if rtx X is the immediate constant 0.0. */
4773 bool
4774 aarch64_float_const_zero_rtx_p (rtx x)
4776 if (GET_MODE (x) == VOIDmode)
4777 return false;
4779 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4780 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4781 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4784 /* Return TRUE if rtx X is an immediate constant that fits in a single
4785 MOVI immediate operation. */
4786 bool
4787 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4789 if (!TARGET_SIMD)
4790 return false;
4792 machine_mode vmode, imode;
4793 unsigned HOST_WIDE_INT ival;
4795 if (GET_CODE (x) == CONST_DOUBLE
4796 && SCALAR_FLOAT_MODE_P (mode))
4798 if (!aarch64_reinterpret_float_as_int (x, &ival))
4799 return false;
4801 /* We make a general exception for 0. */
4802 if (aarch64_float_const_zero_rtx_p (x))
4803 return true;
4805 imode = int_mode_for_mode (mode);
4807 else if (GET_CODE (x) == CONST_INT
4808 && SCALAR_INT_MODE_P (mode))
4810 imode = mode;
4811 ival = INTVAL (x);
4813 else
4814 return false;
4816 /* Use a 64-bit vector mode for everything except for DI/DF mode, where we use
4817 a 128-bit vector mode. */
4818 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4820 vmode = aarch64_simd_container_mode (imode, width);
4821 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4823 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4827 /* Return the fixed registers used for condition codes. */
4829 static bool
4830 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4832 *p1 = CC_REGNUM;
4833 *p2 = INVALID_REGNUM;
4834 return true;
4837 /* This function is used by the call expanders of the machine description.
4838 RESULT is the register in which the result is returned. It's NULL for
4839 "call" and "sibcall".
4840 MEM is the location of the function call.
4841 SIBCALL indicates whether this function call is a normal call or a sibling call;
4842 the generated pattern differs accordingly. */
4844 void
4845 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4847 rtx call, callee, tmp;
4848 rtvec vec;
4849 machine_mode mode;
4851 gcc_assert (MEM_P (mem));
4852 callee = XEXP (mem, 0);
4853 mode = GET_MODE (callee);
4854 gcc_assert (mode == Pmode);
4856 /* Decide if we should generate indirect calls by loading the
4857 address of the callee into a register before performing
4858 the branch-and-link. */
4859 if (SYMBOL_REF_P (callee)
4860 ? (aarch64_is_long_call_p (callee)
4861 || aarch64_is_noplt_call_p (callee))
4862 : !REG_P (callee))
4863 XEXP (mem, 0) = force_reg (mode, callee);
4865 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4867 if (result != NULL_RTX)
4868 call = gen_rtx_SET (result, call);
4870 if (sibcall)
4871 tmp = ret_rtx;
4872 else
4873 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4875 vec = gen_rtvec (2, call, tmp);
4876 call = gen_rtx_PARALLEL (VOIDmode, vec);
4878 aarch64_emit_call_insn (call);
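/* The resulting pattern is a PARALLEL of the call (wrapped in a SET when a
   result register is given) together with either a (return) for sibcalls
   or a clobber of the link register for normal calls.  */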
4881 /* Emit call insn with PAT and do aarch64-specific handling. */
4883 void
4884 aarch64_emit_call_insn (rtx pat)
4886 rtx insn = emit_call_insn (pat);
4888 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4889 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4890 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4893 machine_mode
4894 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4896 /* All floating point compares return CCFP if it is an equality
4897 comparison, and CCFPE otherwise. */
4898 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4900 switch (code)
4902 case EQ:
4903 case NE:
4904 case UNORDERED:
4905 case ORDERED:
4906 case UNLT:
4907 case UNLE:
4908 case UNGT:
4909 case UNGE:
4910 case UNEQ:
4911 case LTGT:
4912 return CCFPmode;
4914 case LT:
4915 case LE:
4916 case GT:
4917 case GE:
4918 return CCFPEmode;
4920 default:
4921 gcc_unreachable ();
4925 /* Equality comparisons of short modes against zero can be performed
4926 using the TST instruction with the appropriate bitmask. */
4927 if (y == const0_rtx && REG_P (x)
4928 && (code == EQ || code == NE)
4929 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4930 return CC_NZmode;
4932 /* Similarly, comparisons of zero_extends from shorter modes can
4933 be performed using an ANDS with an immediate mask. */
4934 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4935 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4936 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4937 && (code == EQ || code == NE))
4938 return CC_NZmode;
4940 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4941 && y == const0_rtx
4942 && (code == EQ || code == NE || code == LT || code == GE)
4943 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4944 || GET_CODE (x) == NEG
4945 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4946 && CONST_INT_P (XEXP (x, 2)))))
4947 return CC_NZmode;
4949 /* A compare with a shifted operand. Because of canonicalization,
4950 the comparison will have to be swapped when we emit the assembly
4951 code. */
4952 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4953 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4954 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4955 || GET_CODE (x) == LSHIFTRT
4956 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4957 return CC_SWPmode;
4959 /* Similarly for a negated operand, but we can only do this for
4960 equalities. */
4961 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4962 && (REG_P (y) || GET_CODE (y) == SUBREG)
4963 && (code == EQ || code == NE)
4964 && GET_CODE (x) == NEG)
4965 return CC_Zmode;
4967 /* A test for unsigned overflow. */
4968 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4969 && code == NE
4970 && GET_CODE (x) == PLUS
4971 && GET_CODE (y) == ZERO_EXTEND)
4972 return CC_Cmode;
4974 /* For everything else, return CCmode. */
4975 return CCmode;
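/* For example, (compare (and:SI x0 (const_int 255)) (const_int 0)) under
   EQ selects CC_NZmode, a compare of (ashift:DI x0 (const_int 3)) against
   a register selects CC_SWPmode, and floating-point equality selects
   CCFPmode while the ordered relational compares select CCFPEmode.  */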
4978 static int
4979 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4982 aarch64_get_condition_code (rtx x)
4984 machine_mode mode = GET_MODE (XEXP (x, 0));
4985 enum rtx_code comp_code = GET_CODE (x);
4987 if (GET_MODE_CLASS (mode) != MODE_CC)
4988 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4989 return aarch64_get_condition_code_1 (mode, comp_code);
4992 static int
4993 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
4995 switch (mode)
4997 case E_CCFPmode:
4998 case E_CCFPEmode:
4999 switch (comp_code)
5001 case GE: return AARCH64_GE;
5002 case GT: return AARCH64_GT;
5003 case LE: return AARCH64_LS;
5004 case LT: return AARCH64_MI;
5005 case NE: return AARCH64_NE;
5006 case EQ: return AARCH64_EQ;
5007 case ORDERED: return AARCH64_VC;
5008 case UNORDERED: return AARCH64_VS;
5009 case UNLT: return AARCH64_LT;
5010 case UNLE: return AARCH64_LE;
5011 case UNGT: return AARCH64_HI;
5012 case UNGE: return AARCH64_PL;
5013 default: return -1;
5015 break;
5017 case E_CCmode:
5018 switch (comp_code)
5020 case NE: return AARCH64_NE;
5021 case EQ: return AARCH64_EQ;
5022 case GE: return AARCH64_GE;
5023 case GT: return AARCH64_GT;
5024 case LE: return AARCH64_LE;
5025 case LT: return AARCH64_LT;
5026 case GEU: return AARCH64_CS;
5027 case GTU: return AARCH64_HI;
5028 case LEU: return AARCH64_LS;
5029 case LTU: return AARCH64_CC;
5030 default: return -1;
5032 break;
5034 case E_CC_SWPmode:
5035 switch (comp_code)
5037 case NE: return AARCH64_NE;
5038 case EQ: return AARCH64_EQ;
5039 case GE: return AARCH64_LE;
5040 case GT: return AARCH64_LT;
5041 case LE: return AARCH64_GE;
5042 case LT: return AARCH64_GT;
5043 case GEU: return AARCH64_LS;
5044 case GTU: return AARCH64_CC;
5045 case LEU: return AARCH64_CS;
5046 case LTU: return AARCH64_HI;
5047 default: return -1;
5049 break;
5051 case E_CC_NZmode:
5052 switch (comp_code)
5054 case NE: return AARCH64_NE;
5055 case EQ: return AARCH64_EQ;
5056 case GE: return AARCH64_PL;
5057 case LT: return AARCH64_MI;
5058 default: return -1;
5060 break;
5062 case E_CC_Zmode:
5063 switch (comp_code)
5065 case NE: return AARCH64_NE;
5066 case EQ: return AARCH64_EQ;
5067 default: return -1;
5069 break;
5071 case E_CC_Cmode:
5072 switch (comp_code)
5074 case NE: return AARCH64_CS;
5075 case EQ: return AARCH64_CC;
5076 default: return -1;
5078 break;
5080 default:
5081 return -1;
5084 return -1;
5087 bool
5088 aarch64_const_vec_all_same_in_range_p (rtx x,
5089 HOST_WIDE_INT minval,
5090 HOST_WIDE_INT maxval)
5092 HOST_WIDE_INT firstval;
5093 int count, i;
5095 if (GET_CODE (x) != CONST_VECTOR
5096 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5097 return false;
5099 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5100 if (firstval < minval || firstval > maxval)
5101 return false;
5103 count = CONST_VECTOR_NUNITS (x);
5104 for (i = 1; i < count; i++)
5105 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5106 return false;
5108 return true;
5111 bool
5112 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5114 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5118 /* N Z C V. */
5119 #define AARCH64_CC_V 1
5120 #define AARCH64_CC_C (1 << 1)
5121 #define AARCH64_CC_Z (1 << 2)
5122 #define AARCH64_CC_N (1 << 3)
5124 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5125 static const int aarch64_nzcv_codes[] =
5127 0, /* EQ, Z == 1. */
5128 AARCH64_CC_Z, /* NE, Z == 0. */
5129 0, /* CS, C == 1. */
5130 AARCH64_CC_C, /* CC, C == 0. */
5131 0, /* MI, N == 1. */
5132 AARCH64_CC_N, /* PL, N == 0. */
5133 0, /* VS, V == 1. */
5134 AARCH64_CC_V, /* VC, V == 0. */
5135 0, /* HI, C == 1 && Z == 0. */
5136 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5137 AARCH64_CC_V, /* GE, N == V. */
5138 0, /* LT, N != V. */
5139 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5140 0, /* LE, !(Z == 0 && N == V). */
5141 0, /* AL, Any. */
5142 0 /* NV, Any. */
5145 /* Print operand X to file F in a target specific manner according to CODE.
5146 The acceptable formatting commands given by CODE are:
5147 'c': An integer or symbol address without a preceding #
5148 sign.
5149 'e': Print the sign/zero-extend size as a character 8->b,
5150 16->h, 32->w.
5151 'p': Prints N such that 2^N == X (X must be power of 2 and
5152 const int).
5153 'P': Print the number of non-zero bits in X (a const_int).
5154 'H': Print the higher numbered register of a pair (TImode)
5155 of regs.
5156 'm': Print a condition (eq, ne, etc).
5157 'M': Same as 'm', but invert condition.
5158 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5159 'S/T/U/V': Print a FP/SIMD register name for a register list.
5160 The register printed is the FP/SIMD register name
5161 of X + 0/1/2/3 for S/T/U/V.
5162 'R': Print a scalar FP/SIMD register name + 1.
5163 'X': Print bottom 16 bits of integer constant in hex.
5164 'w/x': Print a general register name or the zero register
5165 (32-bit or 64-bit).
5166 '0': Print a normal operand, if it's a general register,
5167 then we assume DImode.
5168 'k': Print NZCV for conditional compare instructions.
5169 'A': Output address constant representing the first
5170 argument of X, specifying a relocation offset
5171 if appropriate.
5172 'L': Output constant address specified by X
5173 with a relocation offset if appropriate.
5174 'G': Prints address of X, specifying a PC relative
5175 relocation mode if appropriate. */
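/* For example, with operand 0 in general register 5, "%w0" prints "w5"
   and "%x0" prints "x5", while a const0_rtx operand prints as "wzr" or
   "xzr" respectively.  */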
5177 static void
5178 aarch64_print_operand (FILE *f, rtx x, int code)
5180 switch (code)
5182 case 'c':
5183 switch (GET_CODE (x))
5185 case CONST_INT:
5186 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5187 break;
5189 case SYMBOL_REF:
5190 output_addr_const (f, x);
5191 break;
5193 case CONST:
5194 if (GET_CODE (XEXP (x, 0)) == PLUS
5195 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5197 output_addr_const (f, x);
5198 break;
5200 /* Fall through. */
5202 default:
5203 output_operand_lossage ("Unsupported operand for code '%c'", code);
5205 break;
5207 case 'e':
5209 int n;
5211 if (!CONST_INT_P (x)
5212 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5214 output_operand_lossage ("invalid operand for '%%%c'", code);
5215 return;
5218 switch (n)
5220 case 3:
5221 fputc ('b', f);
5222 break;
5223 case 4:
5224 fputc ('h', f);
5225 break;
5226 case 5:
5227 fputc ('w', f);
5228 break;
5229 default:
5230 output_operand_lossage ("invalid operand for '%%%c'", code);
5231 return;
5234 break;
5236 case 'p':
5238 int n;
5240 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5242 output_operand_lossage ("invalid operand for '%%%c'", code);
5243 return;
5246 asm_fprintf (f, "%d", n);
5248 break;
5250 case 'P':
5251 if (!CONST_INT_P (x))
5253 output_operand_lossage ("invalid operand for '%%%c'", code);
5254 return;
5257 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5258 break;
5260 case 'H':
5261 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5263 output_operand_lossage ("invalid operand for '%%%c'", code);
5264 return;
5267 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5268 break;
5270 case 'M':
5271 case 'm':
5273 int cond_code;
5274 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5275 if (x == const_true_rtx)
5277 if (code == 'M')
5278 fputs ("nv", f);
5279 return;
5282 if (!COMPARISON_P (x))
5284 output_operand_lossage ("invalid operand for '%%%c'", code);
5285 return;
5288 cond_code = aarch64_get_condition_code (x);
5289 gcc_assert (cond_code >= 0);
5290 if (code == 'M')
5291 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5292 fputs (aarch64_condition_codes[cond_code], f);
5294 break;
5296 case 'b':
5297 case 'h':
5298 case 's':
5299 case 'd':
5300 case 'q':
5301 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5303 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5304 return;
5306 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5307 break;
5309 case 'S':
5310 case 'T':
5311 case 'U':
5312 case 'V':
5313 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5315 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5316 return;
5318 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5319 break;
5321 case 'R':
5322 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5324 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5325 return;
5327 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5328 break;
5330 case 'X':
5331 if (!CONST_INT_P (x))
5333 output_operand_lossage ("invalid operand for '%%%c'", code);
5334 return;
5336 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5337 break;
5339 case 'w':
5340 case 'x':
5341 if (x == const0_rtx
5342 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5344 asm_fprintf (f, "%czr", code);
5345 break;
5348 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5350 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5351 break;
5354 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5356 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5357 break;
5360 /* Fall through */
5362 case 0:
5363 if (x == NULL)
5365 output_operand_lossage ("missing operand");
5366 return;
5369 switch (GET_CODE (x))
5371 case REG:
5372 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5373 break;
5375 case MEM:
5376 output_address (GET_MODE (x), XEXP (x, 0));
5377 /* Check all memory references are Pmode - even with ILP32. */
5378 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5379 break;
5381 case CONST:
5382 case LABEL_REF:
5383 case SYMBOL_REF:
5384 output_addr_const (asm_out_file, x);
5385 break;
5387 case CONST_INT:
5388 asm_fprintf (f, "%wd", INTVAL (x));
5389 break;
5391 case CONST_VECTOR:
5392 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5394 gcc_assert (
5395 aarch64_const_vec_all_same_in_range_p (x,
5396 HOST_WIDE_INT_MIN,
5397 HOST_WIDE_INT_MAX));
5398 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5400 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5402 fputc ('0', f);
5404 else
5405 gcc_unreachable ();
5406 break;
5408 case CONST_DOUBLE:
5409 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5410 be getting CONST_DOUBLEs holding integers. */
5411 gcc_assert (GET_MODE (x) != VOIDmode);
5412 if (aarch64_float_const_zero_rtx_p (x))
5414 fputc ('0', f);
5415 break;
5417 else if (aarch64_float_const_representable_p (x))
5419 #define buf_size 20
5420 char float_buf[buf_size] = {'\0'};
5421 real_to_decimal_for_mode (float_buf,
5422 CONST_DOUBLE_REAL_VALUE (x),
5423 buf_size, buf_size,
5424 1, GET_MODE (x));
5425 asm_fprintf (asm_out_file, "%s", float_buf);
5426 break;
5427 #undef buf_size
5429 output_operand_lossage ("invalid constant");
5430 return;
5431 default:
5432 output_operand_lossage ("invalid operand");
5433 return;
5435 break;
5437 case 'A':
5438 if (GET_CODE (x) == HIGH)
5439 x = XEXP (x, 0);
5441 switch (aarch64_classify_symbolic_expression (x))
5443 case SYMBOL_SMALL_GOT_4G:
5444 asm_fprintf (asm_out_file, ":got:");
5445 break;
5447 case SYMBOL_SMALL_TLSGD:
5448 asm_fprintf (asm_out_file, ":tlsgd:");
5449 break;
5451 case SYMBOL_SMALL_TLSDESC:
5452 asm_fprintf (asm_out_file, ":tlsdesc:");
5453 break;
5455 case SYMBOL_SMALL_TLSIE:
5456 asm_fprintf (asm_out_file, ":gottprel:");
5457 break;
5459 case SYMBOL_TLSLE24:
5460 asm_fprintf (asm_out_file, ":tprel:");
5461 break;
5463 case SYMBOL_TINY_GOT:
5464 gcc_unreachable ();
5465 break;
5467 default:
5468 break;
5470 output_addr_const (asm_out_file, x);
5471 break;
5473 case 'L':
5474 switch (aarch64_classify_symbolic_expression (x))
5476 case SYMBOL_SMALL_GOT_4G:
5477 asm_fprintf (asm_out_file, ":lo12:");
5478 break;
5480 case SYMBOL_SMALL_TLSGD:
5481 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5482 break;
5484 case SYMBOL_SMALL_TLSDESC:
5485 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5486 break;
5488 case SYMBOL_SMALL_TLSIE:
5489 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5490 break;
5492 case SYMBOL_TLSLE12:
5493 asm_fprintf (asm_out_file, ":tprel_lo12:");
5494 break;
5496 case SYMBOL_TLSLE24:
5497 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5498 break;
5500 case SYMBOL_TINY_GOT:
5501 asm_fprintf (asm_out_file, ":got:");
5502 break;
5504 case SYMBOL_TINY_TLSIE:
5505 asm_fprintf (asm_out_file, ":gottprel:");
5506 break;
5508 default:
5509 break;
5511 output_addr_const (asm_out_file, x);
5512 break;
5514 case 'G':
5515 switch (aarch64_classify_symbolic_expression (x))
5517 case SYMBOL_TLSLE24:
5518 asm_fprintf (asm_out_file, ":tprel_hi12:");
5519 break;
5520 default:
5521 break;
5523 output_addr_const (asm_out_file, x);
5524 break;
5526 case 'k':
5528 HOST_WIDE_INT cond_code;
5530 if (!CONST_INT_P (x))
5532 output_operand_lossage ("invalid operand for '%%%c'", code);
5533 return;
5536 cond_code = INTVAL (x);
5537 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5538 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5540 break;
5542 default:
5543 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5544 return;
5548 static void
5549 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5551 struct aarch64_address_info addr;
5553 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5554 switch (addr.type)
5556 case ADDRESS_REG_IMM:
5557 if (addr.offset == const0_rtx)
5558 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5559 else
5560 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5561 INTVAL (addr.offset));
5562 return;
5564 case ADDRESS_REG_REG:
5565 if (addr.shift == 0)
5566 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5567 reg_names [REGNO (addr.offset)]);
5568 else
5569 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5570 reg_names [REGNO (addr.offset)], addr.shift);
5571 return;
5573 case ADDRESS_REG_UXTW:
5574 if (addr.shift == 0)
5575 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5576 REGNO (addr.offset) - R0_REGNUM);
5577 else
5578 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5579 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5580 return;
5582 case ADDRESS_REG_SXTW:
5583 if (addr.shift == 0)
5584 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5585 REGNO (addr.offset) - R0_REGNUM);
5586 else
5587 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5588 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5589 return;
5591 case ADDRESS_REG_WB:
5592 switch (GET_CODE (x))
5594 case PRE_INC:
5595 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5596 GET_MODE_SIZE (mode));
5597 return;
5598 case POST_INC:
5599 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5600 GET_MODE_SIZE (mode));
5601 return;
5602 case PRE_DEC:
5603 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5604 GET_MODE_SIZE (mode));
5605 return;
5606 case POST_DEC:
5607 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5608 GET_MODE_SIZE (mode));
5609 return;
5610 case PRE_MODIFY:
5611 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5612 INTVAL (addr.offset));
5613 return;
5614 case POST_MODIFY:
5615 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5616 INTVAL (addr.offset));
5617 return;
5618 default:
5619 break;
5621 break;
5623 case ADDRESS_LO_SUM:
5624 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5625 output_addr_const (f, addr.offset);
5626 asm_fprintf (f, "]");
5627 return;
5629 case ADDRESS_SYMBOLIC:
5630 break;
5633 output_addr_const (f, x);
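/* Typical outputs from the above are forms such as "[x0]", "[x0, 16]",
   "[x0, x1, lsl 3]", "[x0, w1, sxtw 2]", "[x0, 16]!", "[x0], 16" and
   "[x0, #:lo12:sym]".  */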
5636 bool
5637 aarch64_label_mentioned_p (rtx x)
5639 const char *fmt;
5640 int i;
5642 if (GET_CODE (x) == LABEL_REF)
5643 return true;
5645 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5646 referencing instruction, but they are constant offsets, not
5647 symbols. */
5648 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5649 return false;
5651 fmt = GET_RTX_FORMAT (GET_CODE (x));
5652 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5654 if (fmt[i] == 'E')
5656 int j;
5658 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5659 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5660 return 1;
5662 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5663 return 1;
5666 return 0;
5669 /* Implement REGNO_REG_CLASS. */
5671 enum reg_class
5672 aarch64_regno_regclass (unsigned regno)
5674 if (GP_REGNUM_P (regno))
5675 return GENERAL_REGS;
5677 if (regno == SP_REGNUM)
5678 return STACK_REG;
5680 if (regno == FRAME_POINTER_REGNUM
5681 || regno == ARG_POINTER_REGNUM)
5682 return POINTER_REGS;
5684 if (FP_REGNUM_P (regno))
5685 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5687 return NO_REGS;
5690 static rtx
5691 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5693 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5694 where mask is selected by alignment and size of the offset.
5695 We try to pick as large a range for the offset as possible to
5696 maximize the chance of a CSE. However, for aligned addresses
5697 we limit the range to 4k so that structures with different sized
5698 elements are likely to use the same base. We need to be careful
5699 not to split a CONST for some forms of address expression, otherwise
5700 it will generate sub-optimal code. */
5702 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5704 rtx base = XEXP (x, 0);
5705 rtx offset_rtx = XEXP (x, 1);
5706 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5708 if (GET_CODE (base) == PLUS)
5710 rtx op0 = XEXP (base, 0);
5711 rtx op1 = XEXP (base, 1);
5713 /* Force any scaling into a temp for CSE. */
5714 op0 = force_reg (Pmode, op0);
5715 op1 = force_reg (Pmode, op1);
5717 /* Let the pointer register be in op0. */
5718 if (REG_POINTER (op1))
5719 std::swap (op0, op1);
5721 /* If the pointer is virtual or frame related, then we know that
5722 virtual register instantiation or register elimination is going
5723 to apply a second constant. We want the two constants folded
5724 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5725 if (virt_or_elim_regno_p (REGNO (op0)))
5727 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5728 NULL_RTX, true, OPTAB_DIRECT);
5729 return gen_rtx_PLUS (Pmode, base, op1);
5732 /* Otherwise, in order to encourage CSE (and thence loop strength
5733 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5734 base = expand_binop (Pmode, add_optab, op0, op1,
5735 NULL_RTX, true, OPTAB_DIRECT);
5736 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5739 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5740 HOST_WIDE_INT base_offset;
5741 if (GET_MODE_SIZE (mode) > 16)
5742 base_offset = (offset + 0x400) & ~0x7f0;
5743 /* For offsets that aren't a multiple of the access size, the limit is
5744 -256...255. */
5745 else if (offset & (GET_MODE_SIZE (mode) - 1))
5747 base_offset = (offset + 0x100) & ~0x1ff;
5749 /* BLKmode typically uses LDP of X-registers. */
5750 if (mode == BLKmode)
5751 base_offset = (offset + 512) & ~0x3ff;
5753 /* Small negative offsets are supported. */
5754 else if (IN_RANGE (offset, -256, 0))
5755 base_offset = 0;
5756 else if (mode == TImode || mode == TFmode)
5757 base_offset = (offset + 0x100) & ~0x1ff;
5758 /* Use a 12-bit offset scaled by the access size. */
5759 else
5760 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5762 if (base_offset != 0)
5764 base = plus_constant (Pmode, base, base_offset);
5765 base = force_operand (base, NULL_RTX);
5766 return plus_constant (Pmode, base, offset - base_offset);
5770 return x;
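/* Rough worked example: for an SImode access to (plus (reg x0)
   (const_int 0x13004)) the split above produces x0 + 0x10000 as the new
   base, leaving a residual offset of 0x3004 that fits the scaled 12-bit
   range, so neighbouring accesses can CSE the base computation.  */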
5773 /* Return the reload icode required for a constant pool in mode. */
5774 static enum insn_code
5775 aarch64_constant_pool_reload_icode (machine_mode mode)
5777 switch (mode)
5779 case E_SFmode:
5780 return CODE_FOR_aarch64_reload_movcpsfdi;
5782 case E_DFmode:
5783 return CODE_FOR_aarch64_reload_movcpdfdi;
5785 case E_TFmode:
5786 return CODE_FOR_aarch64_reload_movcptfdi;
5788 case E_V8QImode:
5789 return CODE_FOR_aarch64_reload_movcpv8qidi;
5791 case E_V16QImode:
5792 return CODE_FOR_aarch64_reload_movcpv16qidi;
5794 case E_V4HImode:
5795 return CODE_FOR_aarch64_reload_movcpv4hidi;
5797 case E_V8HImode:
5798 return CODE_FOR_aarch64_reload_movcpv8hidi;
5800 case E_V2SImode:
5801 return CODE_FOR_aarch64_reload_movcpv2sidi;
5803 case E_V4SImode:
5804 return CODE_FOR_aarch64_reload_movcpv4sidi;
5806 case E_V2DImode:
5807 return CODE_FOR_aarch64_reload_movcpv2didi;
5809 case E_V2DFmode:
5810 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5812 default:
5813 gcc_unreachable ();
5816 gcc_unreachable ();
5818 static reg_class_t
5819 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5820 reg_class_t rclass,
5821 machine_mode mode,
5822 secondary_reload_info *sri)
5825 /* If we have to disable direct literal pool loads and stores because the
5826 function is too big, then we need a scratch register. */
5827 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5828 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5829 || targetm.vector_mode_supported_p (GET_MODE (x)))
5830 && !aarch64_pcrelative_literal_loads)
5832 sri->icode = aarch64_constant_pool_reload_icode (mode);
5833 return NO_REGS;
5836 /* Without the TARGET_SIMD instructions we cannot move a Q register
5837 to a Q register directly. We need a scratch. */
5838 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5839 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5840 && reg_class_subset_p (rclass, FP_REGS))
5842 if (mode == TFmode)
5843 sri->icode = CODE_FOR_aarch64_reload_movtf;
5844 else if (mode == TImode)
5845 sri->icode = CODE_FOR_aarch64_reload_movti;
5846 return NO_REGS;
5849 /* A TFmode or TImode memory access should be handled via FP_REGS
5850 because AArch64 has richer addressing modes for LDR/STR instructions
5851 than LDP/STP instructions. */
5852 if (TARGET_FLOAT && rclass == GENERAL_REGS
5853 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5854 return FP_REGS;
5856 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5857 return GENERAL_REGS;
5859 return NO_REGS;
5862 static bool
5863 aarch64_can_eliminate (const int from, const int to)
5865 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5866 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5868 if (frame_pointer_needed)
5870 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5871 return true;
5872 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5873 return false;
5874 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5875 && !cfun->calls_alloca)
5876 return true;
5877 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5878 return true;
5880 return false;
5882 else
5884 /* If we decided that we didn't need a leaf frame pointer but then used
5885 LR in the function, then we'll want a frame pointer after all, so
5886 prevent this elimination to ensure a frame pointer is used. */
5887 if (to == STACK_POINTER_REGNUM
5888 && flag_omit_leaf_frame_pointer
5889 && df_regs_ever_live_p (LR_REGNUM))
5890 return false;
5893 return true;
5896 HOST_WIDE_INT
5897 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5899 aarch64_layout_frame ();
5901 if (to == HARD_FRAME_POINTER_REGNUM)
5903 if (from == ARG_POINTER_REGNUM)
5904 return cfun->machine->frame.hard_fp_offset;
5906 if (from == FRAME_POINTER_REGNUM)
5907 return cfun->machine->frame.hard_fp_offset
5908 - cfun->machine->frame.locals_offset;
5911 if (to == STACK_POINTER_REGNUM)
5913 if (from == FRAME_POINTER_REGNUM)
5914 return cfun->machine->frame.frame_size
5915 - cfun->machine->frame.locals_offset;
5918 return cfun->machine->frame.frame_size;
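/* To illustrate the formulas in aarch64_initial_elimination_offset above
   with purely hypothetical numbers: if aarch64_layout_frame computed
   hard_fp_offset == 16, locals_offset == 16 and frame_size == 64, the
   eliminations would resolve to ARG_POINTER->HARD_FP = 16,
   FRAME_POINTER->HARD_FP = 0, FRAME_POINTER->SP = 48 and
   ARG_POINTER->SP = 64.  */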
5921 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5922 previous frame. */
5925 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5927 if (count != 0)
5928 return const0_rtx;
5929 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5933 static void
5934 aarch64_asm_trampoline_template (FILE *f)
5936 if (TARGET_ILP32)
5938 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5939 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5941 else
5943 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5944 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5946 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5947 assemble_aligned_integer (4, const0_rtx);
5948 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5949 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
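/* For LP64 the template emitted above is, in effect:

	ldr	x17, .+16	// IP1 <- the target function address
	ldr	x18, .+20	// static chain register <- the chain value
	br	x17
	.word	0		// pad the code out to 16 bytes
	.xword	0		// patched with the function address
	.xword	0		// patched with the static chain value

   aarch64_trampoline_init below fills in the two trailing slots.  */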
5952 static void
5953 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5955 rtx fnaddr, mem, a_tramp;
5956 const int tramp_code_sz = 16;
5958 /* Don't need to copy the trailing D-words; we fill those in below. */
5959 emit_block_move (m_tramp, assemble_trampoline_template (),
5960 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5961 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5962 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5963 if (GET_MODE (fnaddr) != ptr_mode)
5964 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5965 emit_move_insn (mem, fnaddr);
5967 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5968 emit_move_insn (mem, chain_value);
5970 /* XXX We should really define a "clear_cache" pattern and use
5971 gen_clear_cache(). */
5972 a_tramp = XEXP (m_tramp, 0);
5973 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5974 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5975 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5976 ptr_mode);
5979 static unsigned char
5980 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5982 switch (regclass)
5984 case CALLER_SAVE_REGS:
5985 case POINTER_REGS:
5986 case GENERAL_REGS:
5987 case ALL_REGS:
5988 case FP_REGS:
5989 case FP_LO_REGS:
5990 return
5991 aarch64_vector_mode_p (mode)
5992 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5993 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5994 case STACK_REG:
5995 return 1;
5997 case NO_REGS:
5998 return 0;
6000 default:
6001 break;
6003 gcc_unreachable ();
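/* For example, aarch64_class_max_nregs returns 2 for a TImode value in
   GENERAL_REGS, (16 + 8 - 1) / 8, whereas a V4SImode vector (also 16
   bytes) fits in a single 128-bit vector register: (16 + 16 - 1) / 16 == 1.  */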
6006 static reg_class_t
6007 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6009 if (regclass == POINTER_REGS)
6010 return GENERAL_REGS;
6012 if (regclass == STACK_REG)
6014 if (REG_P(x)
6015 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6016 return regclass;
6018 return NO_REGS;
6021 /* Register elimination can result in a request for
6022 SP+constant->FP_REGS. We cannot support such operations, which
6023 use SP as source and an FP_REG as destination, so reject them
6024 right away. */
6025 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6027 rtx lhs = XEXP (x, 0);
6029 /* Look through a possible SUBREG introduced by ILP32. */
6030 if (GET_CODE (lhs) == SUBREG)
6031 lhs = SUBREG_REG (lhs);
6033 gcc_assert (REG_P (lhs));
6034 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6035 POINTER_REGS));
6036 return NO_REGS;
6039 return regclass;
6042 void
6043 aarch64_asm_output_labelref (FILE* f, const char *name)
6045 asm_fprintf (f, "%U%s", name);
6048 static void
6049 aarch64_elf_asm_constructor (rtx symbol, int priority)
6051 if (priority == DEFAULT_INIT_PRIORITY)
6052 default_ctor_section_asm_out_constructor (symbol, priority);
6053 else
6055 section *s;
6056 /* While priority is known to be in the range [0, 65535], and so 18 bytes
6057 would be enough, the compiler might not know that. To avoid a
6058 -Wformat-truncation false positive, use a larger size. */
6059 char buf[23];
6060 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6061 s = get_section (buf, SECTION_WRITE, NULL);
6062 switch_to_section (s);
6063 assemble_align (POINTER_SIZE);
6064 assemble_aligned_integer (POINTER_BYTES, symbol);
6068 static void
6069 aarch64_elf_asm_destructor (rtx symbol, int priority)
6071 if (priority == DEFAULT_INIT_PRIORITY)
6072 default_dtor_section_asm_out_destructor (symbol, priority);
6073 else
6075 section *s;
6076 /* While priority is known to be in the range [0, 65535], and so 18 bytes
6077 would be enough, the compiler might not know that. To avoid a
6078 -Wformat-truncation false positive, use a larger size. */
6079 char buf[23];
6080 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6081 s = get_section (buf, SECTION_WRITE, NULL);
6082 switch_to_section (s);
6083 assemble_align (POINTER_SIZE);
6084 assemble_aligned_integer (POINTER_BYTES, symbol);
6088 const char*
6089 aarch64_output_casesi (rtx *operands)
6091 char buf[100];
6092 char label[100];
6093 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6094 int index;
6095 static const char *const patterns[4][2] =
6098 "ldrb\t%w3, [%0,%w1,uxtw]",
6099 "add\t%3, %4, %w3, sxtb #2"
6102 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6103 "add\t%3, %4, %w3, sxth #2"
6106 "ldr\t%w3, [%0,%w1,uxtw #2]",
6107 "add\t%3, %4, %w3, sxtw #2"
6109 /* We assume that DImode is only generated when not optimizing and
6110 that we don't really need 64-bit address offsets. That would
6111 imply an object file with 8GB of code in a single function! */
6113 "ldr\t%w3, [%0,%w1,uxtw #2]",
6114 "add\t%3, %4, %w3, sxtw #2"
6118 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6120 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6122 gcc_assert (index >= 0 && index <= 3);
6124 /* Need to implement table size reduction, by changing the code below. */
6125 output_asm_insn (patterns[index][0], operands);
6126 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6127 snprintf (buf, sizeof (buf),
6128 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6129 output_asm_insn (buf, operands);
6130 output_asm_insn (patterns[index][1], operands);
6131 output_asm_insn ("br\t%3", operands);
6132 assemble_label (asm_out_file, label);
6133 return "";
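/* For a QImode dispatch table the sequence emitted by aarch64_output_casesi
   above is roughly

	ldrb	w3, [x0, w1, uxtw]	// load the table entry
	adr	x4, .Lrtx<N>		// anchor label emitted after the br
	add	x3, x4, w3, sxtb #2	// entry is scaled by 4 relative to it
	br	x3
     .Lrtx<N>:

   where the register numbers stand in for operands 0, 1, 3 and 4 of the
   pattern.  */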
6137 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6138 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6139 operator. */
6142 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6144 if (shift >= 0 && shift <= 3)
6146 int size;
6147 for (size = 8; size <= 32; size *= 2)
6149 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6150 if (mask == bits << shift)
6151 return size;
6154 return 0;
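/* For instance: aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1 (a shifted UXTB mask); aarch64_uxt_size (2, 0x3fffc)
   returns 16 (UXTH); and aarch64_uxt_size (0, 0x7f) returns 0 because
   0x7f is not a full 8-, 16- or 32-bit mask.  */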
6157 /* Constant pools are per function only when PC relative
6158 literal loads are true or we are in the large memory
6159 model. */
6161 static inline bool
6162 aarch64_can_use_per_function_literal_pools_p (void)
6164 return (aarch64_pcrelative_literal_loads
6165 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6168 static bool
6169 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6171 /* FIXME: In an ideal world this would work similarly
6172 to the logic in aarch64_select_rtx_section, but that
6173 breaks bootstrap in gccgo. For now we work around
6174 this by returning false here. */
6175 return false;
6178 /* Select appropriate section for constants depending
6179 on where we place literal pools. */
6181 static section *
6182 aarch64_select_rtx_section (machine_mode mode,
6183 rtx x,
6184 unsigned HOST_WIDE_INT align)
6186 if (aarch64_can_use_per_function_literal_pools_p ())
6187 return function_section (current_function_decl);
6189 return default_elf_select_rtx_section (mode, x, align);
6192 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6193 void
6194 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6195 HOST_WIDE_INT offset)
6197 /* When using per-function literal pools, we must ensure that any code
6198 section is aligned to the minimal instruction length, lest we get
6199 errors from the assembler re "unaligned instructions". */
6200 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6201 ASM_OUTPUT_ALIGN (f, 2);
6204 /* Costs. */
6206 /* Helper function for rtx cost calculation. Strip a shift expression
6207 from X. Returns the inner operand if successful, or the original
6208 expression on failure. */
6209 static rtx
6210 aarch64_strip_shift (rtx x)
6212 rtx op = x;
6214 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6215 we can convert both to ROR during final output. */
6216 if ((GET_CODE (op) == ASHIFT
6217 || GET_CODE (op) == ASHIFTRT
6218 || GET_CODE (op) == LSHIFTRT
6219 || GET_CODE (op) == ROTATERT
6220 || GET_CODE (op) == ROTATE)
6221 && CONST_INT_P (XEXP (op, 1)))
6222 return XEXP (op, 0);
6224 if (GET_CODE (op) == MULT
6225 && CONST_INT_P (XEXP (op, 1))
6226 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6227 return XEXP (op, 0);
6229 return x;
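/* For example, aarch64_strip_shift reduces both
   (ashift (reg:DI x0) (const_int 3)) and the equivalent
   (mult (reg:DI x0) (const_int 8)) to the bare register, but leaves a
   shift by a non-constant amount untouched.  */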
6232 /* Helper function for rtx cost calculation. Strip an extend
6233 expression from X. Returns the inner operand if successful, or the
6234 original expression on failure. We deal with a number of possible
6235 canonicalization variations here. If STRIP_SHIFT is true, then
6236 we can strip off a shift also. */
6237 static rtx
6238 aarch64_strip_extend (rtx x, bool strip_shift)
6240 rtx op = x;
6242 /* Zero and sign extraction of a widened value. */
6243 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6244 && XEXP (op, 2) == const0_rtx
6245 && GET_CODE (XEXP (op, 0)) == MULT
6246 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6247 XEXP (op, 1)))
6248 return XEXP (XEXP (op, 0), 0);
6250 /* It can also be represented (for zero-extend) as an AND with an
6251 immediate. */
6252 if (GET_CODE (op) == AND
6253 && GET_CODE (XEXP (op, 0)) == MULT
6254 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6255 && CONST_INT_P (XEXP (op, 1))
6256 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6257 INTVAL (XEXP (op, 1))) != 0)
6258 return XEXP (XEXP (op, 0), 0);
6260 /* Now handle extended register, as this may also have an optional
6261 left shift by 1..4. */
6262 if (strip_shift
6263 && GET_CODE (op) == ASHIFT
6264 && CONST_INT_P (XEXP (op, 1))
6265 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6266 op = XEXP (op, 0);
6268 if (GET_CODE (op) == ZERO_EXTEND
6269 || GET_CODE (op) == SIGN_EXTEND)
6270 op = XEXP (op, 0);
6272 if (op != x)
6273 return op;
6275 return x;
6278 /* Return true iff CODE is a shift supported in combination
6279 with arithmetic instructions. */
6281 static bool
6282 aarch64_shift_p (enum rtx_code code)
6284 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6288 /* Return true iff X is a cheap shift without a sign extend. */
6290 static bool
6291 aarch64_cheap_mult_shift_p (rtx x)
6293 rtx op0, op1;
6295 op0 = XEXP (x, 0);
6296 op1 = XEXP (x, 1);
6298 if (!(aarch64_tune_params.extra_tuning_flags
6299 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6300 return false;
6302 if (GET_CODE (op0) == SIGN_EXTEND)
6303 return false;
6305 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6306 && UINTVAL (op1) <= 4)
6307 return true;
6309 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6310 return false;
6312 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6314 if (l2 > 0 && l2 <= 4)
6315 return true;
6317 return false;
6320 /* Helper function for rtx cost calculation. Calculate the cost of
6321 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6322 Return the calculated cost of the expression, recursing manually in to
6323 operands where needed. */
6325 static int
6326 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6328 rtx op0, op1;
6329 const struct cpu_cost_table *extra_cost
6330 = aarch64_tune_params.insn_extra_cost;
6331 int cost = 0;
6332 bool compound_p = (outer == PLUS || outer == MINUS);
6333 machine_mode mode = GET_MODE (x);
6335 gcc_checking_assert (code == MULT);
6337 op0 = XEXP (x, 0);
6338 op1 = XEXP (x, 1);
6340 if (VECTOR_MODE_P (mode))
6341 mode = GET_MODE_INNER (mode);
6343 /* Integer multiply/fma. */
6344 if (GET_MODE_CLASS (mode) == MODE_INT)
6346 /* The multiply will be canonicalized as a shift, cost it as such. */
6347 if (aarch64_shift_p (GET_CODE (x))
6348 || (CONST_INT_P (op1)
6349 && exact_log2 (INTVAL (op1)) > 0))
6351 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6352 || GET_CODE (op0) == SIGN_EXTEND;
6353 if (speed)
6355 if (compound_p)
6357 /* If the shift is considered cheap,
6358 then don't add any cost. */
6359 if (aarch64_cheap_mult_shift_p (x))
6361 else if (REG_P (op1))
6362 /* ARITH + shift-by-register. */
6363 cost += extra_cost->alu.arith_shift_reg;
6364 else if (is_extend)
6365 /* ARITH + extended register. We don't have a cost field
6366 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6367 cost += extra_cost->alu.extend_arith;
6368 else
6369 /* ARITH + shift-by-immediate. */
6370 cost += extra_cost->alu.arith_shift;
6372 else
6373 /* LSL (immediate). */
6374 cost += extra_cost->alu.shift;
6377 /* Strip extends as we will have costed them in the case above. */
6378 if (is_extend)
6379 op0 = aarch64_strip_extend (op0, true);
6381 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6383 return cost;
6386 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6387 compound and let the below cases handle it. After all, MNEG is a
6388 special-case alias of MSUB. */
6389 if (GET_CODE (op0) == NEG)
6391 op0 = XEXP (op0, 0);
6392 compound_p = true;
6395 /* Integer multiplies or FMAs have zero/sign extending variants. */
6396 if ((GET_CODE (op0) == ZERO_EXTEND
6397 && GET_CODE (op1) == ZERO_EXTEND)
6398 || (GET_CODE (op0) == SIGN_EXTEND
6399 && GET_CODE (op1) == SIGN_EXTEND))
6401 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6402 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6404 if (speed)
6406 if (compound_p)
6407 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6408 cost += extra_cost->mult[0].extend_add;
6409 else
6410 /* MUL/SMULL/UMULL. */
6411 cost += extra_cost->mult[0].extend;
6414 return cost;
6417 /* This is either an integer multiply or a MADD. In both cases
6418 we want to recurse and cost the operands. */
6419 cost += rtx_cost (op0, mode, MULT, 0, speed);
6420 cost += rtx_cost (op1, mode, MULT, 1, speed);
6422 if (speed)
6424 if (compound_p)
6425 /* MADD/MSUB. */
6426 cost += extra_cost->mult[mode == DImode].add;
6427 else
6428 /* MUL. */
6429 cost += extra_cost->mult[mode == DImode].simple;
6432 return cost;
6434 else
6436 if (speed)
6438 /* Floating-point FMA/FMUL can also support negations of the
6439 operands, unless the rounding mode is upward or downward, in
6440 which case FNMUL is different from FMUL with operand negation. */
6441 bool neg0 = GET_CODE (op0) == NEG;
6442 bool neg1 = GET_CODE (op1) == NEG;
6443 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6445 if (neg0)
6446 op0 = XEXP (op0, 0);
6447 if (neg1)
6448 op1 = XEXP (op1, 0);
6451 if (compound_p)
6452 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6453 cost += extra_cost->fp[mode == DFmode].fma;
6454 else
6455 /* FMUL/FNMUL. */
6456 cost += extra_cost->fp[mode == DFmode].mult;
6459 cost += rtx_cost (op0, mode, MULT, 0, speed);
6460 cost += rtx_cost (op1, mode, MULT, 1, speed);
6461 return cost;
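/* As an example of the costing above: for (plus (mult (reg) (const_int 4))
   (reg)) the multiply is treated as a shift by two folded into the
   addition, so on a speed-costed path we charge extra_cost->alu.arith_shift
   plus the cost of the shifted operand, unless the tuning flags report
   such shifts as effectively free.  */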
6465 static int
6466 aarch64_address_cost (rtx x,
6467 machine_mode mode,
6468 addr_space_t as ATTRIBUTE_UNUSED,
6469 bool speed)
6471 enum rtx_code c = GET_CODE (x);
6472 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6473 struct aarch64_address_info info;
6474 int cost = 0;
6475 info.shift = 0;
6477 if (!aarch64_classify_address (&info, x, mode, c, false))
6479 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6481 /* This is a CONST or SYMBOL ref which will be split
6482 in a different way depending on the code model in use.
6483 Cost it through the generic infrastructure. */
6484 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6485 /* Divide through by the cost of one instruction to
6486 bring it to the same units as the address costs. */
6487 cost_symbol_ref /= COSTS_N_INSNS (1);
6488 /* The cost is then the cost of preparing the address,
6489 followed by an immediate (possibly 0) offset. */
6490 return cost_symbol_ref + addr_cost->imm_offset;
6492 else
6494 /* This is most likely a jump table from a case
6495 statement. */
6496 return addr_cost->register_offset;
6500 switch (info.type)
6502 case ADDRESS_LO_SUM:
6503 case ADDRESS_SYMBOLIC:
6504 case ADDRESS_REG_IMM:
6505 cost += addr_cost->imm_offset;
6506 break;
6508 case ADDRESS_REG_WB:
6509 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6510 cost += addr_cost->pre_modify;
6511 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6512 cost += addr_cost->post_modify;
6513 else
6514 gcc_unreachable ();
6516 break;
6518 case ADDRESS_REG_REG:
6519 cost += addr_cost->register_offset;
6520 break;
6522 case ADDRESS_REG_SXTW:
6523 cost += addr_cost->register_sextend;
6524 break;
6526 case ADDRESS_REG_UXTW:
6527 cost += addr_cost->register_zextend;
6528 break;
6530 default:
6531 gcc_unreachable ();
6535 if (info.shift > 0)
6537 /* For the sake of calculating the cost of the shifted register
6538 component, we can treat same sized modes in the same way. */
6539 switch (GET_MODE_BITSIZE (mode))
6541 case 16:
6542 cost += addr_cost->addr_scale_costs.hi;
6543 break;
6545 case 32:
6546 cost += addr_cost->addr_scale_costs.si;
6547 break;
6549 case 64:
6550 cost += addr_cost->addr_scale_costs.di;
6551 break;
6553 /* We can't tell, or this is a 128-bit vector. */
6554 default:
6555 cost += addr_cost->addr_scale_costs.ti;
6556 break;
6560 return cost;
6563 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6564 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6565 to be taken. */
6568 aarch64_branch_cost (bool speed_p, bool predictable_p)
6570 /* When optimizing for speed, use the cost of unpredictable branches. */
6571 const struct cpu_branch_cost *branch_costs =
6572 aarch64_tune_params.branch_costs;
6574 if (!speed_p || predictable_p)
6575 return branch_costs->predictable;
6576 else
6577 return branch_costs->unpredictable;
6580 /* Return true if the RTX X in mode MODE is a zero or sign extract
6581 usable in an ADD or SUB (extended register) instruction. */
6582 static bool
6583 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6585 /* Catch add with a sign extract.
6586 This is add_<optab><mode>_multp2. */
6587 if (GET_CODE (x) == SIGN_EXTRACT
6588 || GET_CODE (x) == ZERO_EXTRACT)
6590 rtx op0 = XEXP (x, 0);
6591 rtx op1 = XEXP (x, 1);
6592 rtx op2 = XEXP (x, 2);
6594 if (GET_CODE (op0) == MULT
6595 && CONST_INT_P (op1)
6596 && op2 == const0_rtx
6597 && CONST_INT_P (XEXP (op0, 1))
6598 && aarch64_is_extend_from_extract (mode,
6599 XEXP (op0, 1),
6600 op1))
6602 return true;
6605 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6606 No shift. */
6607 else if (GET_CODE (x) == SIGN_EXTEND
6608 || GET_CODE (x) == ZERO_EXTEND)
6609 return REG_P (XEXP (x, 0));
6611 return false;
6614 static bool
6615 aarch64_frint_unspec_p (unsigned int u)
6617 switch (u)
6619 case UNSPEC_FRINTZ:
6620 case UNSPEC_FRINTP:
6621 case UNSPEC_FRINTM:
6622 case UNSPEC_FRINTA:
6623 case UNSPEC_FRINTN:
6624 case UNSPEC_FRINTX:
6625 case UNSPEC_FRINTI:
6626 return true;
6628 default:
6629 return false;
6633 /* Return true iff X is an rtx that will match an extr instruction
6634 i.e. as described in the *extr<mode>5_insn family of patterns.
6635 OP0 and OP1 will be set to the operands of the shifts involved
6636 on success and will be NULL_RTX otherwise. */
6638 static bool
6639 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6641 rtx op0, op1;
6642 machine_mode mode = GET_MODE (x);
6644 *res_op0 = NULL_RTX;
6645 *res_op1 = NULL_RTX;
6647 if (GET_CODE (x) != IOR)
6648 return false;
6650 op0 = XEXP (x, 0);
6651 op1 = XEXP (x, 1);
6653 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6654 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6656 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6657 if (GET_CODE (op1) == ASHIFT)
6658 std::swap (op0, op1);
6660 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6661 return false;
6663 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6664 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6666 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6667 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6669 *res_op0 = XEXP (op0, 0);
6670 *res_op1 = XEXP (op1, 0);
6671 return true;
6675 return false;
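/* For example, in DImode (ior (ashift (reg x0) (const_int 48))
   (lshiftrt (reg x1) (const_int 16))) is accepted by the function above:
   the shift amounts sum to 64, so the insn can be emitted as a single
   EXTR with the right-shift amount (16) as its immediate.  */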
6678 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6679 storing it in *COST. Result is true if the total cost of the operation
6680 has now been calculated. */
6681 static bool
6682 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6684 rtx inner;
6685 rtx comparator;
6686 enum rtx_code cmpcode;
6688 if (COMPARISON_P (op0))
6690 inner = XEXP (op0, 0);
6691 comparator = XEXP (op0, 1);
6692 cmpcode = GET_CODE (op0);
6694 else
6696 inner = op0;
6697 comparator = const0_rtx;
6698 cmpcode = NE;
6701 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6703 /* Conditional branch. */
6704 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6705 return true;
6706 else
6708 if (cmpcode == NE || cmpcode == EQ)
6710 if (comparator == const0_rtx)
6712 /* TBZ/TBNZ/CBZ/CBNZ. */
6713 if (GET_CODE (inner) == ZERO_EXTRACT)
6714 /* TBZ/TBNZ. */
6715 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6716 ZERO_EXTRACT, 0, speed);
6717 else
6718 /* CBZ/CBNZ. */
6719 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6721 return true;
6724 else if (cmpcode == LT || cmpcode == GE)
6726 /* TBZ/TBNZ. */
6727 if (comparator == const0_rtx)
6728 return true;
6732 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6734 /* CCMP. */
6735 if (GET_CODE (op1) == COMPARE)
6737 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6738 if (XEXP (op1, 1) == const0_rtx)
6739 *cost += 1;
6740 if (speed)
6742 machine_mode mode = GET_MODE (XEXP (op1, 0));
6743 const struct cpu_cost_table *extra_cost
6744 = aarch64_tune_params.insn_extra_cost;
6746 if (GET_MODE_CLASS (mode) == MODE_INT)
6747 *cost += extra_cost->alu.arith;
6748 else
6749 *cost += extra_cost->fp[mode == DFmode].compare;
6751 return true;
6754 /* It's a conditional operation based on the status flags,
6755 so it must be some flavor of CSEL. */
6757 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6758 if (GET_CODE (op1) == NEG
6759 || GET_CODE (op1) == NOT
6760 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6761 op1 = XEXP (op1, 0);
6762 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6764 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6765 op1 = XEXP (op1, 0);
6766 op2 = XEXP (op2, 0);
6769 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6770 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6771 return true;
6774 /* We don't know what this is, cost all operands. */
6775 return false;
6778 /* Check whether X is a bitfield operation of the form shift + extend that
6779 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6780 operand to which the bitfield operation is applied. Otherwise return
6781 NULL_RTX. */
6783 static rtx
6784 aarch64_extend_bitfield_pattern_p (rtx x)
6786 rtx_code outer_code = GET_CODE (x);
6787 machine_mode outer_mode = GET_MODE (x);
6789 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6790 && outer_mode != SImode && outer_mode != DImode)
6791 return NULL_RTX;
6793 rtx inner = XEXP (x, 0);
6794 rtx_code inner_code = GET_CODE (inner);
6795 machine_mode inner_mode = GET_MODE (inner);
6796 rtx op = NULL_RTX;
6798 switch (inner_code)
6800 case ASHIFT:
6801 if (CONST_INT_P (XEXP (inner, 1))
6802 && (inner_mode == QImode || inner_mode == HImode))
6803 op = XEXP (inner, 0);
6804 break;
6805 case LSHIFTRT:
6806 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6807 && (inner_mode == QImode || inner_mode == HImode))
6808 op = XEXP (inner, 0);
6809 break;
6810 case ASHIFTRT:
6811 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6812 && (inner_mode == QImode || inner_mode == HImode))
6813 op = XEXP (inner, 0);
6814 break;
6815 default:
6816 break;
6819 return op;
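/* For example, the function above recognises
   (zero_extend:SI (lshiftrt:HI (reg:HI x0) (const_int 3))) and returns the
   inner register; the combination maps onto a single UBFX extracting
   bits 3..15.  */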
6822 /* Return true if the mask and a shift amount from an RTX of the form
6823 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6824 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6826 bool
6827 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6829 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6830 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6831 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6832 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
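/* For instance, in SImode a mask of 0xff0 with a shift amount of 4
   satisfies all the conditions checked by
   aarch64_mask_and_shift_for_ubfiz_p: 0xff0 >> 4 == 0xff, 0xff + 1 is a
   power of two, and the low four bits of the mask are clear, so
   (x << 4) & 0xff0 can be emitted as UBFIZ w0, w0, #4, #8.  */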
6835 /* Calculate the cost of calculating X, storing it in *COST. Result
6836 is true if the total cost of the operation has now been calculated. */
6837 static bool
6838 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6839 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6841 rtx op0, op1, op2;
6842 const struct cpu_cost_table *extra_cost
6843 = aarch64_tune_params.insn_extra_cost;
6844 int code = GET_CODE (x);
6846 /* By default, assume that everything has equivalent cost to the
6847 cheapest instruction. Any additional costs are applied as a delta
6848 above this default. */
6849 *cost = COSTS_N_INSNS (1);
6851 switch (code)
6853 case SET:
6854 /* The cost depends entirely on the operands to SET. */
6855 *cost = 0;
6856 op0 = SET_DEST (x);
6857 op1 = SET_SRC (x);
6859 switch (GET_CODE (op0))
6861 case MEM:
6862 if (speed)
6864 rtx address = XEXP (op0, 0);
6865 if (VECTOR_MODE_P (mode))
6866 *cost += extra_cost->ldst.storev;
6867 else if (GET_MODE_CLASS (mode) == MODE_INT)
6868 *cost += extra_cost->ldst.store;
6869 else if (mode == SFmode)
6870 *cost += extra_cost->ldst.storef;
6871 else if (mode == DFmode)
6872 *cost += extra_cost->ldst.stored;
6874 *cost +=
6875 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6876 0, speed));
6879 *cost += rtx_cost (op1, mode, SET, 1, speed);
6880 return true;
6882 case SUBREG:
6883 if (! REG_P (SUBREG_REG (op0)))
6884 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6886 /* Fall through. */
6887 case REG:
6888 /* The cost is one per vector-register copied. */
6889 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6891 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6892 / GET_MODE_SIZE (V4SImode);
6893 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6895 /* const0_rtx is in general free, but we will use an
6896 instruction to set a register to 0. */
6897 else if (REG_P (op1) || op1 == const0_rtx)
6899 /* The cost is 1 per register copied. */
6900 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6901 / UNITS_PER_WORD;
6902 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6904 else
6905 /* Cost is just the cost of the RHS of the set. */
6906 *cost += rtx_cost (op1, mode, SET, 1, speed);
6907 return true;
6909 case ZERO_EXTRACT:
6910 case SIGN_EXTRACT:
6911 /* Bit-field insertion. Strip any redundant widening of
6912 the RHS to meet the width of the target. */
6913 if (GET_CODE (op1) == SUBREG)
6914 op1 = SUBREG_REG (op1);
6915 if ((GET_CODE (op1) == ZERO_EXTEND
6916 || GET_CODE (op1) == SIGN_EXTEND)
6917 && CONST_INT_P (XEXP (op0, 1))
6918 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6919 >= INTVAL (XEXP (op0, 1))))
6920 op1 = XEXP (op1, 0);
6922 if (CONST_INT_P (op1))
6924 /* MOV immediate is assumed to always be cheap. */
6925 *cost = COSTS_N_INSNS (1);
6927 else
6929 /* BFM. */
6930 if (speed)
6931 *cost += extra_cost->alu.bfi;
6932 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6935 return true;
6937 default:
6938 /* We can't make sense of this, assume default cost. */
6939 *cost = COSTS_N_INSNS (1);
6940 return false;
6942 return false;
6944 case CONST_INT:
6945 /* If an instruction can incorporate a constant within the
6946 instruction, the instruction's expression avoids calling
6947 rtx_cost() on the constant. If rtx_cost() is called on a
6948 constant, then it is usually because the constant must be
6949 moved into a register by one or more instructions.
6951 The exception is constant 0, which can be expressed
6952 as XZR/WZR and is therefore free. The exception to this is
6953 if we have (set (reg) (const0_rtx)) in which case we must cost
6954 the move. However, we can catch that when we cost the SET, so
6955 we don't need to consider that here. */
6956 if (x == const0_rtx)
6957 *cost = 0;
6958 else
6960 /* To an approximation, building any other constant is
6961 proportionally expensive to the number of instructions
6962 required to build that constant. This is true whether we
6963 are compiling for SPEED or otherwise. */
6964 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6965 (NULL_RTX, x, false, mode));
6967 return true;
6969 case CONST_DOUBLE:
6971 /* First determine number of instructions to do the move
6972 as an integer constant. */
6973 if (!aarch64_float_const_representable_p (x)
6974 && !aarch64_can_const_movi_rtx_p (x, mode)
6975 && aarch64_float_const_rtx_p (x))
6977 unsigned HOST_WIDE_INT ival;
6978 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6979 gcc_assert (succeed);
6981 machine_mode imode = mode == HFmode ? SImode
6982 : int_mode_for_mode (mode);
6983 int ncost = aarch64_internal_mov_immediate
6984 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6985 *cost += COSTS_N_INSNS (ncost);
6986 return true;
6989 if (speed)
6991 /* mov[df,sf]_aarch64. */
6992 if (aarch64_float_const_representable_p (x))
6993 /* FMOV (scalar immediate). */
6994 *cost += extra_cost->fp[mode == DFmode].fpconst;
6995 else if (!aarch64_float_const_zero_rtx_p (x))
6997 /* This will be a load from memory. */
6998 if (mode == DFmode)
6999 *cost += extra_cost->ldst.loadd;
7000 else
7001 *cost += extra_cost->ldst.loadf;
7003 else
7004 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7005 or MOV v0.s[0], wzr - neither of which is modeled by the
7006 cost tables. Just use the default cost. */
7011 return true;
7013 case MEM:
7014 if (speed)
7016 /* For loads we want the base cost of a load, plus an
7017 approximation for the additional cost of the addressing
7018 mode. */
7019 rtx address = XEXP (x, 0);
7020 if (VECTOR_MODE_P (mode))
7021 *cost += extra_cost->ldst.loadv;
7022 else if (GET_MODE_CLASS (mode) == MODE_INT)
7023 *cost += extra_cost->ldst.load;
7024 else if (mode == SFmode)
7025 *cost += extra_cost->ldst.loadf;
7026 else if (mode == DFmode)
7027 *cost += extra_cost->ldst.loadd;
7029 *cost +=
7030 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7031 0, speed));
7034 return true;
7036 case NEG:
7037 op0 = XEXP (x, 0);
7039 if (VECTOR_MODE_P (mode))
7041 if (speed)
7043 /* FNEG. */
7044 *cost += extra_cost->vect.alu;
7046 return false;
7049 if (GET_MODE_CLASS (mode) == MODE_INT)
7051 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7052 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7054 /* CSETM. */
7055 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7056 return true;
7059 /* Cost this as SUB wzr, X. */
7060 op0 = CONST0_RTX (mode);
7061 op1 = XEXP (x, 0);
7062 goto cost_minus;
7065 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7067 /* Support (neg(fma...)) as a single instruction only if
7068 sign of zeros is unimportant. This matches the decision
7069 making in aarch64.md. */
7070 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7072 /* FNMADD. */
7073 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7074 return true;
7076 if (GET_CODE (op0) == MULT)
7078 /* FNMUL. */
7079 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7080 return true;
7082 if (speed)
7083 /* FNEG. */
7084 *cost += extra_cost->fp[mode == DFmode].neg;
7085 return false;
7088 return false;
7090 case CLRSB:
7091 case CLZ:
7092 if (speed)
7094 if (VECTOR_MODE_P (mode))
7095 *cost += extra_cost->vect.alu;
7096 else
7097 *cost += extra_cost->alu.clz;
7100 return false;
7102 case COMPARE:
7103 op0 = XEXP (x, 0);
7104 op1 = XEXP (x, 1);
7106 if (op1 == const0_rtx
7107 && GET_CODE (op0) == AND)
7109 x = op0;
7110 mode = GET_MODE (op0);
7111 goto cost_logic;
7114 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7116 /* TODO: A write to the CC flags possibly costs extra, this
7117 needs encoding in the cost tables. */
7119 mode = GET_MODE (op0);
7120 /* ANDS. */
7121 if (GET_CODE (op0) == AND)
7123 x = op0;
7124 goto cost_logic;
7127 if (GET_CODE (op0) == PLUS)
7129 /* ADDS (and CMN alias). */
7130 x = op0;
7131 goto cost_plus;
7134 if (GET_CODE (op0) == MINUS)
7136 /* SUBS. */
7137 x = op0;
7138 goto cost_minus;
7141 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7142 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7143 && CONST_INT_P (XEXP (op0, 2)))
7145 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7146 Handle it here directly rather than going to cost_logic
7147 since we know the immediate generated for the TST is valid
7148 so we can avoid creating an intermediate rtx for it only
7149 for costing purposes. */
7150 if (speed)
7151 *cost += extra_cost->alu.logical;
7153 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7154 ZERO_EXTRACT, 0, speed);
7155 return true;
7158 if (GET_CODE (op1) == NEG)
7160 /* CMN. */
7161 if (speed)
7162 *cost += extra_cost->alu.arith;
7164 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7165 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7166 return true;
7169 /* CMP.
7171 Compare can freely swap the order of operands, and
7172 canonicalization puts the more complex operation first.
7173 But the integer MINUS logic expects the shift/extend
7174 operation in op1. */
7175 if (! (REG_P (op0)
7176 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7178 op0 = XEXP (x, 1);
7179 op1 = XEXP (x, 0);
7181 goto cost_minus;
7184 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7186 /* FCMP. */
7187 if (speed)
7188 *cost += extra_cost->fp[mode == DFmode].compare;
7190 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7192 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7193 /* FCMP supports constant 0.0 for no extra cost. */
7194 return true;
7196 return false;
7199 if (VECTOR_MODE_P (mode))
7201 /* Vector compare. */
7202 if (speed)
7203 *cost += extra_cost->vect.alu;
7205 if (aarch64_float_const_zero_rtx_p (op1))
7207 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7208 cost. */
7209 return true;
7211 return false;
7213 return false;
7215 case MINUS:
7217 op0 = XEXP (x, 0);
7218 op1 = XEXP (x, 1);
7220 cost_minus:
7221 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7223 /* Detect valid immediates. */
7224 if ((GET_MODE_CLASS (mode) == MODE_INT
7225 || (GET_MODE_CLASS (mode) == MODE_CC
7226 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7227 && CONST_INT_P (op1)
7228 && aarch64_uimm12_shift (INTVAL (op1)))
7230 if (speed)
7231 /* SUB(S) (immediate). */
7232 *cost += extra_cost->alu.arith;
7233 return true;
7236 /* Look for SUB (extended register). */
7237 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7239 if (speed)
7240 *cost += extra_cost->alu.extend_arith;
7242 op1 = aarch64_strip_extend (op1, true);
7243 *cost += rtx_cost (op1, VOIDmode,
7244 (enum rtx_code) GET_CODE (op1), 0, speed);
7245 return true;
7248 rtx new_op1 = aarch64_strip_extend (op1, false);
7250 /* Cost this as an FMA-alike operation. */
7251 if ((GET_CODE (new_op1) == MULT
7252 || aarch64_shift_p (GET_CODE (new_op1)))
7253 && code != COMPARE)
7255 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7256 (enum rtx_code) code,
7257 speed);
7258 return true;
7261 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7263 if (speed)
7265 if (VECTOR_MODE_P (mode))
7267 /* Vector SUB. */
7268 *cost += extra_cost->vect.alu;
7270 else if (GET_MODE_CLASS (mode) == MODE_INT)
7272 /* SUB(S). */
7273 *cost += extra_cost->alu.arith;
7275 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7277 /* FSUB. */
7278 *cost += extra_cost->fp[mode == DFmode].addsub;
7281 return true;
7284 case PLUS:
7286 rtx new_op0;
7288 op0 = XEXP (x, 0);
7289 op1 = XEXP (x, 1);
7291 cost_plus:
7292 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7293 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7295 /* CSINC. */
7296 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7297 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7298 return true;
7301 if (GET_MODE_CLASS (mode) == MODE_INT
7302 && CONST_INT_P (op1)
7303 && aarch64_uimm12_shift (INTVAL (op1)))
7305 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7307 if (speed)
7308 /* ADD (immediate). */
7309 *cost += extra_cost->alu.arith;
7310 return true;
7313 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7315 /* Look for ADD (extended register). */
7316 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7318 if (speed)
7319 *cost += extra_cost->alu.extend_arith;
7321 op0 = aarch64_strip_extend (op0, true);
7322 *cost += rtx_cost (op0, VOIDmode,
7323 (enum rtx_code) GET_CODE (op0), 0, speed);
7324 return true;
7327 /* Strip any extend, leave shifts behind as we will
7328 cost them through mult_cost. */
7329 new_op0 = aarch64_strip_extend (op0, false);
7331 if (GET_CODE (new_op0) == MULT
7332 || aarch64_shift_p (GET_CODE (new_op0)))
7334 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7335 speed);
7336 return true;
7339 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7341 if (speed)
7343 if (VECTOR_MODE_P (mode))
7345 /* Vector ADD. */
7346 *cost += extra_cost->vect.alu;
7348 else if (GET_MODE_CLASS (mode) == MODE_INT)
7350 /* ADD. */
7351 *cost += extra_cost->alu.arith;
7353 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7355 /* FADD. */
7356 *cost += extra_cost->fp[mode == DFmode].addsub;
7359 return true;
7362 case BSWAP:
7363 *cost = COSTS_N_INSNS (1);
7365 if (speed)
7367 if (VECTOR_MODE_P (mode))
7368 *cost += extra_cost->vect.alu;
7369 else
7370 *cost += extra_cost->alu.rev;
7372 return false;
7374 case IOR:
7375 if (aarch_rev16_p (x))
7377 *cost = COSTS_N_INSNS (1);
7379 if (speed)
7381 if (VECTOR_MODE_P (mode))
7382 *cost += extra_cost->vect.alu;
7383 else
7384 *cost += extra_cost->alu.rev;
7386 return true;
7389 if (aarch64_extr_rtx_p (x, &op0, &op1))
7391 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7392 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7393 if (speed)
7394 *cost += extra_cost->alu.shift;
7396 return true;
7398 /* Fall through. */
7399 case XOR:
7400 case AND:
7401 cost_logic:
7402 op0 = XEXP (x, 0);
7403 op1 = XEXP (x, 1);
7405 if (VECTOR_MODE_P (mode))
7407 if (speed)
7408 *cost += extra_cost->vect.alu;
7409 return true;
7412 if (code == AND
7413 && GET_CODE (op0) == MULT
7414 && CONST_INT_P (XEXP (op0, 1))
7415 && CONST_INT_P (op1)
7416 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7417 INTVAL (op1)) != 0)
7419 /* This is a UBFM/SBFM. */
7420 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7421 if (speed)
7422 *cost += extra_cost->alu.bfx;
7423 return true;
7426 if (GET_MODE_CLASS (mode) == MODE_INT)
7428 if (CONST_INT_P (op1))
7430 /* We have a mask + shift version of a UBFIZ
7431 i.e. the *andim_ashift<mode>_bfiz pattern. */
7432 if (GET_CODE (op0) == ASHIFT
7433 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7434 XEXP (op0, 1)))
7436 *cost += rtx_cost (XEXP (op0, 0), mode,
7437 (enum rtx_code) code, 0, speed);
7438 if (speed)
7439 *cost += extra_cost->alu.bfx;
7441 return true;
7443 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7445 /* We possibly get the immediate for free; this is not
7446 modelled. */
7447 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7448 if (speed)
7449 *cost += extra_cost->alu.logical;
7451 return true;
7454 else
7456 rtx new_op0 = op0;
7458 /* Handle ORN, EON, or BIC. */
7459 if (GET_CODE (op0) == NOT)
7460 op0 = XEXP (op0, 0);
7462 new_op0 = aarch64_strip_shift (op0);
7464 /* If we had a shift on op0 then this is a logical-shift-
7465 by-register/immediate operation. Otherwise, this is just
7466 a logical operation. */
7467 if (speed)
7469 if (new_op0 != op0)
7471 /* Shift by immediate. */
7472 if (CONST_INT_P (XEXP (op0, 1)))
7473 *cost += extra_cost->alu.log_shift;
7474 else
7475 *cost += extra_cost->alu.log_shift_reg;
7477 else
7478 *cost += extra_cost->alu.logical;
7481 /* In both cases we want to cost both operands. */
7482 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7483 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7485 return true;
7488 return false;
7490 case NOT:
7491 x = XEXP (x, 0);
7492 op0 = aarch64_strip_shift (x);
7494 if (VECTOR_MODE_P (mode))
7496 /* Vector NOT. */
7497 *cost += extra_cost->vect.alu;
7498 return false;
7501 /* MVN-shifted-reg. */
7502 if (op0 != x)
7504 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7506 if (speed)
7507 *cost += extra_cost->alu.log_shift;
7509 return true;
7511 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7512 Handle the second form here taking care that 'a' in the above can
7513 be a shift. */
7514 else if (GET_CODE (op0) == XOR)
7516 rtx newop0 = XEXP (op0, 0);
7517 rtx newop1 = XEXP (op0, 1);
7518 rtx op0_stripped = aarch64_strip_shift (newop0);
7520 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7521 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7523 if (speed)
7525 if (op0_stripped != newop0)
7526 *cost += extra_cost->alu.log_shift;
7527 else
7528 *cost += extra_cost->alu.logical;
7531 return true;
7533 /* MVN. */
7534 if (speed)
7535 *cost += extra_cost->alu.logical;
7537 return false;
7539 case ZERO_EXTEND:
7541 op0 = XEXP (x, 0);
7542 /* If a value is written in SI mode, then zero extended to DI
7543 mode, the operation will in general be free as a write to
7544 a 'w' register implicitly zeroes the upper bits of an 'x'
7545 register. However, if this is
7547 (set (reg) (zero_extend (reg)))
7549 we must cost the explicit register move. */
7550 if (mode == DImode
7551 && GET_MODE (op0) == SImode
7552 && outer == SET)
7554 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7556 /* If OP_COST is non-zero, then the cost of the zero extend
7557 is effectively the cost of the inner operation. Otherwise
7558 we have a MOV instruction and we take the cost from the MOV
7559 itself. This is true independently of whether we are
7560 optimizing for space or time. */
7561 if (op_cost)
7562 *cost = op_cost;
7564 return true;
7566 else if (MEM_P (op0))
7568 /* All loads can zero extend to any size for free. */
7569 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7570 return true;
7573 op0 = aarch64_extend_bitfield_pattern_p (x);
7574 if (op0)
7576 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7577 if (speed)
7578 *cost += extra_cost->alu.bfx;
7579 return true;
7582 if (speed)
7584 if (VECTOR_MODE_P (mode))
7586 /* UMOV. */
7587 *cost += extra_cost->vect.alu;
7589 else
7591 /* We generate an AND instead of UXTB/UXTH. */
7592 *cost += extra_cost->alu.logical;
7595 return false;
7597 case SIGN_EXTEND:
7598 if (MEM_P (XEXP (x, 0)))
7600 /* LDRSH. */
7601 if (speed)
7603 rtx address = XEXP (XEXP (x, 0), 0);
7604 *cost += extra_cost->ldst.load_sign_extend;
7606 *cost +=
7607 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7608 0, speed));
7610 return true;
7613 op0 = aarch64_extend_bitfield_pattern_p (x);
7614 if (op0)
7616 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7617 if (speed)
7618 *cost += extra_cost->alu.bfx;
7619 return true;
7622 if (speed)
7624 if (VECTOR_MODE_P (mode))
7625 *cost += extra_cost->vect.alu;
7626 else
7627 *cost += extra_cost->alu.extend;
7629 return false;
7631 case ASHIFT:
7632 op0 = XEXP (x, 0);
7633 op1 = XEXP (x, 1);
7635 if (CONST_INT_P (op1))
7637 if (speed)
7639 if (VECTOR_MODE_P (mode))
7641 /* Vector shift (immediate). */
7642 *cost += extra_cost->vect.alu;
7644 else
7646 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7647 aliases. */
7648 *cost += extra_cost->alu.shift;
7652 /* We can incorporate zero/sign extend for free. */
7653 if (GET_CODE (op0) == ZERO_EXTEND
7654 || GET_CODE (op0) == SIGN_EXTEND)
7655 op0 = XEXP (op0, 0);
7657 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7658 return true;
7660 else
7662 if (VECTOR_MODE_P (mode))
7664 if (speed)
7665 /* Vector shift (register). */
7666 *cost += extra_cost->vect.alu;
7668 else
7670 if (speed)
7671 /* LSLV. */
7672 *cost += extra_cost->alu.shift_reg;
7674 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7675 && CONST_INT_P (XEXP (op1, 1))
7676 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7678 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7679 /* We already demanded XEXP (op1, 0) to be REG_P, so
7680 don't recurse into it. */
7681 return true;
7684 return false; /* All arguments need to be in registers. */
7687 case ROTATE:
7688 case ROTATERT:
7689 case LSHIFTRT:
7690 case ASHIFTRT:
7691 op0 = XEXP (x, 0);
7692 op1 = XEXP (x, 1);
7694 if (CONST_INT_P (op1))
7696 /* ASR (immediate) and friends. */
7697 if (speed)
7699 if (VECTOR_MODE_P (mode))
7700 *cost += extra_cost->vect.alu;
7701 else
7702 *cost += extra_cost->alu.shift;
7705 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7706 return true;
7708 else
7710 if (VECTOR_MODE_P (mode))
7712 if (speed)
7713 /* Vector shift (register). */
7714 *cost += extra_cost->vect.alu;
7716 else
7718 if (speed)
7719 /* ASR (register) and friends. */
7720 *cost += extra_cost->alu.shift_reg;
7722 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7723 && CONST_INT_P (XEXP (op1, 1))
7724 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7726 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7727 /* We already demanded XEXP (op1, 0) to be REG_P, so
7728 don't recurse into it. */
7729 return true;
7732 return false; /* All arguments need to be in registers. */
7735 case SYMBOL_REF:
7737 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7738 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7740 /* LDR. */
7741 if (speed)
7742 *cost += extra_cost->ldst.load;
7744 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7745 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7747 /* ADRP, followed by ADD. */
7748 *cost += COSTS_N_INSNS (1);
7749 if (speed)
7750 *cost += 2 * extra_cost->alu.arith;
7752 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7753 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7755 /* ADR. */
7756 if (speed)
7757 *cost += extra_cost->alu.arith;
7760 if (flag_pic)
7762 /* One extra load instruction, after accessing the GOT. */
7763 *cost += COSTS_N_INSNS (1);
7764 if (speed)
7765 *cost += extra_cost->ldst.load;
7767 return true;
7769 case HIGH:
7770 case LO_SUM:
7771 /* ADRP/ADD (immediate). */
7772 if (speed)
7773 *cost += extra_cost->alu.arith;
7774 return true;
7776 case ZERO_EXTRACT:
7777 case SIGN_EXTRACT:
7778 /* UBFX/SBFX. */
7779 if (speed)
7781 if (VECTOR_MODE_P (mode))
7782 *cost += extra_cost->vect.alu;
7783 else
7784 *cost += extra_cost->alu.bfx;
7787 /* We can trust that the immediates used will be correct (there
7788 are no by-register forms), so we need only cost op0. */
7789 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7790 return true;
7792 case MULT:
7793 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7794 /* aarch64_rtx_mult_cost always handles recursion to its
7795 operands. */
7796 return true;
7798 case MOD:
7799 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7800 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7801 an unconditional negate. This case should only ever be reached through
7802 the set_smod_pow2_cheap check in expmed.c. */
7803 if (CONST_INT_P (XEXP (x, 1))
7804 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7805 && (mode == SImode || mode == DImode))
7807 /* We expand to 4 instructions. Reset the baseline. */
7808 *cost = COSTS_N_INSNS (4);
7810 if (speed)
7811 *cost += 2 * extra_cost->alu.logical
7812 + 2 * extra_cost->alu.arith;
7814 return true;
7817 /* Fall-through. */
7818 case UMOD:
7819 if (speed)
7821 /* Slightly prefer UMOD over SMOD. */
7822 if (VECTOR_MODE_P (mode))
7823 *cost += extra_cost->vect.alu;
7824 else if (GET_MODE_CLASS (mode) == MODE_INT)
7825 *cost += (extra_cost->mult[mode == DImode].add
7826 + extra_cost->mult[mode == DImode].idiv
7827 + (code == MOD ? 1 : 0));
7829 return false; /* All arguments need to be in registers. */
7831 case DIV:
7832 case UDIV:
7833 case SQRT:
7834 if (speed)
7836 if (VECTOR_MODE_P (mode))
7837 *cost += extra_cost->vect.alu;
7838 else if (GET_MODE_CLASS (mode) == MODE_INT)
7839 /* There is no integer SQRT, so only DIV and UDIV can get
7840 here. */
7841 *cost += (extra_cost->mult[mode == DImode].idiv
7842 /* Slightly prefer UDIV over SDIV. */
7843 + (code == DIV ? 1 : 0));
7844 else
7845 *cost += extra_cost->fp[mode == DFmode].div;
7847 return false; /* All arguments need to be in registers. */
7849 case IF_THEN_ELSE:
7850 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7851 XEXP (x, 2), cost, speed);
7853 case EQ:
7854 case NE:
7855 case GT:
7856 case GTU:
7857 case LT:
7858 case LTU:
7859 case GE:
7860 case GEU:
7861 case LE:
7862 case LEU:
7864 return false; /* All arguments must be in registers. */
7866 case FMA:
7867 op0 = XEXP (x, 0);
7868 op1 = XEXP (x, 1);
7869 op2 = XEXP (x, 2);
7871 if (speed)
7873 if (VECTOR_MODE_P (mode))
7874 *cost += extra_cost->vect.alu;
7875 else
7876 *cost += extra_cost->fp[mode == DFmode].fma;
7879 /* FMSUB, FNMADD, and FNMSUB are free. */
7880 if (GET_CODE (op0) == NEG)
7881 op0 = XEXP (op0, 0);
7883 if (GET_CODE (op2) == NEG)
7884 op2 = XEXP (op2, 0);
7886 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7887 and the by-element operand as operand 0. */
7888 if (GET_CODE (op1) == NEG)
7889 op1 = XEXP (op1, 0);
7891 /* Catch vector-by-element operations. The by-element operand can
7892 either be (vec_duplicate (vec_select (x))) or just
7893 (vec_select (x)), depending on whether we are multiplying by
7894 a vector or a scalar.
6896 Canonicalization is not very good in these cases: FMA4 will put the
7897 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7898 if (GET_CODE (op0) == VEC_DUPLICATE)
7899 op0 = XEXP (op0, 0);
7900 else if (GET_CODE (op1) == VEC_DUPLICATE)
7901 op1 = XEXP (op1, 0);
7903 if (GET_CODE (op0) == VEC_SELECT)
7904 op0 = XEXP (op0, 0);
7905 else if (GET_CODE (op1) == VEC_SELECT)
7906 op1 = XEXP (op1, 0);
7908 /* If the remaining parameters are not registers,
7909 get the cost to put them into registers. */
7910 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7911 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7912 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7913 return true;
7915 case FLOAT:
7916 case UNSIGNED_FLOAT:
7917 if (speed)
7918 *cost += extra_cost->fp[mode == DFmode].fromint;
7919 return false;
7921 case FLOAT_EXTEND:
7922 if (speed)
7924 if (VECTOR_MODE_P (mode))
7926 /* Vector float extension. */
7927 *cost += extra_cost->vect.alu;
7929 else
7930 *cost += extra_cost->fp[mode == DFmode].widen;
7932 return false;
7934 case FLOAT_TRUNCATE:
7935 if (speed)
7937 if (VECTOR_MODE_P (mode))
7939 /* Vector conversion. */
7940 *cost += extra_cost->vect.alu;
7942 else
7943 *cost += extra_cost->fp[mode == DFmode].narrow;
7945 return false;
7947 case FIX:
7948 case UNSIGNED_FIX:
7949 x = XEXP (x, 0);
7950 /* Strip the rounding part. They will all be implemented
7951 by the fcvt* family of instructions anyway. */
7952 if (GET_CODE (x) == UNSPEC)
7954 unsigned int uns_code = XINT (x, 1);
7956 if (uns_code == UNSPEC_FRINTA
7957 || uns_code == UNSPEC_FRINTM
7958 || uns_code == UNSPEC_FRINTN
7959 || uns_code == UNSPEC_FRINTP
7960 || uns_code == UNSPEC_FRINTZ)
7961 x = XVECEXP (x, 0, 0);
7964 if (speed)
7966 if (VECTOR_MODE_P (mode))
7967 *cost += extra_cost->vect.alu;
7968 else
7969 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7972 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7973 fixed-point fcvt. */
7974 if (GET_CODE (x) == MULT
7975 && ((VECTOR_MODE_P (mode)
7976 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7977 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7979 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7980 0, speed);
7981 return true;
7984 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7985 return true;
7987 case ABS:
7988 if (VECTOR_MODE_P (mode))
7990 /* ABS (vector). */
7991 if (speed)
7992 *cost += extra_cost->vect.alu;
7994 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7996 op0 = XEXP (x, 0);
7998 /* FABD, which is analogous to FADD. */
7999 if (GET_CODE (op0) == MINUS)
8001 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8002 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8003 if (speed)
8004 *cost += extra_cost->fp[mode == DFmode].addsub;
8006 return true;
8008 /* Simple FABS is analogous to FNEG. */
8009 if (speed)
8010 *cost += extra_cost->fp[mode == DFmode].neg;
8012 else
8014 /* Integer ABS will either be split to
8015 two arithmetic instructions, or will be an ABS
8016 (scalar), which we don't model. */
8017 *cost = COSTS_N_INSNS (2);
8018 if (speed)
8019 *cost += 2 * extra_cost->alu.arith;
8021 return false;
8023 case SMAX:
8024 case SMIN:
8025 if (speed)
8027 if (VECTOR_MODE_P (mode))
8028 *cost += extra_cost->vect.alu;
8029 else
8031 /* FMAXNM/FMINNM/FMAX/FMIN.
8032 TODO: This may not be accurate for all implementations, but
8033 we do not model this in the cost tables. */
8034 *cost += extra_cost->fp[mode == DFmode].addsub;
8037 return false;
8039 case UNSPEC:
8040 /* The floating point round to integer frint* instructions. */
8041 if (aarch64_frint_unspec_p (XINT (x, 1)))
8043 if (speed)
8044 *cost += extra_cost->fp[mode == DFmode].roundint;
8046 return false;
8049 if (XINT (x, 1) == UNSPEC_RBIT)
8051 if (speed)
8052 *cost += extra_cost->alu.rev;
8054 return false;
8056 break;
8058 case TRUNCATE:
8060 /* Decompose <su>muldi3_highpart. */
8061 if (/* (truncate:DI */
8062 mode == DImode
8063 /* (lshiftrt:TI */
8064 && GET_MODE (XEXP (x, 0)) == TImode
8065 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8066 /* (mult:TI */
8067 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8068 /* (ANY_EXTEND:TI (reg:DI))
8069 (ANY_EXTEND:TI (reg:DI))) */
8070 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8071 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8072 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8073 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8074 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8075 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8076 /* (const_int 64) */
8077 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8078 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8080 /* UMULH/SMULH. */
8081 if (speed)
8082 *cost += extra_cost->mult[mode == DImode].extend;
8083 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8084 mode, MULT, 0, speed);
8085 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8086 mode, MULT, 1, speed);
8087 return true;
8090 /* Fall through. */
8091 default:
8092 break;
8095 if (dump_file
8096 && flag_aarch64_verbose_cost)
8097 fprintf (dump_file,
8098 "\nFailed to cost RTX. Assuming default cost.\n");
8100 return true;
8103 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
8104 calculated for X. This cost is stored in *COST. Returns true
8105 if the total cost of X was calculated. */
8106 static bool
8107 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8108 int param, int *cost, bool speed)
8110 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8112 if (dump_file
8113 && flag_aarch64_verbose_cost)
8115 print_rtl_single (dump_file, x);
8116 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8117 speed ? "Hot" : "Cold",
8118 *cost, result ? "final" : "partial");
8121 return result;
8124 static int
8125 aarch64_register_move_cost (machine_mode mode,
8126 reg_class_t from_i, reg_class_t to_i)
8128 enum reg_class from = (enum reg_class) from_i;
8129 enum reg_class to = (enum reg_class) to_i;
8130 const struct cpu_regmove_cost *regmove_cost
8131 = aarch64_tune_params.regmove_cost;
8133 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8134 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8135 to = GENERAL_REGS;
8137 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8138 from = GENERAL_REGS;
8140 /* Moving between GPR and stack cost is the same as GP2GP. */
8141 if ((from == GENERAL_REGS && to == STACK_REG)
8142 || (to == GENERAL_REGS && from == STACK_REG))
8143 return regmove_cost->GP2GP;
8145 /* To/From the stack register, we move via the gprs. */
8146 if (to == STACK_REG || from == STACK_REG)
8147 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8148 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8150 if (GET_MODE_SIZE (mode) == 16)
8152 /* 128-bit operations on general registers require 2 instructions. */
8153 if (from == GENERAL_REGS && to == GENERAL_REGS)
8154 return regmove_cost->GP2GP * 2;
8155 else if (from == GENERAL_REGS)
8156 return regmove_cost->GP2FP * 2;
8157 else if (to == GENERAL_REGS)
8158 return regmove_cost->FP2GP * 2;
8160 /* When AdvSIMD instructions are disabled it is not possible to move
8161 a 128-bit value directly between Q registers. This is handled in
8162 secondary reload. A general register is used as a scratch to move
8163 the upper DI value and the lower DI value is moved directly,
8164 hence the cost is the sum of three moves. */
8165 if (! TARGET_SIMD)
8166 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8168 return regmove_cost->FP2FP;
8171 if (from == GENERAL_REGS && to == GENERAL_REGS)
8172 return regmove_cost->GP2GP;
8173 else if (from == GENERAL_REGS)
8174 return regmove_cost->GP2FP;
8175 else if (to == GENERAL_REGS)
8176 return regmove_cost->FP2GP;
8178 return regmove_cost->FP2FP;
8181 static int
8182 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8183 reg_class_t rclass ATTRIBUTE_UNUSED,
8184 bool in ATTRIBUTE_UNUSED)
8186 return aarch64_tune_params.memmov_cost;
8189 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8190 to optimize 1.0/sqrt. */
8192 static bool
8193 use_rsqrt_p (machine_mode mode)
8195 return (!flag_trapping_math
8196 && flag_unsafe_math_optimizations
8197 && ((aarch64_tune_params.approx_modes->recip_sqrt
8198 & AARCH64_APPROX_MODE (mode))
8199 || flag_mrecip_low_precision_sqrt));
8202 /* Function to decide when to use the approximate reciprocal square root
8203 builtin. */
8205 static tree
8206 aarch64_builtin_reciprocal (tree fndecl)
8208 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8210 if (!use_rsqrt_p (mode))
8211 return NULL_TREE;
8212 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8215 typedef rtx (*rsqrte_type) (rtx, rtx);
8217 /* Select reciprocal square root initial estimate insn depending on machine
8218 mode. */
8220 static rsqrte_type
8221 get_rsqrte_type (machine_mode mode)
8223 switch (mode)
8225 case E_DFmode: return gen_aarch64_rsqrtedf;
8226 case E_SFmode: return gen_aarch64_rsqrtesf;
8227 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8228 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8229 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8230 default: gcc_unreachable ();
8234 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8236 /* Select reciprocal square root series step insn depending on machine mode. */
8238 static rsqrts_type
8239 get_rsqrts_type (machine_mode mode)
8241 switch (mode)
8243 case E_DFmode: return gen_aarch64_rsqrtsdf;
8244 case E_SFmode: return gen_aarch64_rsqrtssf;
8245 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8246 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8247 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8248 default: gcc_unreachable ();
8252 /* Emit instruction sequence to compute either the approximate square root
8253 or its approximate reciprocal, depending on the flag RECP, and return
8254 whether the sequence was emitted or not. */
8256 bool
8257 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8259 machine_mode mode = GET_MODE (dst);
8261 if (GET_MODE_INNER (mode) == HFmode)
8263 gcc_assert (!recp);
8264 return false;
8267 machine_mode mmsk
8268 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
8269 GET_MODE_NUNITS (mode));
8270 if (!recp)
8272 if (!(flag_mlow_precision_sqrt
8273 || (aarch64_tune_params.approx_modes->sqrt
8274 & AARCH64_APPROX_MODE (mode))))
8275 return false;
8277 if (flag_finite_math_only
8278 || flag_trapping_math
8279 || !flag_unsafe_math_optimizations
8280 || optimize_function_for_size_p (cfun))
8281 return false;
8283 else
8284 /* Caller assumes we cannot fail. */
8285 gcc_assert (use_rsqrt_p (mode));
8288 rtx xmsk = gen_reg_rtx (mmsk);
8289 if (!recp)
8290 /* When calculating the approximate square root, compare the
8291 argument with 0.0 and create a mask. */
8292 emit_insn (gen_rtx_SET (xmsk,
8293 gen_rtx_NEG (mmsk,
8294 gen_rtx_EQ (mmsk, src,
8295 CONST0_RTX (mode)))));
8297 /* Estimate the approximate reciprocal square root. */
8298 rtx xdst = gen_reg_rtx (mode);
8299 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8301 /* Iterate over the series twice for SF and thrice for DF. */
8302 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8304 /* Optionally iterate over the series once less for faster performance
8305 at the expense of some accuracy. */
8306 if ((recp && flag_mrecip_low_precision_sqrt)
8307 || (!recp && flag_mlow_precision_sqrt))
8308 iterations--;
8310 /* Iterate over the series to calculate the approximate reciprocal square
8311 root. */
8312 rtx x1 = gen_reg_rtx (mode);
8313 while (iterations--)
8315 rtx x2 = gen_reg_rtx (mode);
8316 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8318 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8320 if (iterations > 0)
8321 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8324 if (!recp)
8326 /* Qualify the approximate reciprocal square root when the argument is
8327 0.0 by squashing the intermediary result to 0.0. */
8328 rtx xtmp = gen_reg_rtx (mmsk);
8329 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8330 gen_rtx_SUBREG (mmsk, xdst, 0)));
8331 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8333 /* Calculate the approximate square root. */
8334 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8337 /* Finalize the approximation. */
8338 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8340 return true;
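/* A minimal scalar sketch of the Newton-Raphson scheme emitted above,
   assuming hypothetical helpers frsqrte () and frsqrts () that mirror the
   FRSQRTE and FRSQRTS instructions (frsqrts (a, b) == (3.0 - a * b) / 2.0).
   The square-root variant additionally multiplies by the argument before
   the final correction, since x * (1/sqrt (x)) == sqrt (x).  */

extern double frsqrte (double);		/* Hypothetical: initial 1/sqrt estimate.  */
extern double frsqrts (double, double);	/* Hypothetical: (3.0 - a * b) / 2.0.  */

static double
approx_rsqrt_sketch (double x, int iterations)
{
  double est = frsqrte (x);		/* Initial estimate of 1/sqrt (x).  */
  double step = 1.0;
  while (iterations--)
    {
      step = frsqrts (x, est * est);	/* (3 - x * est * est) / 2.  */
      if (iterations > 0)
	est *= step;			/* Refine all but the final factor.  */
    }
  return est * step;			/* Apply the final correction.  */
}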
8343 typedef rtx (*recpe_type) (rtx, rtx);
8345 /* Select reciprocal initial estimate insn depending on machine mode. */
8347 static recpe_type
8348 get_recpe_type (machine_mode mode)
8350 switch (mode)
8352 case E_SFmode: return (gen_aarch64_frecpesf);
8353 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8354 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8355 case E_DFmode: return (gen_aarch64_frecpedf);
8356 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8357 default: gcc_unreachable ();
8361 typedef rtx (*recps_type) (rtx, rtx, rtx);
8363 /* Select reciprocal series step insn depending on machine mode. */
8365 static recps_type
8366 get_recps_type (machine_mode mode)
8368 switch (mode)
8370 case E_SFmode: return (gen_aarch64_frecpssf);
8371 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8372 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8373 case E_DFmode: return (gen_aarch64_frecpsdf);
8374 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8375 default: gcc_unreachable ();
8379 /* Emit the instruction sequence to compute the approximation for the division
8380 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8382 bool
8383 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8385 machine_mode mode = GET_MODE (quo);
8387 if (GET_MODE_INNER (mode) == HFmode)
8388 return false;
8390 bool use_approx_division_p = (flag_mlow_precision_div
8391 || (aarch64_tune_params.approx_modes->division
8392 & AARCH64_APPROX_MODE (mode)));
8394 if (!flag_finite_math_only
8395 || flag_trapping_math
8396 || !flag_unsafe_math_optimizations
8397 || optimize_function_for_size_p (cfun)
8398 || !use_approx_division_p)
8399 return false;
8401 /* Estimate the approximate reciprocal. */
8402 rtx xrcp = gen_reg_rtx (mode);
8403 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8405 /* Iterate over the series twice for SF and thrice for DF. */
8406 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8408 /* Optionally iterate over the series once less for faster performance,
8409 at the expense of some accuracy. */
8410 if (flag_mlow_precision_div)
8411 iterations--;
8413 /* Iterate over the series to calculate the approximate reciprocal. */
8414 rtx xtmp = gen_reg_rtx (mode);
8415 while (iterations--)
8417 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8419 if (iterations > 0)
8420 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8423 if (num != CONST1_RTX (mode))
8425 /* As the approximate reciprocal of DEN is already calculated, only
8426 calculate the approximate division when NUM is not 1.0. */
8427 rtx xnum = force_reg (mode, num);
8428 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8431 /* Finalize the approximation. */
8432 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8433 return true;
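/* Likewise, a minimal scalar sketch of the reciprocal refinement used for
   the approximate division above, assuming hypothetical helpers frecpe ()
   and frecps () that mirror FRECPE and FRECPS (frecps (a, b) == 2.0 - a * b).  */

extern double frecpe (double);		/* Hypothetical: initial 1/x estimate.  */
extern double frecps (double, double);	/* Hypothetical: 2.0 - a * b.  */

static double
approx_div_sketch (double num, double den, int iterations)
{
  double est = frecpe (den);		/* Initial estimate of 1/den.  */
  double step = 1.0;
  while (iterations--)
    {
      step = frecps (est, den);		/* 2 - est * den.  */
      if (iterations > 0)
	est *= step;
    }
  if (num != 1.0)
    est *= num;				/* Fold in the numerator.  */
  return est * step;			/* Apply the final correction.  */
}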
8436 /* Return the number of instructions that can be issued per cycle. */
8437 static int
8438 aarch64_sched_issue_rate (void)
8440 return aarch64_tune_params.issue_rate;
8443 static int
8444 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8446 int issue_rate = aarch64_sched_issue_rate ();
8448 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8452 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8453 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8454 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8456 static int
8457 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8458 int ready_index)
8460 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8464 /* Vectorizer cost model target hooks. */
8466 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8467 static int
8468 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8469 tree vectype,
8470 int misalign ATTRIBUTE_UNUSED)
8472 unsigned elements;
8473 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8474 bool fp = false;
8476 if (vectype != NULL)
8477 fp = FLOAT_TYPE_P (vectype);
8479 switch (type_of_cost)
8481 case scalar_stmt:
8482 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8484 case scalar_load:
8485 return costs->scalar_load_cost;
8487 case scalar_store:
8488 return costs->scalar_store_cost;
8490 case vector_stmt:
8491 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8493 case vector_load:
8494 return costs->vec_align_load_cost;
8496 case vector_store:
8497 return costs->vec_store_cost;
8499 case vec_to_scalar:
8500 return costs->vec_to_scalar_cost;
8502 case scalar_to_vec:
8503 return costs->scalar_to_vec_cost;
8505 case unaligned_load:
8506 return costs->vec_unalign_load_cost;
8508 case unaligned_store:
8509 return costs->vec_unalign_store_cost;
8511 case cond_branch_taken:
8512 return costs->cond_taken_branch_cost;
8514 case cond_branch_not_taken:
8515 return costs->cond_not_taken_branch_cost;
8517 case vec_perm:
8518 return costs->vec_permute_cost;
8520 case vec_promote_demote:
8521 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8523 case vec_construct:
8524 elements = TYPE_VECTOR_SUBPARTS (vectype);
8525 return elements / 2 + 1;
8527 default:
8528 gcc_unreachable ();
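/* As a worked example of the vec_construct case above: building a V4SF
   vector from scalars has TYPE_VECTOR_SUBPARTS == 4, so it is costed as
   4 / 2 + 1 == 3, independently of the tuning's vector cost table.  */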
8532 /* Implement targetm.vectorize.add_stmt_cost. */
8533 static unsigned
8534 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8535 struct _stmt_vec_info *stmt_info, int misalign,
8536 enum vect_cost_model_location where)
8538 unsigned *cost = (unsigned *) data;
8539 unsigned retval = 0;
8541 if (flag_vect_cost_model)
8543 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8544 int stmt_cost =
8545 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8547 /* Statements in an inner loop relative to the loop being
8548 vectorized are weighted more heavily. The value here is
8549 arbitrary and could potentially be improved with analysis. */
8550 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8551 count *= 50; /* FIXME */
8553 retval = (unsigned) (count * stmt_cost);
8554 cost[where] += retval;
8557 return retval;
8560 static void initialize_aarch64_code_model (struct gcc_options *);
8562 /* Parse the TO_PARSE string and put the architecture struct that it
8563 selects into RES and the architectural features into ISA_FLAGS.
8564 Return an aarch64_parse_opt_result describing the parse result.
8565 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8567 static enum aarch64_parse_opt_result
8568 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8569 unsigned long *isa_flags)
8571 char *ext;
8572 const struct processor *arch;
8573 char *str = (char *) alloca (strlen (to_parse) + 1);
8574 size_t len;
8576 strcpy (str, to_parse);
8578 ext = strchr (str, '+');
8580 if (ext != NULL)
8581 len = ext - str;
8582 else
8583 len = strlen (str);
8585 if (len == 0)
8586 return AARCH64_PARSE_MISSING_ARG;
8589 /* Loop through the list of supported ARCHes to find a match. */
8590 for (arch = all_architectures; arch->name != NULL; arch++)
8592 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8594 unsigned long isa_temp = arch->flags;
8596 if (ext != NULL)
8598 /* TO_PARSE string contains at least one extension. */
8599 enum aarch64_parse_opt_result ext_res
8600 = aarch64_parse_extension (ext, &isa_temp);
8602 if (ext_res != AARCH64_PARSE_OK)
8603 return ext_res;
8605 /* Extension parsing was successful. Confirm the result
8606 arch and ISA flags. */
8607 *res = arch;
8608 *isa_flags = isa_temp;
8609 return AARCH64_PARSE_OK;
8613 /* ARCH name not found in list. */
8614 return AARCH64_PARSE_INVALID_ARG;
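/* For example (the names here are purely illustrative), a -march string
   such as "armv8-a+crc" is split at the first '+': "armv8-a" is matched
   against all_architectures and the remaining "+crc" is handed to
   aarch64_parse_extension to adjust the ISA flags.  */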
8617 /* Parse the TO_PARSE string and put the CPU struct that it selects into RES
8618 and the architectural features into ISA_FLAGS. Return an aarch64_parse_opt_result
8619 describing the parse result. If there is an error parsing, RES and
8620 ISA_FLAGS are left unchanged. */
8622 static enum aarch64_parse_opt_result
8623 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8624 unsigned long *isa_flags)
8626 char *ext;
8627 const struct processor *cpu;
8628 char *str = (char *) alloca (strlen (to_parse) + 1);
8629 size_t len;
8631 strcpy (str, to_parse);
8633 ext = strchr (str, '+');
8635 if (ext != NULL)
8636 len = ext - str;
8637 else
8638 len = strlen (str);
8640 if (len == 0)
8641 return AARCH64_PARSE_MISSING_ARG;
8644 /* Loop through the list of supported CPUs to find a match. */
8645 for (cpu = all_cores; cpu->name != NULL; cpu++)
8647 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8649 unsigned long isa_temp = cpu->flags;
8652 if (ext != NULL)
8654 /* TO_PARSE string contains at least one extension. */
8655 enum aarch64_parse_opt_result ext_res
8656 = aarch64_parse_extension (ext, &isa_temp);
8658 if (ext_res != AARCH64_PARSE_OK)
8659 return ext_res;
8661 /* Extension parsing was successful. Confirm the result
8662 cpu and ISA flags. */
8663 *res = cpu;
8664 *isa_flags = isa_temp;
8665 return AARCH64_PARSE_OK;
8669 /* CPU name not found in list. */
8670 return AARCH64_PARSE_INVALID_ARG;
8673 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8674 Return an aarch64_parse_opt_result describing the parse result.
8675 If the parsing fails, RES is left unchanged. */
8677 static enum aarch64_parse_opt_result
8678 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8680 const struct processor *cpu;
8681 char *str = (char *) alloca (strlen (to_parse) + 1);
8683 strcpy (str, to_parse);
8685 /* Loop through the list of supported CPUs to find a match. */
8686 for (cpu = all_cores; cpu->name != NULL; cpu++)
8688 if (strcmp (cpu->name, str) == 0)
8690 *res = cpu;
8691 return AARCH64_PARSE_OK;
8695 /* CPU name not found in list. */
8696 return AARCH64_PARSE_INVALID_ARG;
8699 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8700 described in FLAG. If it is, return the index bit for that fusion type.
8701 If not, error (printing OPTION_NAME) and return zero. */
8703 static unsigned int
8704 aarch64_parse_one_option_token (const char *token,
8705 size_t length,
8706 const struct aarch64_flag_desc *flag,
8707 const char *option_name)
8709 for (; flag->name != NULL; flag++)
8711 if (length == strlen (flag->name)
8712 && !strncmp (flag->name, token, length))
8713 return flag->flag;
8716 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8717 return 0;
8720 /* Parse OPTION which is a comma-separated list of flags to enable.
8721 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8722 default state we inherit from the CPU tuning structures. OPTION_NAME
8723 gives the top-level option we are parsing in the -moverride string,
8724 for use in error messages. */
8726 static unsigned int
8727 aarch64_parse_boolean_options (const char *option,
8728 const struct aarch64_flag_desc *flags,
8729 unsigned int initial_state,
8730 const char *option_name)
8732 const char separator = '.';
8733 const char* specs = option;
8734 const char* ntoken = option;
8735 unsigned int found_flags = initial_state;
8737 while ((ntoken = strchr (specs, separator)))
8739 size_t token_length = ntoken - specs;
8740 unsigned token_ops = aarch64_parse_one_option_token (specs,
8741 token_length,
8742 flags,
8743 option_name);
8744 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8745 in the token stream, reset the supported operations. So:
8747 adrp+add.cmp+branch.none.adrp+add
8749 would have the result of turning on only adrp+add fusion. */
8750 if (!token_ops)
8751 found_flags = 0;
8753 found_flags |= token_ops;
8754 specs = ++ntoken;
8757 /* The string ended with a separator; report an error. */
8758 if (!(*specs))
8760 error ("%s string ill-formed\n", option_name);
8761 return 0;
8764 /* We still have one more token to parse. */
8765 size_t token_length = strlen (specs);
8766 unsigned token_ops = aarch64_parse_one_option_token (specs,
8767 token_length,
8768 flags,
8769 option_name);
8770 if (!token_ops)
8771 found_flags = 0;
8773 found_flags |= token_ops;
8774 return found_flags;
8777 /* Support for overriding instruction fusion. */
8779 static void
8780 aarch64_parse_fuse_string (const char *fuse_string,
8781 struct tune_params *tune)
8783 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8784 aarch64_fusible_pairs,
8785 tune->fusible_ops,
8786 "fuse=");
8789 /* Support for overriding other tuning flags. */
8791 static void
8792 aarch64_parse_tune_string (const char *tune_string,
8793 struct tune_params *tune)
8795 tune->extra_tuning_flags
8796 = aarch64_parse_boolean_options (tune_string,
8797 aarch64_tuning_flags,
8798 tune->extra_tuning_flags,
8799 "tune=");
8802 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8803 we understand. If it is, extract the option string and hand it off to
8804 the appropriate function. */
8806 void
8807 aarch64_parse_one_override_token (const char* token,
8808 size_t length,
8809 struct tune_params *tune)
8811 const struct aarch64_tuning_override_function *fn
8812 = aarch64_tuning_override_functions;
8814 const char *option_part = strchr (token, '=');
8815 if (!option_part)
8817 error ("tuning string missing in option (%s)", token);
8818 return;
8821 /* Get the length of the option name. */
8822 length = option_part - token;
8823 /* Skip the '=' to get to the option string. */
8824 option_part++;
8826 for (; fn->name != NULL; fn++)
8828 if (!strncmp (fn->name, token, length))
8830 fn->parse_override (option_part, tune);
8831 return;
8835 error ("unknown tuning option (%s)",token);
8836 return;
8839 /* Validate the TLS size, clamping it to the maximum supported by the code model. */
8841 static void
8842 initialize_aarch64_tls_size (struct gcc_options *opts)
8844 if (aarch64_tls_size == 0)
8845 aarch64_tls_size = 24;
8847 switch (opts->x_aarch64_cmodel_var)
8849 case AARCH64_CMODEL_TINY:
8850 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8851 needs two instructions to address, so we clamp the size to 24. */
8852 if (aarch64_tls_size > 24)
8853 aarch64_tls_size = 24;
8854 break;
8855 case AARCH64_CMODEL_SMALL:
8856 /* The maximum TLS size allowed under small is 4G. */
8857 if (aarch64_tls_size > 32)
8858 aarch64_tls_size = 32;
8859 break;
8860 case AARCH64_CMODEL_LARGE:
8861 /* The maximum TLS size allowed under large is 16E.
8862 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now. */
8863 if (aarch64_tls_size > 48)
8864 aarch64_tls_size = 48;
8865 break;
8866 default:
8867 gcc_unreachable ();
8870 return;
8873 /* Parse STRING looking for options in the format:
8874 string :: option:string
8875 option :: name=substring
8876 name :: {a-z}
8877 substring :: defined by option. */
8879 static void
8880 aarch64_parse_override_string (const char* input_string,
8881 struct tune_params* tune)
8883 const char separator = ':';
8884 size_t string_length = strlen (input_string) + 1;
8885 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8886 char *string = string_root;
8887 strncpy (string, input_string, string_length);
8888 string[string_length - 1] = '\0';
8890 char* ntoken = string;
8892 while ((ntoken = strchr (string, separator)))
8894 size_t token_length = ntoken - string;
8895 /* Make this substring look like a string. */
8896 *ntoken = '\0';
8897 aarch64_parse_one_override_token (string, token_length, tune);
8898 string = ++ntoken;
8901 /* One last option to parse. */
8902 aarch64_parse_one_override_token (string, strlen (string), tune);
8903 free (string_root);
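/* A worked example of the grammar above: the -moverride string

     fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   is first split on ':' into "fuse=adrp+add.cmp+branch" and
   "tune=rename_fma_regs".  aarch64_parse_one_override_token dispatches each
   token on the text before '=' to aarch64_parse_fuse_string or
   aarch64_parse_tune_string, which in turn split their argument on '.'
   through aarch64_parse_boolean_options.  The fusion pair names reuse the
   example in aarch64_parse_boolean_options; "rename_fma_regs" is only an
   illustrative tuning-flag name.  */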
8907 static void
8908 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8910 /* The logic here is that if we are disabling all frame pointer generation
8911 then we do not need to disable leaf frame pointer generation as a
8912 separate operation. But if we are *only* disabling leaf frame pointer
8913 generation then we set flag_omit_frame_pointer to true, but in
8914 aarch64_frame_pointer_required we return false only for leaf functions.
8916 PR 70044: We have to be careful about being called multiple times for the
8917 same function. Once we have decided to set flag_omit_frame_pointer just
8918 so that we can omit leaf frame pointers, we must then not interpret a
8919 second call as meaning that all frame pointer generation should be
8920 omitted. We do this by setting flag_omit_frame_pointer to a special,
8921 non-zero value. */
8922 if (opts->x_flag_omit_frame_pointer == 2)
8923 opts->x_flag_omit_frame_pointer = 0;
8925 if (opts->x_flag_omit_frame_pointer)
8926 opts->x_flag_omit_leaf_frame_pointer = false;
8927 else if (opts->x_flag_omit_leaf_frame_pointer)
8928 opts->x_flag_omit_frame_pointer = 2;
8930 /* If not optimizing for size, set the default
8931 alignment to what the target wants. */
8932 if (!opts->x_optimize_size)
8934 if (opts->x_align_loops <= 0)
8935 opts->x_align_loops = aarch64_tune_params.loop_align;
8936 if (opts->x_align_jumps <= 0)
8937 opts->x_align_jumps = aarch64_tune_params.jump_align;
8938 if (opts->x_align_functions <= 0)
8939 opts->x_align_functions = aarch64_tune_params.function_align;
8942 /* We default to no pc-relative literal loads. */
8944 aarch64_pcrelative_literal_loads = false;
8946 /* If -mpc-relative-literal-loads is set on the command line, this
8947 implies that the user asked for PC relative literal loads. */
8948 if (opts->x_pcrelative_literal_loads == 1)
8949 aarch64_pcrelative_literal_loads = true;
8951 /* This is PR70113. When building the Linux kernel with
8952 CONFIG_ARM64_ERRATUM_843419, support for relocations
8953 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8954 removed from the kernel to avoid loading objects with possibly
8955 offending sequences. Without -mpc-relative-literal-loads we would
8956 generate such relocations, preventing the kernel build from
8957 succeeding. */
8958 if (opts->x_pcrelative_literal_loads == 2
8959 && TARGET_FIX_ERR_A53_843419)
8960 aarch64_pcrelative_literal_loads = true;
8962 /* In the tiny memory model it makes no sense to disallow PC relative
8963 literal pool loads. */
8964 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8965 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8966 aarch64_pcrelative_literal_loads = true;
8968 /* When enabling the lower precision Newton series for the square root, also
8969 enable it for the reciprocal square root, since the latter is an
8970 intermediary step for the former. */
8971 if (flag_mlow_precision_sqrt)
8972 flag_mrecip_low_precision_sqrt = true;
8975 /* 'Unpack' the internal tuning structs and update the options
8976 in OPTS. The caller must have set up selected_tune and selected_arch
8977 as all the other target-specific codegen decisions are
8978 derived from them. */
8980 void
8981 aarch64_override_options_internal (struct gcc_options *opts)
8983 aarch64_tune_flags = selected_tune->flags;
8984 aarch64_tune = selected_tune->sched_core;
8985 /* Make a copy of the tuning parameters attached to the core, which
8986 we may later overwrite. */
8987 aarch64_tune_params = *(selected_tune->tune);
8988 aarch64_architecture_version = selected_arch->architecture_version;
8990 if (opts->x_aarch64_override_tune_string)
8991 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8992 &aarch64_tune_params);
8994 /* This target defaults to strict volatile bitfields. */
8995 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8996 opts->x_flag_strict_volatile_bitfields = 1;
8998 initialize_aarch64_code_model (opts);
8999 initialize_aarch64_tls_size (opts);
9001 int queue_depth = 0;
9002 switch (aarch64_tune_params.autoprefetcher_model)
9004 case tune_params::AUTOPREFETCHER_OFF:
9005 queue_depth = -1;
9006 break;
9007 case tune_params::AUTOPREFETCHER_WEAK:
9008 queue_depth = 0;
9009 break;
9010 case tune_params::AUTOPREFETCHER_STRONG:
9011 queue_depth = max_insn_queue_index + 1;
9012 break;
9013 default:
9014 gcc_unreachable ();
9017 /* We don't mind passing in global_options_set here as we don't use
9018 the *options_set structs anyway. */
9019 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9020 queue_depth,
9021 opts->x_param_values,
9022 global_options_set.x_param_values);
9024 /* Set up the parameters to be used in the prefetching algorithm. Do not
9025 override the defaults unless we are tuning for a core we have
9026 researched values for. */
9027 if (aarch64_tune_params.prefetch->num_slots > 0)
9028 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9029 aarch64_tune_params.prefetch->num_slots,
9030 opts->x_param_values,
9031 global_options_set.x_param_values);
9032 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9033 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9034 aarch64_tune_params.prefetch->l1_cache_size,
9035 opts->x_param_values,
9036 global_options_set.x_param_values);
9037 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9038 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9039 aarch64_tune_params.prefetch->l1_cache_line_size,
9040 opts->x_param_values,
9041 global_options_set.x_param_values);
9042 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9043 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9044 aarch64_tune_params.prefetch->l2_cache_size,
9045 opts->x_param_values,
9046 global_options_set.x_param_values);
9048 /* Enable software prefetching at the specified optimization level for
9049 CPUs that have a prefetcher. Lower the optimization level threshold by 1
9050 when profiling is enabled. */
9051 if (opts->x_flag_prefetch_loop_arrays < 0
9052 && !opts->x_optimize_size
9053 && aarch64_tune_params.prefetch->default_opt_level >= 0
9054 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9055 opts->x_flag_prefetch_loop_arrays = 1;
9057 aarch64_override_options_after_change_1 (opts);
9060 /* Print a hint with a suggestion for a core or architecture name that
9061 most closely resembles what the user passed in STR. ARCH is true if
9062 the user is asking for an architecture name. ARCH is false if the user
9063 is asking for a core name. */
9065 static void
9066 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9068 auto_vec<const char *> candidates;
9069 const struct processor *entry = arch ? all_architectures : all_cores;
9070 for (; entry->name != NULL; entry++)
9071 candidates.safe_push (entry->name);
9072 char *s;
9073 const char *hint = candidates_list_and_hint (str, s, candidates);
9074 if (hint)
9075 inform (input_location, "valid arguments are: %s;"
9076 " did you mean %qs?", s, hint);
9077 XDELETEVEC (s);
9080 /* Print a hint with a suggestion for a core name that most closely resembles
9081 what the user passed in STR. */
9083 inline static void
9084 aarch64_print_hint_for_core (const char *str)
9086 aarch64_print_hint_for_core_or_arch (str, false);
9089 /* Print a hint with a suggestion for an architecture name that most closely
9090 resembles what the user passed in STR. */
9092 inline static void
9093 aarch64_print_hint_for_arch (const char *str)
9095 aarch64_print_hint_for_core_or_arch (str, true);
9098 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9099 specified in STR and throw errors if appropriate. Put the results,
9100 if they are valid, in RES and ISA_FLAGS. Return whether the option is
9101 valid. */
9103 static bool
9104 aarch64_validate_mcpu (const char *str, const struct processor **res,
9105 unsigned long *isa_flags)
9107 enum aarch64_parse_opt_result parse_res
9108 = aarch64_parse_cpu (str, res, isa_flags);
9110 if (parse_res == AARCH64_PARSE_OK)
9111 return true;
9113 switch (parse_res)
9115 case AARCH64_PARSE_MISSING_ARG:
9116 error ("missing cpu name in %<-mcpu=%s%>", str);
9117 break;
9118 case AARCH64_PARSE_INVALID_ARG:
9119 error ("unknown value %qs for -mcpu", str);
9120 aarch64_print_hint_for_core (str);
9121 break;
9122 case AARCH64_PARSE_INVALID_FEATURE:
9123 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9124 break;
9125 default:
9126 gcc_unreachable ();
9129 return false;
9132 /* Validate a command-line -march option. Parse the arch and extensions
9133 (if any) specified in STR and throw errors if appropriate. Put the
9134 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9135 option is valid. */
9137 static bool
9138 aarch64_validate_march (const char *str, const struct processor **res,
9139 unsigned long *isa_flags)
9141 enum aarch64_parse_opt_result parse_res
9142 = aarch64_parse_arch (str, res, isa_flags);
9144 if (parse_res == AARCH64_PARSE_OK)
9145 return true;
9147 switch (parse_res)
9149 case AARCH64_PARSE_MISSING_ARG:
9150 error ("missing arch name in %<-march=%s%>", str);
9151 break;
9152 case AARCH64_PARSE_INVALID_ARG:
9153 error ("unknown value %qs for -march", str);
9154 aarch64_print_hint_for_arch (str);
9155 break;
9156 case AARCH64_PARSE_INVALID_FEATURE:
9157 error ("invalid feature modifier in %<-march=%s%>", str);
9158 break;
9159 default:
9160 gcc_unreachable ();
9163 return false;
9166 /* Validate a command-line -mtune option. Parse the cpu
9167 specified in STR and throw errors if appropriate. Put the
9168 result, if it is valid, in RES. Return whether the option is
9169 valid. */
9171 static bool
9172 aarch64_validate_mtune (const char *str, const struct processor **res)
9174 enum aarch64_parse_opt_result parse_res
9175 = aarch64_parse_tune (str, res);
9177 if (parse_res == AARCH64_PARSE_OK)
9178 return true;
9180 switch (parse_res)
9182 case AARCH64_PARSE_MISSING_ARG:
9183 error ("missing cpu name in %<-mtune=%s%>", str);
9184 break;
9185 case AARCH64_PARSE_INVALID_ARG:
9186 error ("unknown value %qs for -mtune", str);
9187 aarch64_print_hint_for_core (str);
9188 break;
9189 default:
9190 gcc_unreachable ();
9192 return false;
9195 /* Return the CPU corresponding to the enum CPU.
9196 If it doesn't specify a cpu, return the default. */
9198 static const struct processor *
9199 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9201 if (cpu != aarch64_none)
9202 return &all_cores[cpu];
9204 /* The & 0x3f is to extract the bottom 6 bits that encode the
9205 default cpu as selected by the --with-cpu GCC configure option
9206 in config.gcc.
9207 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9208 flags mechanism should be reworked to make it more sane. */
9209 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
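/* A minimal sketch of the encoding that the "& 0x3f" above and the
   "TARGET_CPU_DEFAULT >> 6" in aarch64_override_options both rely on:
   the configure-time default packs the core index into the low 6 bits
   and the default ISA flags into the bits above them.  Illustrative only.  */

static void
unpack_cpu_default_sketch (unsigned long target_cpu_default,
			   unsigned int *core_index,
			   unsigned long *default_isa_flags)
{
  *core_index = target_cpu_default & 0x3f;	/* Low 6 bits: core index.  */
  *default_isa_flags = target_cpu_default >> 6;	/* Remaining bits: ISA flags.  */
}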
9212 /* Return the architecture corresponding to the enum ARCH.
9213 If it doesn't specify a valid architecture, return the default. */
9215 static const struct processor *
9216 aarch64_get_arch (enum aarch64_arch arch)
9218 if (arch != aarch64_no_arch)
9219 return &all_architectures[arch];
9221 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9223 return &all_architectures[cpu->arch];
9226 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9227 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9228 tuning structs. In particular it must set selected_tune and
9229 aarch64_isa_flags that define the available ISA features and tuning
9230 decisions. It must also set selected_arch as this will be used to
9231 output the .arch asm tags for each function. */
9233 static void
9234 aarch64_override_options (void)
9236 unsigned long cpu_isa = 0;
9237 unsigned long arch_isa = 0;
9238 aarch64_isa_flags = 0;
9240 bool valid_cpu = true;
9241 bool valid_tune = true;
9242 bool valid_arch = true;
9244 selected_cpu = NULL;
9245 selected_arch = NULL;
9246 selected_tune = NULL;
9248 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9249 If either of -march or -mtune is given, they override their
9250 respective component of -mcpu. */
9251 if (aarch64_cpu_string)
9252 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9253 &cpu_isa);
9255 if (aarch64_arch_string)
9256 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9257 &arch_isa);
9259 if (aarch64_tune_string)
9260 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9262 /* If the user did not specify a processor, choose the default
9263 one for them. This will be the CPU set during configuration using
9264 --with-cpu, otherwise it is "generic". */
9265 if (!selected_cpu)
9267 if (selected_arch)
9269 selected_cpu = &all_cores[selected_arch->ident];
9270 aarch64_isa_flags = arch_isa;
9271 explicit_arch = selected_arch->arch;
9273 else
9275 /* Get default configure-time CPU. */
9276 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9277 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9280 if (selected_tune)
9281 explicit_tune_core = selected_tune->ident;
9283 /* If both -mcpu and -march are specified, check that they are architecturally
9284 compatible; warn if they are not, and prefer the -march ISA flags. */
9285 else if (selected_arch)
9287 if (selected_arch->arch != selected_cpu->arch)
9289 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9290 all_architectures[selected_cpu->arch].name,
9291 selected_arch->name);
9293 aarch64_isa_flags = arch_isa;
9294 explicit_arch = selected_arch->arch;
9295 explicit_tune_core = selected_tune ? selected_tune->ident
9296 : selected_cpu->ident;
9298 else
9300 /* -mcpu but no -march. */
9301 aarch64_isa_flags = cpu_isa;
9302 explicit_tune_core = selected_tune ? selected_tune->ident
9303 : selected_cpu->ident;
9304 gcc_assert (selected_cpu);
9305 selected_arch = &all_architectures[selected_cpu->arch];
9306 explicit_arch = selected_arch->arch;
9309 /* Set the arch as well, as we will need it when outputting
9310 the .arch directive in assembly. */
9311 if (!selected_arch)
9313 gcc_assert (selected_cpu);
9314 selected_arch = &all_architectures[selected_cpu->arch];
9317 if (!selected_tune)
9318 selected_tune = selected_cpu;
9320 #ifndef HAVE_AS_MABI_OPTION
9321 /* The compiler may have been configured with 2.23.* binutils, which does
9322 not have support for ILP32. */
9323 if (TARGET_ILP32)
9324 error ("Assembler does not support -mabi=ilp32");
9325 #endif
9327 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9328 sorry ("Return address signing is only supported for -mabi=lp64");
9330 /* Make sure we properly set up the explicit options. */
9331 if ((aarch64_cpu_string && valid_cpu)
9332 || (aarch64_tune_string && valid_tune))
9333 gcc_assert (explicit_tune_core != aarch64_none);
9335 if ((aarch64_cpu_string && valid_cpu)
9336 || (aarch64_arch_string && valid_arch))
9337 gcc_assert (explicit_arch != aarch64_no_arch);
9339 aarch64_override_options_internal (&global_options);
9341 /* Save these options as the default ones in case we push and pop them later
9342 while processing functions with potential target attributes. */
9343 target_option_default_node = target_option_current_node
9344 = build_target_option_node (&global_options);
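/* For example (illustrative options): with "-mcpu=cortex-a53 -march=armv8-a"
   both parsers succeed, selected_arch and the ISA flags are taken from
   -march (so extensions given only with -mcpu are dropped), tuning defaults
   to the -mcpu core, and a warning is issued only if the two disagree on
   the architecture.  */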
9347 /* Implement targetm.override_options_after_change. */
9349 static void
9350 aarch64_override_options_after_change (void)
9352 aarch64_override_options_after_change_1 (&global_options);
9355 static struct machine_function *
9356 aarch64_init_machine_status (void)
9358 struct machine_function *machine;
9359 machine = ggc_cleared_alloc<machine_function> ();
9360 return machine;
9363 void
9364 aarch64_init_expanders (void)
9366 init_machine_status = aarch64_init_machine_status;
9369 /* Select the code model to use, taking the PIC setting into account. */
9370 static void
9371 initialize_aarch64_code_model (struct gcc_options *opts)
9373 if (opts->x_flag_pic)
9375 switch (opts->x_aarch64_cmodel_var)
9377 case AARCH64_CMODEL_TINY:
9378 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9379 break;
9380 case AARCH64_CMODEL_SMALL:
9381 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9382 aarch64_cmodel = (flag_pic == 2
9383 ? AARCH64_CMODEL_SMALL_PIC
9384 : AARCH64_CMODEL_SMALL_SPIC);
9385 #else
9386 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9387 #endif
9388 break;
9389 case AARCH64_CMODEL_LARGE:
9390 sorry ("code model %qs with -f%s", "large",
9391 opts->x_flag_pic > 1 ? "PIC" : "pic");
9392 break;
9393 default:
9394 gcc_unreachable ();
9397 else
9398 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9401 /* Implement TARGET_OPTION_SAVE. */
9403 static void
9404 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9406 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9409 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9410 using the information saved in PTR. */
9412 static void
9413 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9415 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9416 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9417 opts->x_explicit_arch = ptr->x_explicit_arch;
9418 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9419 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9421 aarch64_override_options_internal (opts);
9424 /* Implement TARGET_OPTION_PRINT. */
9426 static void
9427 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9429 const struct processor *cpu
9430 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9431 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9432 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9433 std::string extension
9434 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9436 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9437 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9438 arch->name, extension.c_str ());
9441 static GTY(()) tree aarch64_previous_fndecl;
9443 void
9444 aarch64_reset_previous_fndecl (void)
9446 aarch64_previous_fndecl = NULL;
9449 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9450 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9451 make sure optab availability predicates are recomputed when necessary. */
9453 void
9454 aarch64_save_restore_target_globals (tree new_tree)
9456 if (TREE_TARGET_GLOBALS (new_tree))
9457 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9458 else if (new_tree == target_option_default_node)
9459 restore_target_globals (&default_target_globals);
9460 else
9461 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9464 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9465 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9466 of the function, if such exists. This function may be called multiple
9467 times on a single function so use aarch64_previous_fndecl to avoid
9468 setting up identical state. */
9470 static void
9471 aarch64_set_current_function (tree fndecl)
9473 if (!fndecl || fndecl == aarch64_previous_fndecl)
9474 return;
9476 tree old_tree = (aarch64_previous_fndecl
9477 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9478 : NULL_TREE);
9480 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9482 /* If current function has no attributes but the previous one did,
9483 use the default node. */
9484 if (!new_tree && old_tree)
9485 new_tree = target_option_default_node;
9487 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9488 the default have been handled by aarch64_save_restore_target_globals from
9489 aarch64_pragma_target_parse. */
9490 if (old_tree == new_tree)
9491 return;
9493 aarch64_previous_fndecl = fndecl;
9495 /* First set the target options. */
9496 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9498 aarch64_save_restore_target_globals (new_tree);
9501 /* Enum describing the various ways we can handle attributes.
9502 In many cases we can reuse the generic option handling machinery. */
9504 enum aarch64_attr_opt_type
9506 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9507 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9508 aarch64_attr_enum, /* Attribute sets an enum variable. */
9509 aarch64_attr_custom /* Attribute requires a custom handling function. */
9512 /* All the information needed to handle a target attribute.
9513 NAME is the name of the attribute.
9514 ATTR_TYPE specifies the type of behavior of the attribute as described
9515 in the definition of enum aarch64_attr_opt_type.
9516 ALLOW_NEG is true if the attribute supports a "no-" form.
9517 HANDLER is the function that takes the attribute string and whether
9518 it is a pragma or attribute and handles the option. It is needed only
9519 when the ATTR_TYPE is aarch64_attr_custom.
9520 OPT_NUM is the enum specifying the option that the attribute modifies.
9521 This is needed for attributes that mirror the behavior of a command-line
9522 option, that is, one with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9523 aarch64_attr_enum. */
9525 struct aarch64_attribute_info
9527 const char *name;
9528 enum aarch64_attr_opt_type attr_type;
9529 bool allow_neg;
9530 bool (*handler) (const char *, const char *);
9531 enum opt_code opt_num;
9534 /* Handle the ARCH_STR argument to the arch= target attribute.
9535 PRAGMA_OR_ATTR is used in potential error messages. */
9537 static bool
9538 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9540 const struct processor *tmp_arch = NULL;
9541 enum aarch64_parse_opt_result parse_res
9542 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9544 if (parse_res == AARCH64_PARSE_OK)
9546 gcc_assert (tmp_arch);
9547 selected_arch = tmp_arch;
9548 explicit_arch = selected_arch->arch;
9549 return true;
9552 switch (parse_res)
9554 case AARCH64_PARSE_MISSING_ARG:
9555 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9556 break;
9557 case AARCH64_PARSE_INVALID_ARG:
9558 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9559 aarch64_print_hint_for_arch (str);
9560 break;
9561 case AARCH64_PARSE_INVALID_FEATURE:
9562 error ("invalid feature modifier %qs for 'arch' target %s",
9563 str, pragma_or_attr);
9564 break;
9565 default:
9566 gcc_unreachable ();
9569 return false;
9572 /* Handle the argument CPU_STR to the cpu= target attribute.
9573 PRAGMA_OR_ATTR is used in potential error messages. */
9575 static bool
9576 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9578 const struct processor *tmp_cpu = NULL;
9579 enum aarch64_parse_opt_result parse_res
9580 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9582 if (parse_res == AARCH64_PARSE_OK)
9584 gcc_assert (tmp_cpu);
9585 selected_tune = tmp_cpu;
9586 explicit_tune_core = selected_tune->ident;
9588 selected_arch = &all_architectures[tmp_cpu->arch];
9589 explicit_arch = selected_arch->arch;
9590 return true;
9593 switch (parse_res)
9595 case AARCH64_PARSE_MISSING_ARG:
9596 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9597 break;
9598 case AARCH64_PARSE_INVALID_ARG:
9599 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9600 aarch64_print_hint_for_core (str);
9601 break;
9602 case AARCH64_PARSE_INVALID_FEATURE:
9603 error ("invalid feature modifier %qs for 'cpu' target %s",
9604 str, pragma_or_attr);
9605 break;
9606 default:
9607 gcc_unreachable ();
9610 return false;
9613 /* Handle the argument STR to the tune= target attribute.
9614 PRAGMA_OR_ATTR is used in potential error messages. */
9616 static bool
9617 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9619 const struct processor *tmp_tune = NULL;
9620 enum aarch64_parse_opt_result parse_res
9621 = aarch64_parse_tune (str, &tmp_tune);
9623 if (parse_res == AARCH64_PARSE_OK)
9625 gcc_assert (tmp_tune);
9626 selected_tune = tmp_tune;
9627 explicit_tune_core = selected_tune->ident;
9628 return true;
9631 switch (parse_res)
9633 case AARCH64_PARSE_INVALID_ARG:
9634 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9635 aarch64_print_hint_for_core (str);
9636 break;
9637 default:
9638 gcc_unreachable ();
9641 return false;
9644 /* Parse an architecture extensions target attribute string specified in STR.
9645 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9646 if successful. Update aarch64_isa_flags to reflect the ISA features
9647 modified.
9648 PRAGMA_OR_ATTR is used in potential error messages. */
9650 static bool
9651 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9653 enum aarch64_parse_opt_result parse_res;
9654 unsigned long isa_flags = aarch64_isa_flags;
9656 /* We allow "+nothing" in the beginning to clear out all architectural
9657 features if the user wants to handpick specific features. */
9658 if (strncmp ("+nothing", str, 8) == 0)
9660 isa_flags = 0;
9661 str += 8;
9664 parse_res = aarch64_parse_extension (str, &isa_flags);
9666 if (parse_res == AARCH64_PARSE_OK)
9668 aarch64_isa_flags = isa_flags;
9669 return true;
9672 switch (parse_res)
9674 case AARCH64_PARSE_MISSING_ARG:
9675 error ("missing feature modifier in target %s %qs",
9676 pragma_or_attr, str);
9677 break;
9679 case AARCH64_PARSE_INVALID_FEATURE:
9680 error ("invalid feature modifier in target %s %qs",
9681 pragma_or_attr, str);
9682 break;
9684 default:
9685 gcc_unreachable ();
9688 return false;
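/* As an illustration of the "+nothing" handling above (the modifier names
   reuse ones mentioned elsewhere in this file), the definition below first
   clears every architectural feature and then re-enables only "fp".  */

__attribute__ ((target ("+nothing+fp")))
static double
fp_only_add (double a, double b)
{
  return a + b;
}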
9691 /* The target attributes that we support. On top of these we also support just
9692 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9693 handled explicitly in aarch64_process_one_target_attr. */
9695 static const struct aarch64_attribute_info aarch64_attributes[] =
9697 { "general-regs-only", aarch64_attr_mask, false, NULL,
9698 OPT_mgeneral_regs_only },
9699 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9700 OPT_mfix_cortex_a53_835769 },
9701 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9702 OPT_mfix_cortex_a53_843419 },
9703 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9704 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9705 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9706 OPT_momit_leaf_frame_pointer },
9707 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9708 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9709 OPT_march_ },
9710 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9711 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9712 OPT_mtune_ },
9713 { "sign-return-address", aarch64_attr_enum, false, NULL,
9714 OPT_msign_return_address_ },
9715 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9718 /* Parse ARG_STR which contains the definition of one target attribute.
9719 Show appropriate errors if any or return true if the attribute is valid.
9720 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9721 we're processing a target attribute or pragma. */
9723 static bool
9724 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9726 bool invert = false;
9728 size_t len = strlen (arg_str);
9730 if (len == 0)
9732 error ("malformed target %s", pragma_or_attr);
9733 return false;
9736 char *str_to_check = (char *) alloca (len + 1);
9737 strcpy (str_to_check, arg_str);
9739 /* Skip leading whitespace. */
9740 while (*str_to_check == ' ' || *str_to_check == '\t')
9741 str_to_check++;
9743 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9744 It is easier to detect and handle it explicitly here rather than going
9745 through the machinery for the rest of the target attributes in this
9746 function. */
9747 if (*str_to_check == '+')
9748 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9750 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9752 invert = true;
9753 str_to_check += 3;
9755 char *arg = strchr (str_to_check, '=');
9757 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9758 and point ARG to "foo". */
9759 if (arg)
9761 *arg = '\0';
9762 arg++;
9764 const struct aarch64_attribute_info *p_attr;
9765 bool found = false;
9766 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9768 /* If the names don't match up, or the user has given an argument
9769 to an attribute that doesn't accept one, or didn't give an argument
9770 to an attribute that expects one, fail to match. */
9771 if (strcmp (str_to_check, p_attr->name) != 0)
9772 continue;
9774 found = true;
9775 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9776 || p_attr->attr_type == aarch64_attr_enum;
9778 if (attr_need_arg_p ^ (arg != NULL))
9780 error ("target %s %qs does not accept an argument",
9781 pragma_or_attr, str_to_check);
9782 return false;
9785 /* If the name matches but the attribute does not allow "no-" versions
9786 then we can't match. */
9787 if (invert && !p_attr->allow_neg)
9789 error ("target %s %qs does not allow a negated form",
9790 pragma_or_attr, str_to_check);
9791 return false;
9794 switch (p_attr->attr_type)
9796 /* Has a custom handler registered.
9797 For example, cpu=, arch=, tune=. */
9798 case aarch64_attr_custom:
9799 gcc_assert (p_attr->handler);
9800 if (!p_attr->handler (arg, pragma_or_attr))
9801 return false;
9802 break;
9804 /* Either set or unset a boolean option. */
9805 case aarch64_attr_bool:
9807 struct cl_decoded_option decoded;
9809 generate_option (p_attr->opt_num, NULL, !invert,
9810 CL_TARGET, &decoded);
9811 aarch64_handle_option (&global_options, &global_options_set,
9812 &decoded, input_location);
9813 break;
9815 /* Set or unset a bit in the target_flags. aarch64_handle_option
9816 should know what mask to apply given the option number. */
9817 case aarch64_attr_mask:
9819 struct cl_decoded_option decoded;
9820 /* We only need to specify the option number.
9821 aarch64_handle_option will know which mask to apply. */
9822 decoded.opt_index = p_attr->opt_num;
9823 decoded.value = !invert;
9824 aarch64_handle_option (&global_options, &global_options_set,
9825 &decoded, input_location);
9826 break;
9828 /* Use the option setting machinery to set an option to an enum. */
9829 case aarch64_attr_enum:
9831 gcc_assert (arg);
9832 bool valid;
9833 int value;
9834 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9835 &value, CL_TARGET);
9836 if (valid)
9838 set_option (&global_options, NULL, p_attr->opt_num, value,
9839 NULL, DK_UNSPECIFIED, input_location,
9840 global_dc);
9842 else
9844 error ("target %s %s=%s is not valid",
9845 pragma_or_attr, str_to_check, arg);
9847 break;
9849 default:
9850 gcc_unreachable ();
9854 /* If we reached here we either have found an attribute and validated
9855 it or didn't match any. If we matched an attribute but its arguments
9856 were malformed we will have returned false already. */
9857 return found;
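/* Illustrative uses of the attribute forms handled above; the attribute
   names come from the aarch64_attributes table and the argument values
   are examples only.  */

/* aarch64_attr_mask.  */
void copy_packed (void) __attribute__ ((target ("strict-align")));
/* A negated aarch64_attr_bool.  */
void keep_leaf_fp (void) __attribute__ ((target ("no-omit-leaf-frame-pointer")));
/* aarch64_attr_custom with an argument.  */
void tuned_loop (void) __attribute__ ((target ("tune=cortex-a53")));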
9860 /* Count how many times the character C appears in
9861 NULL-terminated string STR. */
9863 static unsigned int
9864 num_occurences_in_str (char c, char *str)
9866 unsigned int res = 0;
9867 while (*str != '\0')
9869 if (*str == c)
9870 res++;
9872 str++;
9875 return res;
9878 /* Parse the tree in ARGS that contains the target attribute information
9879 and update the global target options space. PRAGMA_OR_ATTR is a string
9880 to be used in error messages, specifying whether this is processing
9881 a target attribute or a target pragma. */
9883 bool
9884 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9886 if (TREE_CODE (args) == TREE_LIST)
9890 tree head = TREE_VALUE (args);
9891 if (head)
9893 if (!aarch64_process_target_attr (head, pragma_or_attr))
9894 return false;
9896 args = TREE_CHAIN (args);
9897 } while (args);
9899 return true;
9902 if (TREE_CODE (args) != STRING_CST)
9904 error ("attribute %<target%> argument not a string");
9905 return false;
9908 size_t len = strlen (TREE_STRING_POINTER (args));
9909 char *str_to_check = (char *) alloca (len + 1);
9910 strcpy (str_to_check, TREE_STRING_POINTER (args));
9912 if (len == 0)
9914 error ("malformed target %s value", pragma_or_attr);
9915 return false;
9918 /* Used to catch empty entries between commas, i.e.
9919 attribute ((target ("attr1,,attr2"))). */
9920 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9922 /* Handle multiple target attributes separated by ','. */
9923 char *token = strtok (str_to_check, ",");
9925 unsigned int num_attrs = 0;
9926 while (token)
9928 num_attrs++;
9929 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9931 error ("target %s %qs is invalid", pragma_or_attr, token);
9932 return false;
9935 token = strtok (NULL, ",");
9938 if (num_attrs != num_commas + 1)
9940 error ("malformed target %s list %qs",
9941 pragma_or_attr, TREE_STRING_POINTER (args));
9942 return false;
9945 return true;
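/* For example (illustrative values), the comma-separated list below is
   split by strtok into two attributes, each handled by
   aarch64_process_one_target_attr, while an empty entry as in
   "arch=armv8-a,,strict-align" would make num_attrs != num_commas + 1
   and be rejected as malformed.  */

void aligned_armv8_routine (void)
  __attribute__ ((target ("arch=armv8-a,strict-align")));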
9948 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9949 process attribute ((target ("..."))). */
9951 static bool
9952 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9954 struct cl_target_option cur_target;
9955 bool ret;
9956 tree old_optimize;
9957 tree new_target, new_optimize;
9958 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9960 /* If what we're processing is the current pragma string then the
9961 target option node is already stored in target_option_current_node
9962 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9963 having to re-parse the string. This is especially useful to keep
9964 arm_neon.h compile times down since that header contains a lot
9965 of intrinsics enclosed in pragmas. */
9966 if (!existing_target && args == current_target_pragma)
9968 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9969 return true;
9971 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9973 old_optimize = build_optimization_node (&global_options);
9974 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9976 /* If the function changed the optimization levels as well as setting
9977 target options, start with the optimizations specified. */
9978 if (func_optimize && func_optimize != old_optimize)
9979 cl_optimization_restore (&global_options,
9980 TREE_OPTIMIZATION (func_optimize));
9982 /* Save the current target options to restore at the end. */
9983 cl_target_option_save (&cur_target, &global_options);
9985 /* If fndecl already has some target attributes applied to it, unpack
9986 them so that we add this attribute on top of them, rather than
9987 overwriting them. */
9988 if (existing_target)
9990 struct cl_target_option *existing_options
9991 = TREE_TARGET_OPTION (existing_target);
9993 if (existing_options)
9994 cl_target_option_restore (&global_options, existing_options);
9996 else
9997 cl_target_option_restore (&global_options,
9998 TREE_TARGET_OPTION (target_option_current_node));
10001 ret = aarch64_process_target_attr (args, "attribute");
10003 /* Set up any additional state. */
10004 if (ret)
10006 aarch64_override_options_internal (&global_options);
10007 /* Initialize SIMD builtins if we haven't already.
10008 Set current_target_pragma to NULL for the duration so that
10009 the builtin initialization code doesn't try to tag the functions
10010 being built with the attributes specified by any current pragma, thus
10011 going into an infinite recursion. */
10012 if (TARGET_SIMD)
10014 tree saved_current_target_pragma = current_target_pragma;
10015 current_target_pragma = NULL;
10016 aarch64_init_simd_builtins ();
10017 current_target_pragma = saved_current_target_pragma;
10019 new_target = build_target_option_node (&global_options);
10021 else
10022 new_target = NULL;
10024 new_optimize = build_optimization_node (&global_options);
10026 if (fndecl && ret)
10028 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10030 if (old_optimize != new_optimize)
10031 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10034 cl_target_option_restore (&global_options, &cur_target);
10036 if (old_optimize != new_optimize)
10037 cl_optimization_restore (&global_options,
10038 TREE_OPTIMIZATION (old_optimize));
10039 return ret;
10042 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10043 tri-bool options (yes, no, don't care) and the default value is
10044 DEF, determine whether to reject inlining. */
10046 static bool
10047 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10048 int dont_care, int def)
10050 /* If the callee doesn't care, always allow inlining. */
10051 if (callee == dont_care)
10052 return true;
10054 /* If the caller doesn't care, always allow inlining. */
10055 if (caller == dont_care)
10056 return true;
10058 /* Otherwise, allow inlining if either the callee and caller values
10059 agree, or if the callee is using the default value. */
10060 return (callee == caller || callee == def);
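/* Worked example (illustrative, using DONT_CARE == 2 as in the calls
   below): inlining is allowed when either side is 2, when the callee's
   value matches the caller's, or when the callee's value matches DEF;
   it is rejected only when the callee's explicit setting differs from
   both the caller's and the default.  */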
10063 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10064 to inline CALLEE into CALLER based on target-specific info.
10065 Make sure that the caller and callee have compatible architectural
10066 features. Then go through the other possible target attributes
10067 and see if they can block inlining. Try not to reject always_inline
10068 callees unless they are incompatible architecturally. */
10070 static bool
10071 aarch64_can_inline_p (tree caller, tree callee)
10073 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10074 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10076 /* If callee has no option attributes, then it is ok to inline. */
10077 if (!callee_tree)
10078 return true;
10080 struct cl_target_option *caller_opts
10081 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10082 : target_option_default_node);
10084 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10087 /* Callee's ISA flags should be a subset of the caller's. */
10088 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10089 != callee_opts->x_aarch64_isa_flags)
10090 return false;
10092 /* Allow non-strict-aligned functions to be inlined into
10093 strict-aligned ones. */
10094 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10095 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10096 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10097 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10098 return false;
10100 bool always_inline = lookup_attribute ("always_inline",
10101 DECL_ATTRIBUTES (callee));
10103 /* If the architectural features match up and the callee is always_inline
10104 then the other attributes don't matter. */
10105 if (always_inline)
10106 return true;
10108 if (caller_opts->x_aarch64_cmodel_var
10109 != callee_opts->x_aarch64_cmodel_var)
10110 return false;
10112 if (caller_opts->x_aarch64_tls_dialect
10113 != callee_opts->x_aarch64_tls_dialect)
10114 return false;
10116 /* Honour explicit requests to workaround errata. */
10117 if (!aarch64_tribools_ok_for_inlining_p (
10118 caller_opts->x_aarch64_fix_a53_err835769,
10119 callee_opts->x_aarch64_fix_a53_err835769,
10120 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10121 return false;
10123 if (!aarch64_tribools_ok_for_inlining_p (
10124 caller_opts->x_aarch64_fix_a53_err843419,
10125 callee_opts->x_aarch64_fix_a53_err843419,
10126 2, TARGET_FIX_ERR_A53_843419))
10127 return false;
10129 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10130 caller and callee and they don't match up, reject inlining. */
10131 if (!aarch64_tribools_ok_for_inlining_p (
10132 caller_opts->x_flag_omit_leaf_frame_pointer,
10133 callee_opts->x_flag_omit_leaf_frame_pointer,
10134 2, 1))
10135 return false;
10137 /* If the callee has specific tuning overrides, respect them. */
10138 if (callee_opts->x_aarch64_override_tune_string != NULL
10139 && caller_opts->x_aarch64_override_tune_string == NULL)
10140 return false;
10142 /* If the user specified tuning override strings for the
10143 caller and callee and they don't match up, reject inlining.
10144 We just do a string compare here; we don't analyze the meaning
10145 of the string, as it would be too costly for little gain. */
10146 if (callee_opts->x_aarch64_override_tune_string
10147 && caller_opts->x_aarch64_override_tune_string
10148 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10149 caller_opts->x_aarch64_override_tune_string) != 0))
10150 return false;
10152 return true;
10155 /* Return true if SYMBOL_REF X binds locally. */
10157 static bool
10158 aarch64_symbol_binds_local_p (const_rtx x)
10160 return (SYMBOL_REF_DECL (x)
10161 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10162 : SYMBOL_REF_LOCAL_P (x));
10165 /* Return true if SYMBOL_REF X is thread local. */
10166 static bool
10167 aarch64_tls_symbol_p (rtx x)
10169 if (! TARGET_HAVE_TLS)
10170 return false;
10172 if (GET_CODE (x) != SYMBOL_REF)
10173 return false;
10175 return SYMBOL_REF_TLS_MODEL (x) != 0;
10178 /* Classify a TLS symbol into one of the TLS kinds. */
10179 enum aarch64_symbol_type
10180 aarch64_classify_tls_symbol (rtx x)
10182 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10184 switch (tls_kind)
10186 case TLS_MODEL_GLOBAL_DYNAMIC:
10187 case TLS_MODEL_LOCAL_DYNAMIC:
10188 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10190 case TLS_MODEL_INITIAL_EXEC:
10191 switch (aarch64_cmodel)
10193 case AARCH64_CMODEL_TINY:
10194 case AARCH64_CMODEL_TINY_PIC:
10195 return SYMBOL_TINY_TLSIE;
10196 default:
10197 return SYMBOL_SMALL_TLSIE;
10200 case TLS_MODEL_LOCAL_EXEC:
10201 if (aarch64_tls_size == 12)
10202 return SYMBOL_TLSLE12;
10203 else if (aarch64_tls_size == 24)
10204 return SYMBOL_TLSLE24;
10205 else if (aarch64_tls_size == 32)
10206 return SYMBOL_TLSLE32;
10207 else if (aarch64_tls_size == 48)
10208 return SYMBOL_TLSLE48;
10209 else
10210 gcc_unreachable ();
10212 case TLS_MODEL_EMULATED:
10213 case TLS_MODEL_NONE:
10214 return SYMBOL_FORCE_TO_MEM;
10216 default:
10217 gcc_unreachable ();
10221 /* Return the method that should be used to access SYMBOL_REF or
10222 LABEL_REF X. */
10224 enum aarch64_symbol_type
10225 aarch64_classify_symbol (rtx x, rtx offset)
10227 if (GET_CODE (x) == LABEL_REF)
10229 switch (aarch64_cmodel)
10231 case AARCH64_CMODEL_LARGE:
10232 return SYMBOL_FORCE_TO_MEM;
10234 case AARCH64_CMODEL_TINY_PIC:
10235 case AARCH64_CMODEL_TINY:
10236 return SYMBOL_TINY_ABSOLUTE;
10238 case AARCH64_CMODEL_SMALL_SPIC:
10239 case AARCH64_CMODEL_SMALL_PIC:
10240 case AARCH64_CMODEL_SMALL:
10241 return SYMBOL_SMALL_ABSOLUTE;
10243 default:
10244 gcc_unreachable ();
10248 if (GET_CODE (x) == SYMBOL_REF)
10250 if (aarch64_tls_symbol_p (x))
10251 return aarch64_classify_tls_symbol (x);
10253 switch (aarch64_cmodel)
10255 case AARCH64_CMODEL_TINY:
10256 /* When we retrieve symbol + offset address, we have to make sure
10257 the offset does not cause overflow of the final address. But
10258 we have no way of knowing the address of symbol at compile time
10259 so we can't accurately say if the distance between the PC and
10260 symbol + offset is outside the addressable range of +/-1M in the
10261 TINY code model. So we rely on images not being greater than
10262 1M, cap the offset at 1M, and require anything beyond 1M to
10263 be loaded using an alternative mechanism. Furthermore, if the
10264 symbol is a weak reference to something that isn't known to
10265 resolve to a symbol in this module, then force to memory. */
10266 if ((SYMBOL_REF_WEAK (x)
10267 && !aarch64_symbol_binds_local_p (x))
10268 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10269 return SYMBOL_FORCE_TO_MEM;
10270 return SYMBOL_TINY_ABSOLUTE;
10272 case AARCH64_CMODEL_SMALL:
10273 /* Same reasoning as the tiny code model, but the offset cap here is
10274 4G. */
10275 if ((SYMBOL_REF_WEAK (x)
10276 && !aarch64_symbol_binds_local_p (x))
10277 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10278 HOST_WIDE_INT_C (4294967264)))
10279 return SYMBOL_FORCE_TO_MEM;
10280 return SYMBOL_SMALL_ABSOLUTE;
10282 case AARCH64_CMODEL_TINY_PIC:
10283 if (!aarch64_symbol_binds_local_p (x))
10284 return SYMBOL_TINY_GOT;
10285 return SYMBOL_TINY_ABSOLUTE;
10287 case AARCH64_CMODEL_SMALL_SPIC:
10288 case AARCH64_CMODEL_SMALL_PIC:
10289 if (!aarch64_symbol_binds_local_p (x))
10290 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10291 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10292 return SYMBOL_SMALL_ABSOLUTE;
10294 case AARCH64_CMODEL_LARGE:
10295 /* This is alright even in PIC code as the constant
10296 pool reference is always PC relative and within
10297 the same translation unit. */
10298 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10299 return SYMBOL_SMALL_ABSOLUTE;
10300 else
10301 return SYMBOL_FORCE_TO_MEM;
10303 default:
10304 gcc_unreachable ();
10308 /* By default push everything into the constant pool. */
10309 return SYMBOL_FORCE_TO_MEM;
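/* For instance (illustrative only), under -mcmodel=small a reference to
   a local symbol plus a small offset classifies as
   SYMBOL_SMALL_ABSOLUTE (typically materialised with adrp/add), while
   an undefined weak symbol, or an offset outside the roughly +/-4G
   window checked above, is forced to memory as SYMBOL_FORCE_TO_MEM.  */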
10312 bool
10313 aarch64_constant_address_p (rtx x)
10315 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10318 bool
10319 aarch64_legitimate_pic_operand_p (rtx x)
10321 if (GET_CODE (x) == SYMBOL_REF
10322 || (GET_CODE (x) == CONST
10323 && GET_CODE (XEXP (x, 0)) == PLUS
10324 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10325 return false;
10327 return true;
10330 /* Return true if X holds either a quarter-precision floating-point
10331 constant or the floating-point constant +0.0. */
10332 static bool
10333 aarch64_valid_floating_const (rtx x)
10335 if (!CONST_DOUBLE_P (x))
10336 return false;
10338 /* This call determines which constants can be used in mov<mode>
10339 as integer moves instead of constant loads. */
10340 if (aarch64_float_const_rtx_p (x))
10341 return true;
10343 return aarch64_float_const_representable_p (x);
10346 static bool
10347 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10349 /* Do not allow vector struct mode constants. We could support
10350 0 and -1 easily, but they need support in aarch64-simd.md. */
10351 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10352 return false;
10354 /* For these cases we never want to use a literal load.
10355 As such we have to prevent the compiler from forcing these
10356 to memory. */
10357 if ((GET_CODE (x) == CONST_VECTOR
10358 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10359 || CONST_INT_P (x)
10360 || aarch64_valid_floating_const (x)
10361 || aarch64_can_const_movi_rtx_p (x, mode)
10362 || aarch64_float_const_rtx_p (x))
10363 return !targetm.cannot_force_const_mem (mode, x);
10365 if (GET_CODE (x) == HIGH
10366 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10367 return true;
10369 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10370 so spilling them is better than rematerialization. */
10371 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10372 return true;
10374 return aarch64_constant_address_p (x);
10378 aarch64_load_tp (rtx target)
10380 if (!target
10381 || GET_MODE (target) != Pmode
10382 || !register_operand (target, Pmode))
10383 target = gen_reg_rtx (Pmode);
10385 /* Can return in any reg. */
10386 emit_insn (gen_aarch64_load_tp_hard (target));
10387 return target;
10390 /* On AAPCS systems, this is the "struct __va_list". */
10391 static GTY(()) tree va_list_type;
10393 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10394 Return the type to use as __builtin_va_list.
10396 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10398 struct __va_list
10400 void *__stack;
10401 void *__gr_top;
10402 void *__vr_top;
10403 int __gr_offs;
10404 int __vr_offs;
10405 }; */
10407 static tree
10408 aarch64_build_builtin_va_list (void)
10410 tree va_list_name;
10411 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10413 /* Create the type. */
10414 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10415 /* Give it the required name. */
10416 va_list_name = build_decl (BUILTINS_LOCATION,
10417 TYPE_DECL,
10418 get_identifier ("__va_list"),
10419 va_list_type);
10420 DECL_ARTIFICIAL (va_list_name) = 1;
10421 TYPE_NAME (va_list_type) = va_list_name;
10422 TYPE_STUB_DECL (va_list_type) = va_list_name;
10424 /* Create the fields. */
10425 f_stack = build_decl (BUILTINS_LOCATION,
10426 FIELD_DECL, get_identifier ("__stack"),
10427 ptr_type_node);
10428 f_grtop = build_decl (BUILTINS_LOCATION,
10429 FIELD_DECL, get_identifier ("__gr_top"),
10430 ptr_type_node);
10431 f_vrtop = build_decl (BUILTINS_LOCATION,
10432 FIELD_DECL, get_identifier ("__vr_top"),
10433 ptr_type_node);
10434 f_groff = build_decl (BUILTINS_LOCATION,
10435 FIELD_DECL, get_identifier ("__gr_offs"),
10436 integer_type_node);
10437 f_vroff = build_decl (BUILTINS_LOCATION,
10438 FIELD_DECL, get_identifier ("__vr_offs"),
10439 integer_type_node);
10441 /* Tell the tree-stdarg pass about our internal offset fields.
10442 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10443 purposes, to identify whether the code is updating the va_list internal
10444 offset fields in an irregular way. */
10445 va_list_gpr_counter_field = f_groff;
10446 va_list_fpr_counter_field = f_vroff;
10448 DECL_ARTIFICIAL (f_stack) = 1;
10449 DECL_ARTIFICIAL (f_grtop) = 1;
10450 DECL_ARTIFICIAL (f_vrtop) = 1;
10451 DECL_ARTIFICIAL (f_groff) = 1;
10452 DECL_ARTIFICIAL (f_vroff) = 1;
10454 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10455 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10456 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10457 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10458 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10460 TYPE_FIELDS (va_list_type) = f_stack;
10461 DECL_CHAIN (f_stack) = f_grtop;
10462 DECL_CHAIN (f_grtop) = f_vrtop;
10463 DECL_CHAIN (f_vrtop) = f_groff;
10464 DECL_CHAIN (f_groff) = f_vroff;
10466 /* Compute its layout. */
10467 layout_type (va_list_type);
10469 return va_list_type;
10472 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10473 static void
10474 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10476 const CUMULATIVE_ARGS *cum;
10477 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10478 tree stack, grtop, vrtop, groff, vroff;
10479 tree t;
10480 int gr_save_area_size = cfun->va_list_gpr_size;
10481 int vr_save_area_size = cfun->va_list_fpr_size;
10482 int vr_offset;
10484 cum = &crtl->args.info;
10485 if (cfun->va_list_gpr_size)
10486 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10487 cfun->va_list_gpr_size);
10488 if (cfun->va_list_fpr_size)
10489 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10490 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10492 if (!TARGET_FLOAT)
10494 gcc_assert (cum->aapcs_nvrn == 0);
10495 vr_save_area_size = 0;
10498 f_stack = TYPE_FIELDS (va_list_type_node);
10499 f_grtop = DECL_CHAIN (f_stack);
10500 f_vrtop = DECL_CHAIN (f_grtop);
10501 f_groff = DECL_CHAIN (f_vrtop);
10502 f_vroff = DECL_CHAIN (f_groff);
10504 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10505 NULL_TREE);
10506 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10507 NULL_TREE);
10508 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10509 NULL_TREE);
10510 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10511 NULL_TREE);
10512 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10513 NULL_TREE);
10515 /* Emit code to initialize STACK, which points to the next varargs stack
10516 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10517 by named arguments. STACK is 8-byte aligned. */
10518 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10519 if (cum->aapcs_stack_size > 0)
10520 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10521 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10522 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10524 /* Emit code to initialize GRTOP, the top of the GR save area.
10525 virtual_incoming_args_rtx should have been 16-byte aligned. */
10526 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10527 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10528 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10530 /* Emit code to initialize VRTOP, the top of the VR save area.
10531 This address is gr_save_area_bytes below GRTOP, rounded
10532 down to the next 16-byte boundary. */
10533 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10534 vr_offset = ROUND_UP (gr_save_area_size,
10535 STACK_BOUNDARY / BITS_PER_UNIT);
10537 if (vr_offset)
10538 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10539 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10540 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10542 /* Emit code to initialize GROFF, the offset from GRTOP of the
10543 next GPR argument. */
10544 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10545 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10546 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10548 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10549 of the next VR argument. */
10550 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10551 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10552 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10555 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10557 static tree
10558 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10559 gimple_seq *post_p ATTRIBUTE_UNUSED)
10561 tree addr;
10562 bool indirect_p;
10563 bool is_ha; /* is HFA or HVA. */
10564 bool dw_align; /* double-word align. */
10565 machine_mode ag_mode = VOIDmode;
10566 int nregs;
10567 machine_mode mode;
10569 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10570 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10571 HOST_WIDE_INT size, rsize, adjust, align;
10572 tree t, u, cond1, cond2;
10574 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10575 if (indirect_p)
10576 type = build_pointer_type (type);
10578 mode = TYPE_MODE (type);
10580 f_stack = TYPE_FIELDS (va_list_type_node);
10581 f_grtop = DECL_CHAIN (f_stack);
10582 f_vrtop = DECL_CHAIN (f_grtop);
10583 f_groff = DECL_CHAIN (f_vrtop);
10584 f_vroff = DECL_CHAIN (f_groff);
10586 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10587 f_stack, NULL_TREE);
10588 size = int_size_in_bytes (type);
10589 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10591 dw_align = false;
10592 adjust = 0;
10593 if (aarch64_vfp_is_call_or_return_candidate (mode,
10594 type,
10595 &ag_mode,
10596 &nregs,
10597 &is_ha))
10599 /* TYPE passed in fp/simd registers. */
10600 if (!TARGET_FLOAT)
10601 aarch64_err_no_fpadvsimd (mode, "varargs");
10603 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10604 unshare_expr (valist), f_vrtop, NULL_TREE);
10605 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10606 unshare_expr (valist), f_vroff, NULL_TREE);
10608 rsize = nregs * UNITS_PER_VREG;
10610 if (is_ha)
10612 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10613 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10615 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10616 && size < UNITS_PER_VREG)
10618 adjust = UNITS_PER_VREG - size;
10621 else
10623 /* TYPE passed in general registers. */
10624 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10625 unshare_expr (valist), f_grtop, NULL_TREE);
10626 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10627 unshare_expr (valist), f_groff, NULL_TREE);
10628 rsize = ROUND_UP (size, UNITS_PER_WORD);
10629 nregs = rsize / UNITS_PER_WORD;
10631 if (align > 8)
10632 dw_align = true;
10634 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10635 && size < UNITS_PER_WORD)
10637 adjust = UNITS_PER_WORD - size;
10641 /* Get a local temporary for the field value. */
10642 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10644 /* Emit code to branch if off >= 0. */
10645 t = build2 (GE_EXPR, boolean_type_node, off,
10646 build_int_cst (TREE_TYPE (off), 0));
10647 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10649 if (dw_align)
10651 /* Emit: offs = (offs + 15) & -16. */
10652 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10653 build_int_cst (TREE_TYPE (off), 15));
10654 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10655 build_int_cst (TREE_TYPE (off), -16));
10656 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10658 else
10659 roundup = NULL;
10661 /* Update ap.__[g|v]r_offs */
10662 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10663 build_int_cst (TREE_TYPE (off), rsize));
10664 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10666 /* String up. */
10667 if (roundup)
10668 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10670 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10671 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10672 build_int_cst (TREE_TYPE (f_off), 0));
10673 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10675 /* String up: make sure the assignment happens before the use. */
10676 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10677 COND_EXPR_ELSE (cond1) = t;
10679 /* Prepare the trees handling the argument that is passed on the stack;
10680 the top-level node will be stored in ON_STACK. */
10681 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10682 if (align > 8)
10684 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10685 t = fold_convert (intDI_type_node, arg);
10686 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10687 build_int_cst (TREE_TYPE (t), 15));
10688 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10689 build_int_cst (TREE_TYPE (t), -16));
10690 t = fold_convert (TREE_TYPE (arg), t);
10691 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10693 else
10694 roundup = NULL;
10695 /* Advance ap.__stack */
10696 t = fold_convert (intDI_type_node, arg);
10697 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10698 build_int_cst (TREE_TYPE (t), size + 7));
10699 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10700 build_int_cst (TREE_TYPE (t), -8));
10701 t = fold_convert (TREE_TYPE (arg), t);
10702 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10703 /* String up roundup and advance. */
10704 if (roundup)
10705 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10706 /* String up with arg */
10707 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10708 /* Big-endianness related address adjustment. */
10709 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10710 && size < UNITS_PER_WORD)
10712 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10713 size_int (UNITS_PER_WORD - size));
10714 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10717 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10718 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10720 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10721 t = off;
10722 if (adjust)
10723 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10724 build_int_cst (TREE_TYPE (off), adjust));
10726 t = fold_convert (sizetype, t);
10727 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10729 if (is_ha)
10731 /* type ha; // treat as "struct {ftype field[n];}"
10732 ... [computing offs]
10733 for (i = 0; i <nregs; ++i, offs += 16)
10734 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10735 return ha; */
10736 int i;
10737 tree tmp_ha, field_t, field_ptr_t;
10739 /* Declare a local variable. */
10740 tmp_ha = create_tmp_var_raw (type, "ha");
10741 gimple_add_tmp_var (tmp_ha);
10743 /* Establish the base type. */
10744 switch (ag_mode)
10746 case E_SFmode:
10747 field_t = float_type_node;
10748 field_ptr_t = float_ptr_type_node;
10749 break;
10750 case E_DFmode:
10751 field_t = double_type_node;
10752 field_ptr_t = double_ptr_type_node;
10753 break;
10754 case E_TFmode:
10755 field_t = long_double_type_node;
10756 field_ptr_t = long_double_ptr_type_node;
10757 break;
10758 case E_HFmode:
10759 field_t = aarch64_fp16_type_node;
10760 field_ptr_t = aarch64_fp16_ptr_type_node;
10761 break;
10762 case E_V2SImode:
10763 case E_V4SImode:
10765 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10766 field_t = build_vector_type_for_mode (innertype, ag_mode);
10767 field_ptr_t = build_pointer_type (field_t);
10769 break;
10770 default:
10771 gcc_assert (0);
10774 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10775 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10776 addr = t;
10777 t = fold_convert (field_ptr_t, addr);
10778 t = build2 (MODIFY_EXPR, field_t,
10779 build1 (INDIRECT_REF, field_t, tmp_ha),
10780 build1 (INDIRECT_REF, field_t, t));
10782 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10783 for (i = 1; i < nregs; ++i)
10785 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10786 u = fold_convert (field_ptr_t, addr);
10787 u = build2 (MODIFY_EXPR, field_t,
10788 build2 (MEM_REF, field_t, tmp_ha,
10789 build_int_cst (field_ptr_t,
10790 (i *
10791 int_size_in_bytes (field_t)))),
10792 build1 (INDIRECT_REF, field_t, u));
10793 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10796 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10797 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10800 COND_EXPR_ELSE (cond2) = t;
10801 addr = fold_convert (build_pointer_type (type), cond1);
10802 addr = build_va_arg_indirect_ref (addr);
10804 if (indirect_p)
10805 addr = build_va_arg_indirect_ref (addr);
10807 return addr;
10810 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10812 static void
10813 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10814 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10815 int no_rtl)
10817 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10818 CUMULATIVE_ARGS local_cum;
10819 int gr_saved = cfun->va_list_gpr_size;
10820 int vr_saved = cfun->va_list_fpr_size;
10822 /* The caller has advanced CUM up to, but not beyond, the last named
10823 argument. Advance a local copy of CUM past the last "real" named
10824 argument, to find out how many registers are left over. */
10825 local_cum = *cum;
10826 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10828 /* Find out how many registers we need to save.
10829 Honor the tree-stdarg analysis results. */
10830 if (cfun->va_list_gpr_size)
10831 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10832 cfun->va_list_gpr_size / UNITS_PER_WORD);
10833 if (cfun->va_list_fpr_size)
10834 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10835 cfun->va_list_fpr_size / UNITS_PER_VREG);
10837 if (!TARGET_FLOAT)
10839 gcc_assert (local_cum.aapcs_nvrn == 0);
10840 vr_saved = 0;
10843 if (!no_rtl)
10845 if (gr_saved > 0)
10847 rtx ptr, mem;
10849 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10850 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10851 - gr_saved * UNITS_PER_WORD);
10852 mem = gen_frame_mem (BLKmode, ptr);
10853 set_mem_alias_set (mem, get_varargs_alias_set ());
10855 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10856 mem, gr_saved);
10858 if (vr_saved > 0)
10860 /* We can't use move_block_from_reg, because it will use
10861 the wrong mode, storing D regs only. */
10862 machine_mode mode = TImode;
10863 int off, i, vr_start;
10865 /* Set OFF to the offset from virtual_incoming_args_rtx of
10866 the first vector register. The VR save area lies below
10867 the GR one, and is aligned to 16 bytes. */
10868 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10869 STACK_BOUNDARY / BITS_PER_UNIT);
10870 off -= vr_saved * UNITS_PER_VREG;
10872 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10873 for (i = 0; i < vr_saved; ++i)
10875 rtx ptr, mem;
10877 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10878 mem = gen_frame_mem (mode, ptr);
10879 set_mem_alias_set (mem, get_varargs_alias_set ());
10880 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10881 off += UNITS_PER_VREG;
10886 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10887 any complication of having crtl->args.pretend_args_size changed. */
10888 cfun->machine->frame.saved_varargs_size
10889 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10890 STACK_BOUNDARY / BITS_PER_UNIT)
10891 + vr_saved * UNITS_PER_VREG);
10894 static void
10895 aarch64_conditional_register_usage (void)
10897 int i;
10898 if (!TARGET_FLOAT)
10900 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10902 fixed_regs[i] = 1;
10903 call_used_regs[i] = 1;
10908 /* Walk down the type tree of TYPE counting consecutive base elements.
10909 If *MODEP is VOIDmode, then set it to the first valid floating point
10910 type. If a non-floating point type is found, or if a floating point
10911 type that doesn't match a non-VOIDmode *MODEP is found, then return -1;
10912 otherwise return the count in the sub-tree. */
10913 static int
10914 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10916 machine_mode mode;
10917 HOST_WIDE_INT size;
10919 switch (TREE_CODE (type))
10921 case REAL_TYPE:
10922 mode = TYPE_MODE (type);
10923 if (mode != DFmode && mode != SFmode
10924 && mode != TFmode && mode != HFmode)
10925 return -1;
10927 if (*modep == VOIDmode)
10928 *modep = mode;
10930 if (*modep == mode)
10931 return 1;
10933 break;
10935 case COMPLEX_TYPE:
10936 mode = TYPE_MODE (TREE_TYPE (type));
10937 if (mode != DFmode && mode != SFmode
10938 && mode != TFmode && mode != HFmode)
10939 return -1;
10941 if (*modep == VOIDmode)
10942 *modep = mode;
10944 if (*modep == mode)
10945 return 2;
10947 break;
10949 case VECTOR_TYPE:
10950 /* Use V2SImode and V4SImode as representatives of all 64-bit
10951 and 128-bit vector types. */
10952 size = int_size_in_bytes (type);
10953 switch (size)
10955 case 8:
10956 mode = V2SImode;
10957 break;
10958 case 16:
10959 mode = V4SImode;
10960 break;
10961 default:
10962 return -1;
10965 if (*modep == VOIDmode)
10966 *modep = mode;
10968 /* Vector modes are considered to be opaque: two vectors are
10969 equivalent for the purposes of being homogeneous aggregates
10970 if they are the same size. */
10971 if (*modep == mode)
10972 return 1;
10974 break;
10976 case ARRAY_TYPE:
10978 int count;
10979 tree index = TYPE_DOMAIN (type);
10981 /* Can't handle incomplete types nor sizes that are not
10982 fixed. */
10983 if (!COMPLETE_TYPE_P (type)
10984 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10985 return -1;
10987 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10988 if (count == -1
10989 || !index
10990 || !TYPE_MAX_VALUE (index)
10991 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10992 || !TYPE_MIN_VALUE (index)
10993 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10994 || count < 0)
10995 return -1;
10997 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10998 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11000 /* There must be no padding. */
11001 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11002 return -1;
11004 return count;
11007 case RECORD_TYPE:
11009 int count = 0;
11010 int sub_count;
11011 tree field;
11013 /* Can't handle incomplete types nor sizes that are not
11014 fixed. */
11015 if (!COMPLETE_TYPE_P (type)
11016 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11017 return -1;
11019 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11021 if (TREE_CODE (field) != FIELD_DECL)
11022 continue;
11024 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11025 if (sub_count < 0)
11026 return -1;
11027 count += sub_count;
11030 /* There must be no padding. */
11031 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11032 return -1;
11034 return count;
11037 case UNION_TYPE:
11038 case QUAL_UNION_TYPE:
11040 /* These aren't very interesting except in a degenerate case. */
11041 int count = 0;
11042 int sub_count;
11043 tree field;
11045 /* Can't handle incomplete types nor sizes that are not
11046 fixed. */
11047 if (!COMPLETE_TYPE_P (type)
11048 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11049 return -1;
11051 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11053 if (TREE_CODE (field) != FIELD_DECL)
11054 continue;
11056 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11057 if (sub_count < 0)
11058 return -1;
11059 count = count > sub_count ? count : sub_count;
11062 /* There must be no padding. */
11063 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11064 return -1;
11066 return count;
11069 default:
11070 break;
11073 return -1;
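/* Illustrative classifications (hypothetical types, not taken from a
   testcase):

     struct { double x, y; }        -> returns 2, *MODEP == DFmode
     struct { float r; double i; }  -> returns -1 (mixed base types)
     float32x4_t v[2]               -> returns 2, *MODEP == V4SImode
                                       (vectors are classified by size only)

   A result between 1 and HA_MAX_NUM_FLDS lets the caller treat the type
   as a homogeneous aggregate.  */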
11076 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11077 type as described in AAPCS64 \S 4.1.2.
11079 See the comment above aarch64_composite_type_p for the notes on MODE. */
11081 static bool
11082 aarch64_short_vector_p (const_tree type,
11083 machine_mode mode)
11085 HOST_WIDE_INT size = -1;
11087 if (type && TREE_CODE (type) == VECTOR_TYPE)
11088 size = int_size_in_bytes (type);
11089 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11090 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11091 size = GET_MODE_SIZE (mode);
11093 return (size == 8 || size == 16);
11096 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11097 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11098 array types. The C99 floating-point complex types are also considered
11099 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11100 types, which are GCC extensions and out of the scope of AAPCS64, are
11101 treated as composite types here as well.
11103 Note that MODE itself is not sufficient in determining whether a type
11104 is such a composite type or not. This is because
11105 stor-layout.c:compute_record_mode may have already changed the MODE
11106 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11107 structure with only one field may have its MODE set to the mode of the
11108 field. Also an integer mode whose size matches the size of the
11109 RECORD_TYPE type may be used to substitute the original mode
11110 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11111 solely relied on. */
11113 static bool
11114 aarch64_composite_type_p (const_tree type,
11115 machine_mode mode)
11117 if (aarch64_short_vector_p (type, mode))
11118 return false;
11120 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11121 return true;
11123 if (mode == BLKmode
11124 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11125 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11126 return true;
11128 return false;
11131 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11132 shall be passed or returned in simd/fp register(s) (providing these
11133 parameter passing registers are available).
11135 Upon successful return, *COUNT returns the number of needed registers,
11136 *BASE_MODE returns the mode of the individual register and when IS_HAF
11137 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11138 floating-point aggregate or a homogeneous short-vector aggregate. */
11140 static bool
11141 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11142 const_tree type,
11143 machine_mode *base_mode,
11144 int *count,
11145 bool *is_ha)
11147 machine_mode new_mode = VOIDmode;
11148 bool composite_p = aarch64_composite_type_p (type, mode);
11150 if (is_ha != NULL) *is_ha = false;
11152 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11153 || aarch64_short_vector_p (type, mode))
11155 *count = 1;
11156 new_mode = mode;
11158 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11160 if (is_ha != NULL) *is_ha = true;
11161 *count = 2;
11162 new_mode = GET_MODE_INNER (mode);
11164 else if (type && composite_p)
11166 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11168 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11170 if (is_ha != NULL) *is_ha = true;
11171 *count = ag_count;
11173 else
11174 return false;
11176 else
11177 return false;
11179 *base_mode = new_mode;
11180 return true;
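/* Illustrative results (hypothetical argument types):

     double          -> *COUNT 1, *BASE_MODE DFmode, *IS_HA left false
     _Complex float  -> *COUNT 2, *BASE_MODE SFmode, *IS_HA set true
     struct { float32x2_t a, b; }
                     -> *COUNT 2, *BASE_MODE V2SImode, *IS_HA set true  */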
11183 /* Implement TARGET_STRUCT_VALUE_RTX. */
11185 static rtx
11186 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11187 int incoming ATTRIBUTE_UNUSED)
11189 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11192 /* Implements target hook vector_mode_supported_p. */
11193 static bool
11194 aarch64_vector_mode_supported_p (machine_mode mode)
11196 if (TARGET_SIMD
11197 && (mode == V4SImode || mode == V8HImode
11198 || mode == V16QImode || mode == V2DImode
11199 || mode == V2SImode || mode == V4HImode
11200 || mode == V8QImode || mode == V2SFmode
11201 || mode == V4SFmode || mode == V2DFmode
11202 || mode == V4HFmode || mode == V8HFmode
11203 || mode == V1DFmode))
11204 return true;
11206 return false;
11209 /* Return appropriate SIMD container
11210 for MODE within a vector of WIDTH bits. */
11211 static machine_mode
11212 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11214 gcc_assert (width == 64 || width == 128);
11215 if (TARGET_SIMD)
11217 if (width == 128)
11218 switch (mode)
11220 case E_DFmode:
11221 return V2DFmode;
11222 case E_SFmode:
11223 return V4SFmode;
11224 case E_HFmode:
11225 return V8HFmode;
11226 case E_SImode:
11227 return V4SImode;
11228 case E_HImode:
11229 return V8HImode;
11230 case E_QImode:
11231 return V16QImode;
11232 case E_DImode:
11233 return V2DImode;
11234 default:
11235 break;
11237 else
11238 switch (mode)
11240 case E_SFmode:
11241 return V2SFmode;
11242 case E_HFmode:
11243 return V4HFmode;
11244 case E_SImode:
11245 return V2SImode;
11246 case E_HImode:
11247 return V4HImode;
11248 case E_QImode:
11249 return V8QImode;
11250 default:
11251 break;
11254 return word_mode;
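/* For example, SImode in a 128-bit container maps to V4SImode and in a
   64-bit container to V2SImode; HFmode maps to V8HFmode or V4HFmode.
   Modes with no SIMD container here (or compilation without
   TARGET_SIMD) fall back to word_mode, i.e. DImode on AArch64.  */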
11257 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11258 static machine_mode
11259 aarch64_preferred_simd_mode (machine_mode mode)
11261 return aarch64_simd_container_mode (mode, 128);
11264 /* Return the bitmask of possible vector sizes for the vectorizer
11265 to iterate over. */
11266 static unsigned int
11267 aarch64_autovectorize_vector_sizes (void)
11269 return (16 | 8);
11272 /* Implement TARGET_MANGLE_TYPE. */
11274 static const char *
11275 aarch64_mangle_type (const_tree type)
11277 /* The AArch64 ABI documents say that "__va_list" has to be
11278 mangled as if it is in the "std" namespace. */
11279 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11280 return "St9__va_list";
11282 /* Half-precision float. */
11283 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11284 return "Dh";
11286 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11287 builtin types. */
11288 if (TYPE_NAME (type) != NULL)
11289 return aarch64_mangle_builtin_type (type);
11291 /* Use the default mangling. */
11292 return NULL;
11295 /* Find the first rtx_insn before insn that will generate an assembly
11296 instruction. */
11298 static rtx_insn *
11299 aarch64_prev_real_insn (rtx_insn *insn)
11301 if (!insn)
11302 return NULL;
11306 insn = prev_real_insn (insn);
11308 while (insn && recog_memoized (insn) < 0);
11310 return insn;
11313 static bool
11314 is_madd_op (enum attr_type t1)
11316 unsigned int i;
11317 /* A number of these may be AArch32 only. */
11318 enum attr_type mlatypes[] = {
11319 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11320 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11321 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11324 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11326 if (t1 == mlatypes[i])
11327 return true;
11330 return false;
11333 /* Check if there is a register dependency between a load and the insn
11334 for which we hold recog_data. */
11336 static bool
11337 dep_between_memop_and_curr (rtx memop)
11339 rtx load_reg;
11340 int opno;
11342 gcc_assert (GET_CODE (memop) == SET);
11344 if (!REG_P (SET_DEST (memop)))
11345 return false;
11347 load_reg = SET_DEST (memop);
11348 for (opno = 1; opno < recog_data.n_operands; opno++)
11350 rtx operand = recog_data.operand[opno];
11351 if (REG_P (operand)
11352 && reg_overlap_mentioned_p (load_reg, operand))
11353 return true;
11356 return false;
11360 /* When working around the Cortex-A53 erratum 835769,
11361 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11362 instruction and has a preceding memory instruction such that a NOP
11363 should be inserted between them. */
11365 bool
11366 aarch64_madd_needs_nop (rtx_insn* insn)
11368 enum attr_type attr_type;
11369 rtx_insn *prev;
11370 rtx body;
11372 if (!TARGET_FIX_ERR_A53_835769)
11373 return false;
11375 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11376 return false;
11378 attr_type = get_attr_type (insn);
11379 if (!is_madd_op (attr_type))
11380 return false;
11382 prev = aarch64_prev_real_insn (insn);
11383 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11384 Restore recog state to INSN to avoid state corruption. */
11385 extract_constrain_insn_cached (insn);
11387 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11388 return false;
11390 body = single_set (prev);
11392 /* If the previous insn is a memory op and there is no dependency between
11393 it and the DImode madd, emit a NOP between them. If body is NULL then we
11394 have a complex memory operation, probably a load/store pair.
11395 Be conservative for now and emit a NOP. */
11396 if (GET_MODE (recog_data.operand[0]) == DImode
11397 && (!body || !dep_between_memop_and_curr (body)))
11398 return true;
11400 return false;
11405 /* Implement FINAL_PRESCAN_INSN. */
11407 void
11408 aarch64_final_prescan_insn (rtx_insn *insn)
11410 if (aarch64_madd_needs_nop (insn))
11411 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
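/* Illustrative output (hypothetical surrounding instructions, built with
   -mfix-cortex-a53-835769):

     ldr  x2, [x0]
     nop  // between mem op and mult-accumulate
     madd x3, x4, x5, x6

   The nop is emitted only when the preceding memory access and the
   64-bit multiply-accumulate would otherwise be adjacent.  */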
11415 /* Return the equivalent letter for size. */
11416 static char
11417 sizetochar (int size)
11419 switch (size)
11421 case 64: return 'd';
11422 case 32: return 's';
11423 case 16: return 'h';
11424 case 8 : return 'b';
11425 default: gcc_unreachable ();
11429 /* Return true iff x is a uniform vector of floating-point
11430 constants, and the constant can be represented in
11431 quarter-precision form. Note that, as aarch64_float_const_representable_p
11432 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11433 static bool
11434 aarch64_vect_float_const_representable_p (rtx x)
11436 rtx elt;
11437 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11438 && const_vec_duplicate_p (x, &elt)
11439 && aarch64_float_const_representable_p (elt));
11442 /* Return true for valid and false for invalid. */
11443 bool
11444 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11445 struct simd_immediate_info *info)
11447 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11448 matches = 1; \
11449 for (i = 0; i < idx; i += (STRIDE)) \
11450 if (!(TEST)) \
11451 matches = 0; \
11452 if (matches) \
11454 immtype = (CLASS); \
11455 elsize = (ELSIZE); \
11456 eshift = (SHIFT); \
11457 emvn = (NEG); \
11458 break; \
11461 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11462 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11463 unsigned char bytes[16];
11464 int immtype = -1, matches;
11465 unsigned int invmask = inverse ? 0xff : 0;
11466 int eshift, emvn;
11468 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11470 if (! (aarch64_simd_imm_zero_p (op, mode)
11471 || aarch64_vect_float_const_representable_p (op)))
11472 return false;
11474 if (info)
11476 info->value = CONST_VECTOR_ELT (op, 0);
11477 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11478 info->mvn = false;
11479 info->shift = 0;
11482 return true;
11485 /* Splat vector constant out into a byte vector. */
11486 for (i = 0; i < n_elts; i++)
11488 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11489 it must be laid out in the vector register in reverse order. */
11490 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11491 unsigned HOST_WIDE_INT elpart;
11493 gcc_assert (CONST_INT_P (el));
11494 elpart = INTVAL (el);
11496 for (unsigned int byte = 0; byte < innersize; byte++)
11498 bytes[idx++] = (elpart & 0xff) ^ invmask;
11499 elpart >>= BITS_PER_UNIT;
11504 /* Sanity check. */
11505 gcc_assert (idx == GET_MODE_SIZE (mode));
11509 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11510 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11512 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11513 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11515 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11516 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11518 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11519 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11521 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11523 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11525 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11526 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11528 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11529 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11531 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11532 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11534 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11535 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11537 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11539 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11541 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11542 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11544 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11545 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11547 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11548 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11550 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11551 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11553 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11555 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11556 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11558 while (0);
11560 if (immtype == -1)
11561 return false;
11563 if (info)
11565 info->element_width = elsize;
11566 info->mvn = emvn != 0;
11567 info->shift = eshift;
11569 unsigned HOST_WIDE_INT imm = 0;
11571 if (immtype >= 12 && immtype <= 15)
11572 info->msl = true;
11574 /* Un-invert bytes of recognized vector, if necessary. */
11575 if (invmask != 0)
11576 for (i = 0; i < idx; i++)
11577 bytes[i] ^= invmask;
11579 if (immtype == 17)
11581 /* FIXME: Broken on 32-bit H_W_I hosts. */
11582 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11584 for (i = 0; i < 8; i++)
11585 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11586 << (i * BITS_PER_UNIT);
11589 info->value = GEN_INT (imm);
11591 else
11593 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11594 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11596 /* Construct 'abcdefgh' because the assembler cannot handle
11597 generic constants. */
11598 if (info->mvn)
11599 imm = ~imm;
11600 imm = (imm >> info->shift) & 0xff;
11601 info->value = GEN_INT (imm);
11605 return true;
11606 #undef CHECK
11609 /* Check if immediate shift constants are within range. */
11610 bool
11611 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11613 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11614 if (left)
11615 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11616 else
11617 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11620 /* Return true if X is a uniform vector where all elements
11621 are either the floating-point constant 0.0 or the
11622 integer constant 0. */
11623 bool
11624 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11626 return x == CONST0_RTX (mode);
11630 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11631 operation of width WIDTH at bit position POS. */
11634 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11636 gcc_assert (CONST_INT_P (width));
11637 gcc_assert (CONST_INT_P (pos));
11639 unsigned HOST_WIDE_INT mask
11640 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11641 return GEN_INT (mask << UINTVAL (pos));
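/* Worked example: for a zero_extract of WIDTH 8 at POS 16 this yields
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. a CONST_INT selecting
   bits 16..23.  */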
11644 bool
11645 aarch64_mov_operand_p (rtx x, machine_mode mode)
11647 if (GET_CODE (x) == HIGH
11648 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11649 return true;
11651 if (CONST_INT_P (x))
11652 return true;
11654 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11655 return true;
11657 return aarch64_classify_symbolic_expression (x)
11658 == SYMBOL_TINY_ABSOLUTE;
11661 /* Return a CONST_VECTOR in which every element is the CONST_INT VAL. */
11663 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11665 int nunits = GET_MODE_NUNITS (mode);
11666 rtvec v = rtvec_alloc (nunits);
11667 int i;
11669 rtx cache = GEN_INT (val);
11671 for (i = 0; i < nunits; i++)
11672 RTVEC_ELT (v, i) = cache;
11674 return gen_rtx_CONST_VECTOR (mode, v);
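/* For example, aarch64_simd_gen_const_vector_dup (V4SImode, 7) builds
   (const_vector:V4SI [7 7 7 7]), sharing a single CONST_INT across all
   four elements.  */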
11677 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11679 bool
11680 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11682 machine_mode vmode;
11684 gcc_assert (!VECTOR_MODE_P (mode));
11685 vmode = aarch64_preferred_simd_mode (mode);
11686 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11687 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11690 /* Construct and return a PARALLEL RTX vector with elements numbering the
11691 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11692 the vector - from the perspective of the architecture. This does not
11693 line up with GCC's perspective on lane numbers, so we end up with
11694 different masks depending on our target endian-ness. The diagram
11695 below may help. We must draw the distinction when building masks
11696 which select one half of the vector. An instruction selecting
11697 architectural low-lanes for a big-endian target must be described using
11698 a mask selecting GCC high-lanes.
11700 Big-Endian Little-Endian
11702 GCC 0 1 2 3 3 2 1 0
11703 | x | x | x | x | | x | x | x | x |
11704 Architecture 3 2 1 0 3 2 1 0
11706 Low Mask: { 2, 3 } { 0, 1 }
11707 High Mask: { 0, 1 } { 2, 3 }
11711 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11713 int nunits = GET_MODE_NUNITS (mode);
11714 rtvec v = rtvec_alloc (nunits / 2);
11715 int high_base = nunits / 2;
11716 int low_base = 0;
11717 int base;
11718 rtx t1;
11719 int i;
11721 if (BYTES_BIG_ENDIAN)
11722 base = high ? low_base : high_base;
11723 else
11724 base = high ? high_base : low_base;
11726 for (i = 0; i < nunits / 2; i++)
11727 RTVEC_ELT (v, i) = GEN_INT (base + i);
11729 t1 = gen_rtx_PARALLEL (mode, v);
11730 return t1;
11733 /* Check OP for validity as a PARALLEL RTX vector with elements
11734 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11735 from the perspective of the architecture. See the diagram above
11736 aarch64_simd_vect_par_cnst_half for more details. */
11738 bool
11739 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11740 bool high)
11742 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11743 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11744 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11745 int i = 0;
11747 if (!VECTOR_MODE_P (mode))
11748 return false;
11750 if (count_op != count_ideal)
11751 return false;
11753 for (i = 0; i < count_ideal; i++)
11755 rtx elt_op = XVECEXP (op, 0, i);
11756 rtx elt_ideal = XVECEXP (ideal, 0, i);
11758 if (!CONST_INT_P (elt_op)
11759 || INTVAL (elt_ideal) != INTVAL (elt_op))
11760 return false;
11762 return true;
11765 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11766 HIGH (exclusive). */
11767 void
11768 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11769 const_tree exp)
11771 HOST_WIDE_INT lane;
11772 gcc_assert (CONST_INT_P (operand));
11773 lane = INTVAL (operand);
11775 if (lane < low || lane >= high)
11777 if (exp)
11778 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11779 else
11780 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11784 /* Return TRUE if OP is a valid vector addressing mode. */
11785 bool
11786 aarch64_simd_mem_operand_p (rtx op)
11788 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11789 || REG_P (XEXP (op, 0)));
11792 /* Emit a register copy from operand to operand, taking care not to
11793 early-clobber source registers in the process.
11795 COUNT is the number of components into which the copy needs to be
11796 decomposed. */
11797 void
11798 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11799 unsigned int count)
11801 unsigned int i;
11802 int rdest = REGNO (operands[0]);
11803 int rsrc = REGNO (operands[1]);
11805 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11806 || rdest < rsrc)
11807 for (i = 0; i < count; i++)
11808 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11809 gen_rtx_REG (mode, rsrc + i));
11810 else
11811 for (i = 0; i < count; i++)
11812 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11813 gen_rtx_REG (mode, rsrc + count - i - 1));
11816 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11817 one of VSTRUCT modes: OI, CI, or XI. */
11819 aarch64_simd_attr_length_rglist (machine_mode mode)
11821 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11824 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11825 alignment of a vector to 128 bits. */
11826 static HOST_WIDE_INT
11827 aarch64_simd_vector_alignment (const_tree type)
11829 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11830 return MIN (align, 128);
11833 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11834 static bool
11835 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11837 if (is_packed)
11838 return false;
11840 /* We guarantee alignment for vectors up to 128 bits. */
11841 if (tree_int_cst_compare (TYPE_SIZE (type),
11842 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11843 return false;
11845 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11846 return true;
11849 /* Return true if the vector misalignment factor is supported by the
11850 target. */
11851 static bool
11852 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11853 const_tree type, int misalignment,
11854 bool is_packed)
11856 if (TARGET_SIMD && STRICT_ALIGNMENT)
11859 /* Return false if the movmisalign pattern is not supported for this mode. */
11859 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11860 return false;
11862 if (misalignment == -1)
11864 /* Misalignment factor is unknown at compile time but we know
11865 it's word aligned. */
11866 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11868 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11870 if (element_size != 64)
11871 return true;
11873 return false;
11876 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11877 is_packed);
11880 /* If VALS is a vector constant that can be loaded into a register
11881 using DUP, generate instructions to do so and return an RTX to
11882 assign to the register. Otherwise return NULL_RTX. */
11883 static rtx
11884 aarch64_simd_dup_constant (rtx vals)
11886 machine_mode mode = GET_MODE (vals);
11887 machine_mode inner_mode = GET_MODE_INNER (mode);
11888 rtx x;
11890 if (!const_vec_duplicate_p (vals, &x))
11891 return NULL_RTX;
11893 /* We can load this constant by using DUP and a constant in a
11894 single general-purpose register. This will be cheaper than a vector
11895 load. */
11896 x = copy_to_mode_reg (inner_mode, x);
11897 return gen_rtx_VEC_DUPLICATE (mode, x);
11901 /* Generate code to load VALS, which is a PARALLEL containing only
11902 constants (for vec_init) or CONST_VECTOR, efficiently into a
11903 register. Returns an RTX to copy into the register, or NULL_RTX
11904 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11905 static rtx
11906 aarch64_simd_make_constant (rtx vals)
11908 machine_mode mode = GET_MODE (vals);
11909 rtx const_dup;
11910 rtx const_vec = NULL_RTX;
11911 int n_elts = GET_MODE_NUNITS (mode);
11912 int n_const = 0;
11913 int i;
11915 if (GET_CODE (vals) == CONST_VECTOR)
11916 const_vec = vals;
11917 else if (GET_CODE (vals) == PARALLEL)
11919 /* A CONST_VECTOR must contain only CONST_INTs and
11920 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11921 Only store valid constants in a CONST_VECTOR. */
11922 for (i = 0; i < n_elts; ++i)
11924 rtx x = XVECEXP (vals, 0, i);
11925 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11926 n_const++;
11928 if (n_const == n_elts)
11929 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11931 else
11932 gcc_unreachable ();
11934 if (const_vec != NULL_RTX
11935 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11936 /* Load using MOVI/MVNI. */
11937 return const_vec;
11938 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11939 /* Loaded using DUP. */
11940 return const_dup;
11941 else if (const_vec != NULL_RTX)
11942 /* Load from constant pool. We cannot take advantage of single-cycle
11943 LD1 because we need a PC-relative addressing mode. */
11944 return const_vec;
11945 else
11946 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11947 We cannot construct an initializer. */
11948 return NULL_RTX;
11951 /* Expand a vector initialisation sequence, such that TARGET is
11952 initialised to contain VALS. */
11954 void
11955 aarch64_expand_vector_init (rtx target, rtx vals)
11957 machine_mode mode = GET_MODE (target);
11958 machine_mode inner_mode = GET_MODE_INNER (mode);
11959 /* The number of vector elements. */
11960 int n_elts = GET_MODE_NUNITS (mode);
11961 /* The number of vector elements which are not constant. */
11962 int n_var = 0;
11963 rtx any_const = NULL_RTX;
11964 /* The first element of vals. */
11965 rtx v0 = XVECEXP (vals, 0, 0);
11966 bool all_same = true;
11968 /* Count the number of variable elements to initialise. */
11969 for (int i = 0; i < n_elts; ++i)
11971 rtx x = XVECEXP (vals, 0, i);
11972 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11973 ++n_var;
11974 else
11975 any_const = x;
11977 all_same &= rtx_equal_p (x, v0);
11980 /* No variable elements: hand off to aarch64_simd_make_constant, which knows
11981 how best to handle this. */
11982 if (n_var == 0)
11984 rtx constant = aarch64_simd_make_constant (vals);
11985 if (constant != NULL_RTX)
11987 emit_move_insn (target, constant);
11988 return;
11992 /* Splat a single non-constant element if we can. */
11993 if (all_same)
11995 rtx x = copy_to_mode_reg (inner_mode, v0);
11996 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11997 return;
12000 enum insn_code icode = optab_handler (vec_set_optab, mode);
12001 gcc_assert (icode != CODE_FOR_nothing);
12003 /* If there are only variable elements, try to optimize
12004 the insertion using dup for the most common element
12005 followed by insertions. */
12007 /* The algorithm will fill matches[*][0] with the earliest matching element,
12008 and matches[X][1] with the count of duplicate elements (if X is the
12009 earliest element which has duplicates). */
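/* As a worked example: for VALS {A, B, A, A} the loop below produces
   matches[0] = {0, 3}, matches[1] = {1, 1}, matches[2] = {0, 0} and
   matches[3] = {0, 0}, so MAXELEMENT becomes 0; A is broadcast with DUP
   and B is then inserted into lane 1.  */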
12011 if (n_var == n_elts && n_elts <= 16)
12013 int matches[16][2] = {0};
12014 for (int i = 0; i < n_elts; i++)
12016 for (int j = 0; j <= i; j++)
12018 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12020 matches[i][0] = j;
12021 matches[j][1]++;
12022 break;
12026 int maxelement = 0;
12027 int maxv = 0;
12028 for (int i = 0; i < n_elts; i++)
12029 if (matches[i][1] > maxv)
12031 maxelement = i;
12032 maxv = matches[i][1];
12035 /* Create a duplicate of the most common element. */
12036 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12037 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12039 /* Insert the rest. */
12040 for (int i = 0; i < n_elts; i++)
12042 rtx x = XVECEXP (vals, 0, i);
12043 if (matches[i][0] == maxelement)
12044 continue;
12045 x = copy_to_mode_reg (inner_mode, x);
12046 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12048 return;
12051 /* Initialise a vector which is part-variable. We want to first try
12052 to build those lanes which are constant in the most efficient way we
12053 can. */
12054 if (n_var != n_elts)
12056 rtx copy = copy_rtx (vals);
12058 /* Load constant part of vector. We really don't care what goes into the
12059 parts we will overwrite, but we're more likely to be able to load the
12060 constant efficiently if it has fewer, larger, repeating parts
12061 (see aarch64_simd_valid_immediate). */
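/* For example, with VALS {X, 1, X, 1} (X variable) the XOR-distance search
   below turns COPY into {1, 1, 1, 1}, which can be loaded with a single
   MOVI/DUP before the variable lanes are inserted.  */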
12062 for (int i = 0; i < n_elts; i++)
12064 rtx x = XVECEXP (vals, 0, i);
12065 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12066 continue;
12067 rtx subst = any_const;
12068 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12070 /* Look in the copied vector, as more elements are const. */
12071 rtx test = XVECEXP (copy, 0, i ^ bit);
12072 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12074 subst = test;
12075 break;
12078 XVECEXP (copy, 0, i) = subst;
12080 aarch64_expand_vector_init (target, copy);
12083 /* Insert the variable lanes directly. */
12084 for (int i = 0; i < n_elts; i++)
12086 rtx x = XVECEXP (vals, 0, i);
12087 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12088 continue;
12089 x = copy_to_mode_reg (inner_mode, x);
12090 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12094 static unsigned HOST_WIDE_INT
12095 aarch64_shift_truncation_mask (machine_mode mode)
12097 return
12098 (!SHIFT_COUNT_TRUNCATED
12099 || aarch64_vector_mode_supported_p (mode)
12100 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12103 /* Select a format to encode pointers in exception handling data. */
12105 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12107 int type;
12108 switch (aarch64_cmodel)
12110 case AARCH64_CMODEL_TINY:
12111 case AARCH64_CMODEL_TINY_PIC:
12112 case AARCH64_CMODEL_SMALL:
12113 case AARCH64_CMODEL_SMALL_PIC:
12114 case AARCH64_CMODEL_SMALL_SPIC:
12115 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
12116 for everything. */
12117 type = DW_EH_PE_sdata4;
12118 break;
12119 default:
12120 /* No assumptions here. 8-byte relocs required. */
12121 type = DW_EH_PE_sdata8;
12122 break;
12124 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12127 /* The last .arch and .tune assembly strings that we printed. */
12128 static std::string aarch64_last_printed_arch_string;
12129 static std::string aarch64_last_printed_tune_string;
12131 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12132 by the function fndecl. */
12134 void
12135 aarch64_declare_function_name (FILE *stream, const char* name,
12136 tree fndecl)
12138 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12140 struct cl_target_option *targ_options;
12141 if (target_parts)
12142 targ_options = TREE_TARGET_OPTION (target_parts);
12143 else
12144 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12145 gcc_assert (targ_options);
12147 const struct processor *this_arch
12148 = aarch64_get_arch (targ_options->x_explicit_arch);
12150 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12151 std::string extension
12152 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12153 this_arch->flags);
12154 /* Only update the assembler .arch string if it is distinct from the last
12155 such string we printed. */
12156 std::string to_print = this_arch->name + extension;
12157 if (to_print != aarch64_last_printed_arch_string)
12159 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12160 aarch64_last_printed_arch_string = to_print;
12163 /* Print the cpu name we're tuning for in the comments; this might be
12164 useful to readers of the generated asm. Do it only when it changes
12165 from function to function and verbose assembly is requested. */
12166 const struct processor *this_tune
12167 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12169 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12171 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12172 this_tune->name);
12173 aarch64_last_printed_tune_string = this_tune->name;
12176 /* Don't forget the type directive for ELF. */
12177 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12178 ASM_OUTPUT_LABEL (stream, name);
12181 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12183 static void
12184 aarch64_start_file (void)
12186 struct cl_target_option *default_options
12187 = TREE_TARGET_OPTION (target_option_default_node);
12189 const struct processor *default_arch
12190 = aarch64_get_arch (default_options->x_explicit_arch);
12191 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12192 std::string extension
12193 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12194 default_arch->flags);
12196 aarch64_last_printed_arch_string = default_arch->name + extension;
12197 aarch64_last_printed_tune_string = "";
12198 asm_fprintf (asm_out_file, "\t.arch %s\n",
12199 aarch64_last_printed_arch_string.c_str ());
12201 default_file_start ();
12204 /* Emit load exclusive. */
12206 static void
12207 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12208 rtx mem, rtx model_rtx)
12210 rtx (*gen) (rtx, rtx, rtx);
12212 switch (mode)
12214 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12215 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12216 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12217 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12218 default:
12219 gcc_unreachable ();
12222 emit_insn (gen (rval, mem, model_rtx));
12225 /* Emit store exclusive. */
12227 static void
12228 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12229 rtx rval, rtx mem, rtx model_rtx)
12231 rtx (*gen) (rtx, rtx, rtx, rtx);
12233 switch (mode)
12235 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12236 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12237 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12238 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12239 default:
12240 gcc_unreachable ();
12243 emit_insn (gen (bval, rval, mem, model_rtx));
12246 /* Mark the previous jump instruction as unlikely. */
12248 static void
12249 aarch64_emit_unlikely_jump (rtx insn)
12251 rtx_insn *jump = emit_jump_insn (insn);
12252 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12255 /* Expand a compare and swap pattern. */
12257 void
12258 aarch64_expand_compare_and_swap (rtx operands[])
12260 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12261 machine_mode mode, cmp_mode;
12262 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12263 int idx;
12264 gen_cas_fn gen;
12265 const gen_cas_fn split_cas[] =
12267 gen_aarch64_compare_and_swapqi,
12268 gen_aarch64_compare_and_swaphi,
12269 gen_aarch64_compare_and_swapsi,
12270 gen_aarch64_compare_and_swapdi
12272 const gen_cas_fn atomic_cas[] =
12274 gen_aarch64_compare_and_swapqi_lse,
12275 gen_aarch64_compare_and_swaphi_lse,
12276 gen_aarch64_compare_and_swapsi_lse,
12277 gen_aarch64_compare_and_swapdi_lse
12280 bval = operands[0];
12281 rval = operands[1];
12282 mem = operands[2];
12283 oldval = operands[3];
12284 newval = operands[4];
12285 is_weak = operands[5];
12286 mod_s = operands[6];
12287 mod_f = operands[7];
12288 mode = GET_MODE (mem);
12289 cmp_mode = mode;
12291 /* Normally the succ memory model must be stronger than fail, but in the
12292 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12293 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
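/* For instance, a call such as
   __atomic_compare_exchange_n (p, &expected, desired, 0,
                                __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
   reaches this point with MOD_S == RELEASE and MOD_F == ACQUIRE, and MOD_S
   is upgraded to ACQ_REL below.  */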
12295 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12296 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12297 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12299 switch (mode)
12301 case E_QImode:
12302 case E_HImode:
12303 /* For short modes, we're going to perform the comparison in SImode,
12304 so do the zero-extension now. */
12305 cmp_mode = SImode;
12306 rval = gen_reg_rtx (SImode);
12307 oldval = convert_modes (SImode, mode, oldval, true);
12308 /* Fall through. */
12310 case E_SImode:
12311 case E_DImode:
12312 /* Force the value into a register if needed. */
12313 if (!aarch64_plus_operand (oldval, mode))
12314 oldval = force_reg (cmp_mode, oldval);
12315 break;
12317 default:
12318 gcc_unreachable ();
12321 switch (mode)
12323 case E_QImode: idx = 0; break;
12324 case E_HImode: idx = 1; break;
12325 case E_SImode: idx = 2; break;
12326 case E_DImode: idx = 3; break;
12327 default:
12328 gcc_unreachable ();
12330 if (TARGET_LSE)
12331 gen = atomic_cas[idx];
12332 else
12333 gen = split_cas[idx];
12335 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12337 if (mode == QImode || mode == HImode)
12338 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12340 x = gen_rtx_REG (CCmode, CC_REGNUM);
12341 x = gen_rtx_EQ (SImode, x, const0_rtx);
12342 emit_insn (gen_rtx_SET (bval, x));
12345 /* Test whether the target supports using an atomic load-operate instruction.
12346 CODE is the operation and AFTER is TRUE if the data in memory after the
12347 operation should be returned and FALSE if the data before the operation
12348 should be returned. Returns FALSE if the operation isn't supported by the
12349 architecture. */
12351 bool
12352 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12354 if (!TARGET_LSE)
12355 return false;
12357 switch (code)
12359 case SET:
12360 case AND:
12361 case IOR:
12362 case XOR:
12363 case MINUS:
12364 case PLUS:
12365 return true;
12366 default:
12367 return false;
12371 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12372 sequence implementing an atomic operation. */
12374 static void
12375 aarch64_emit_post_barrier (enum memmodel model)
12377 const enum memmodel base_model = memmodel_base (model);
12379 if (is_mm_sync (model)
12380 && (base_model == MEMMODEL_ACQUIRE
12381 || base_model == MEMMODEL_ACQ_REL
12382 || base_model == MEMMODEL_SEQ_CST))
12384 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12388 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12389 for the data in memory. EXPECTED is the value expected to be in memory.
12390 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12391 is the memory ordering to use. */
12393 void
12394 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12395 rtx expected, rtx desired,
12396 rtx model)
12398 rtx (*gen) (rtx, rtx, rtx, rtx);
12399 machine_mode mode;
12401 mode = GET_MODE (mem);
12403 switch (mode)
12405 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12406 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12407 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12408 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12409 default:
12410 gcc_unreachable ();
12413 /* Move the expected value into the CAS destination register. */
12414 emit_insn (gen_rtx_SET (rval, expected));
12416 /* Emit the CAS. */
12417 emit_insn (gen (rval, mem, desired, model));
12419 /* Compare the expected value with the value loaded by the CAS, to establish
12420 whether the swap was made. */
12421 aarch64_gen_compare_reg (EQ, rval, expected);
12424 /* Split a compare and swap pattern. */
12426 void
12427 aarch64_split_compare_and_swap (rtx operands[])
12429 rtx rval, mem, oldval, newval, scratch;
12430 machine_mode mode;
12431 bool is_weak;
12432 rtx_code_label *label1, *label2;
12433 rtx x, cond;
12434 enum memmodel model;
12435 rtx model_rtx;
12437 rval = operands[0];
12438 mem = operands[1];
12439 oldval = operands[2];
12440 newval = operands[3];
12441 is_weak = (operands[4] != const0_rtx);
12442 model_rtx = operands[5];
12443 scratch = operands[7];
12444 mode = GET_MODE (mem);
12445 model = memmodel_from_int (INTVAL (model_rtx));
12447 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12448 loop:
12449 .label1:
12450 LD[A]XR rval, [mem]
12451 CBNZ rval, .label2
12452 ST[L]XR scratch, newval, [mem]
12453 CBNZ scratch, .label1
12454 .label2:
12455 CMP rval, 0. */
12456 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12458 label1 = NULL;
12459 if (!is_weak)
12461 label1 = gen_label_rtx ();
12462 emit_label (label1);
12464 label2 = gen_label_rtx ();
12466 /* The initial load can be relaxed for a __sync operation since a final
12467 barrier will be emitted to stop code hoisting. */
12468 if (is_mm_sync (model))
12469 aarch64_emit_load_exclusive (mode, rval, mem,
12470 GEN_INT (MEMMODEL_RELAXED));
12471 else
12472 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12474 if (strong_zero_p)
12476 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12477 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12478 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12479 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12481 else
12483 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12484 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12485 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12486 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12487 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12490 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12492 if (!is_weak)
12494 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12495 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12496 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12497 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12499 else
12501 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12502 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12503 emit_insn (gen_rtx_SET (cond, x));
12506 emit_label (label2);
12507 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
12508 to set the condition flags. If this is not used it will be removed by
12509 later passes. */
12510 if (strong_zero_p)
12512 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12513 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12514 emit_insn (gen_rtx_SET (cond, x));
12516 /* Emit any final barrier needed for a __sync operation. */
12517 if (is_mm_sync (model))
12518 aarch64_emit_post_barrier (model);
12521 /* Emit a BIC instruction. */
12523 static void
12524 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12526 rtx shift_rtx = GEN_INT (shift);
12527 rtx (*gen) (rtx, rtx, rtx, rtx);
12529 switch (mode)
12531 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12532 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12533 default:
12534 gcc_unreachable ();
12537 emit_insn (gen (dst, s2, shift_rtx, s1));
12540 /* Emit an atomic swap. */
12542 static void
12543 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12544 rtx mem, rtx model)
12546 rtx (*gen) (rtx, rtx, rtx, rtx);
12548 switch (mode)
12550 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12551 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12552 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12553 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12554 default:
12555 gcc_unreachable ();
12558 emit_insn (gen (dst, mem, value, model));
12561 /* Operations supported by aarch64_emit_atomic_load_op. */
12563 enum aarch64_atomic_load_op_code
12565 AARCH64_LDOP_PLUS, /* A + B */
12566 AARCH64_LDOP_XOR, /* A ^ B */
12567 AARCH64_LDOP_OR, /* A | B */
12568 AARCH64_LDOP_BIC /* A & ~B */
12571 /* Emit an atomic load-operate. */
12573 static void
12574 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12575 machine_mode mode, rtx dst, rtx src,
12576 rtx mem, rtx model)
12578 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12579 const aarch64_atomic_load_op_fn plus[] =
12581 gen_aarch64_atomic_loadaddqi,
12582 gen_aarch64_atomic_loadaddhi,
12583 gen_aarch64_atomic_loadaddsi,
12584 gen_aarch64_atomic_loadadddi
12586 const aarch64_atomic_load_op_fn eor[] =
12588 gen_aarch64_atomic_loadeorqi,
12589 gen_aarch64_atomic_loadeorhi,
12590 gen_aarch64_atomic_loadeorsi,
12591 gen_aarch64_atomic_loadeordi
12593 const aarch64_atomic_load_op_fn ior[] =
12595 gen_aarch64_atomic_loadsetqi,
12596 gen_aarch64_atomic_loadsethi,
12597 gen_aarch64_atomic_loadsetsi,
12598 gen_aarch64_atomic_loadsetdi
12600 const aarch64_atomic_load_op_fn bic[] =
12602 gen_aarch64_atomic_loadclrqi,
12603 gen_aarch64_atomic_loadclrhi,
12604 gen_aarch64_atomic_loadclrsi,
12605 gen_aarch64_atomic_loadclrdi
12607 aarch64_atomic_load_op_fn gen;
12608 int idx = 0;
12610 switch (mode)
12612 case E_QImode: idx = 0; break;
12613 case E_HImode: idx = 1; break;
12614 case E_SImode: idx = 2; break;
12615 case E_DImode: idx = 3; break;
12616 default:
12617 gcc_unreachable ();
12620 switch (code)
12622 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12623 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12624 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12625 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12626 default:
12627 gcc_unreachable ();
12630 emit_insn (gen (dst, mem, src, model));
12633 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12634 location to store the data read from memory. OUT_RESULT is the location to
12635 store the result of the operation. MEM is the memory location to read and
12636 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12637 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12638 be NULL. */
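/* For example, an atomic AND is handled below by complementing VALUE and
   issuing the LSE bit-clear form (LDCLR, which computes A & ~B), so memory
   ends up holding old & ~(~value) == old & value; MINUS is likewise
   rewritten as a PLUS of the negated value.  */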
12640 void
12641 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12642 rtx mem, rtx value, rtx model_rtx)
12644 machine_mode mode = GET_MODE (mem);
12645 machine_mode wmode = (mode == DImode ? DImode : SImode);
12646 const bool short_mode = (mode < SImode);
12647 aarch64_atomic_load_op_code ldop_code;
12648 rtx src;
12649 rtx x;
12651 if (out_data)
12652 out_data = gen_lowpart (mode, out_data);
12654 if (out_result)
12655 out_result = gen_lowpart (mode, out_result);
12657 /* Make sure the value is in a register, putting it into a destination
12658 register if it needs to be manipulated. */
12659 if (!register_operand (value, mode)
12660 || code == AND || code == MINUS)
12662 src = out_result ? out_result : out_data;
12663 emit_move_insn (src, gen_lowpart (mode, value));
12665 else
12666 src = value;
12667 gcc_assert (register_operand (src, mode));
12669 /* Preprocess the data for the operation as necessary. If the operation is
12670 a SET then emit a swap instruction and finish. */
12671 switch (code)
12673 case SET:
12674 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12675 return;
12677 case MINUS:
12678 /* Negate the value and treat it as a PLUS. */
12680 rtx neg_src;
12682 /* Resize the value if necessary. */
12683 if (short_mode)
12684 src = gen_lowpart (wmode, src);
12686 neg_src = gen_rtx_NEG (wmode, src);
12687 emit_insn (gen_rtx_SET (src, neg_src));
12689 if (short_mode)
12690 src = gen_lowpart (mode, src);
12692 /* Fall-through. */
12693 case PLUS:
12694 ldop_code = AARCH64_LDOP_PLUS;
12695 break;
12697 case IOR:
12698 ldop_code = AARCH64_LDOP_OR;
12699 break;
12701 case XOR:
12702 ldop_code = AARCH64_LDOP_XOR;
12703 break;
12705 case AND:
12707 rtx not_src;
12709 /* Resize the value if necessary. */
12710 if (short_mode)
12711 src = gen_lowpart (wmode, src);
12713 not_src = gen_rtx_NOT (wmode, src);
12714 emit_insn (gen_rtx_SET (src, not_src));
12716 if (short_mode)
12717 src = gen_lowpart (mode, src);
12719 ldop_code = AARCH64_LDOP_BIC;
12720 break;
12722 default:
12723 /* The operation can't be done with atomic instructions. */
12724 gcc_unreachable ();
12727 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12729 /* If necessary, calculate the data in memory after the update by redoing the
12730 operation from values in registers. */
12731 if (!out_result)
12732 return;
12734 if (short_mode)
12736 src = gen_lowpart (wmode, src);
12737 out_data = gen_lowpart (wmode, out_data);
12738 out_result = gen_lowpart (wmode, out_result);
12741 x = NULL_RTX;
12743 switch (code)
12745 case MINUS:
12746 case PLUS:
12747 x = gen_rtx_PLUS (wmode, out_data, src);
12748 break;
12749 case IOR:
12750 x = gen_rtx_IOR (wmode, out_data, src);
12751 break;
12752 case XOR:
12753 x = gen_rtx_XOR (wmode, out_data, src);
12754 break;
12755 case AND:
12756 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12757 return;
12758 default:
12759 gcc_unreachable ();
12762 emit_set_insn (out_result, x);
12764 return;
12767 /* Split an atomic operation. */
12769 void
12770 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12771 rtx value, rtx model_rtx, rtx cond)
12773 machine_mode mode = GET_MODE (mem);
12774 machine_mode wmode = (mode == DImode ? DImode : SImode);
12775 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12776 const bool is_sync = is_mm_sync (model);
12777 rtx_code_label *label;
12778 rtx x;
12780 /* Split the atomic operation into a sequence. */
12781 label = gen_label_rtx ();
12782 emit_label (label);
12784 if (new_out)
12785 new_out = gen_lowpart (wmode, new_out);
12786 if (old_out)
12787 old_out = gen_lowpart (wmode, old_out);
12788 else
12789 old_out = new_out;
12790 value = simplify_gen_subreg (wmode, value, mode, 0);
12792 /* The initial load can be relaxed for a __sync operation since a final
12793 barrier will be emitted to stop code hoisting. */
12794 if (is_sync)
12795 aarch64_emit_load_exclusive (mode, old_out, mem,
12796 GEN_INT (MEMMODEL_RELAXED));
12797 else
12798 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12800 switch (code)
12802 case SET:
12803 new_out = value;
12804 break;
12806 case NOT:
12807 x = gen_rtx_AND (wmode, old_out, value);
12808 emit_insn (gen_rtx_SET (new_out, x));
12809 x = gen_rtx_NOT (wmode, new_out);
12810 emit_insn (gen_rtx_SET (new_out, x));
12811 break;
12813 case MINUS:
12814 if (CONST_INT_P (value))
12816 value = GEN_INT (-INTVAL (value));
12817 code = PLUS;
12819 /* Fall through. */
12821 default:
12822 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12823 emit_insn (gen_rtx_SET (new_out, x));
12824 break;
12827 aarch64_emit_store_exclusive (mode, cond, mem,
12828 gen_lowpart (mode, new_out), model_rtx);
12830 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12831 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12832 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12833 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12835 /* Emit any final barrier needed for a __sync operation. */
12836 if (is_sync)
12837 aarch64_emit_post_barrier (model);
12840 static void
12841 aarch64_init_libfuncs (void)
12843 /* Half-precision float operations. The compiler handles all operations
12844 with NULL libfuncs by converting to SFmode. */
12846 /* Conversions. */
12847 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12848 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12850 /* Arithmetic. */
12851 set_optab_libfunc (add_optab, HFmode, NULL);
12852 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12853 set_optab_libfunc (smul_optab, HFmode, NULL);
12854 set_optab_libfunc (neg_optab, HFmode, NULL);
12855 set_optab_libfunc (sub_optab, HFmode, NULL);
12857 /* Comparisons. */
12858 set_optab_libfunc (eq_optab, HFmode, NULL);
12859 set_optab_libfunc (ne_optab, HFmode, NULL);
12860 set_optab_libfunc (lt_optab, HFmode, NULL);
12861 set_optab_libfunc (le_optab, HFmode, NULL);
12862 set_optab_libfunc (ge_optab, HFmode, NULL);
12863 set_optab_libfunc (gt_optab, HFmode, NULL);
12864 set_optab_libfunc (unord_optab, HFmode, NULL);
12867 /* Target hook for c_mode_for_suffix. */
12868 static machine_mode
12869 aarch64_c_mode_for_suffix (char suffix)
12871 if (suffix == 'q')
12872 return TFmode;
12874 return VOIDmode;
12877 /* We can only represent floating point constants which will fit in
12878 "quarter-precision" values. These values are characterised by
12879 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by
12882 (-1)^s * (n/16) * 2^r
12884 Where:
12885 's' is the sign bit.
12886 'n' is an integer in the range 16 <= n <= 31.
12887 'r' is an integer in the range -3 <= r <= 4. */
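/* For example, 0.25 = (-1)^0 * (16/16) * 2^-2 and 1.5 = (-1)^0 * (24/16) * 2^0
   are representable, whereas a value such as 0.1 has no such encoding.  */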
12889 /* Return true iff X can be represented by a quarter-precision
12890 floating point immediate operand. Note, we cannot represent 0.0. */
12891 bool
12892 aarch64_float_const_representable_p (rtx x)
12894 /* This represents our current view of how many bits
12895 make up the mantissa. */
12896 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12897 int exponent;
12898 unsigned HOST_WIDE_INT mantissa, mask;
12899 REAL_VALUE_TYPE r, m;
12900 bool fail;
12902 if (!CONST_DOUBLE_P (x))
12903 return false;
12905 /* We don't support HFmode constants yet. */
12906 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12907 return false;
12909 r = *CONST_DOUBLE_REAL_VALUE (x);
12911 /* We cannot represent infinities, NaNs or +/-zero. We won't
12912 know if we have +zero until we analyse the mantissa, but we
12913 can reject the other invalid values. */
12914 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12915 || REAL_VALUE_MINUS_ZERO (r))
12916 return false;
12918 /* Extract exponent. */
12919 r = real_value_abs (&r);
12920 exponent = REAL_EXP (&r);
12922 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12923 highest (sign) bit, with a fixed binary point at bit point_pos.
12924 m1 holds the low part of the mantissa, m2 the high part.
12925 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12926 bits for the mantissa, this can fail (low bits will be lost). */
12927 real_ldexp (&m, &r, point_pos - exponent);
12928 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12930 /* If the low part of the mantissa has bits set we cannot represent
12931 the value. */
12932 if (w.ulow () != 0)
12933 return false;
12934 /* We have rejected the lower HOST_WIDE_INT, so update our
12935 understanding of how many bits lie in the mantissa and
12936 look only at the high HOST_WIDE_INT. */
12937 mantissa = w.elt (1);
12938 point_pos -= HOST_BITS_PER_WIDE_INT;
12940 /* We can only represent values with a mantissa of the form 1.xxxx. */
12941 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12942 if ((mantissa & mask) != 0)
12943 return false;
12945 /* Having filtered unrepresentable values, we may now remove all
12946 but the highest 5 bits. */
12947 mantissa >>= point_pos - 5;
12949 /* We cannot represent the value 0.0, so reject it. This is handled
12950 elsewhere. */
12951 if (mantissa == 0)
12952 return false;
12954 /* Then, as bit 4 is always set, we can mask it off, leaving
12955 the mantissa in the range [0, 15]. */
12956 mantissa &= ~(1 << 4);
12957 gcc_assert (mantissa <= 15);
12959 /* GCC internally does not use IEEE754-like encoding (where normalized
12960 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
12961 Our mantissa values are shifted 4 places to the left relative to
12962 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12963 by 5 places to correct for GCC's representation. */
12964 exponent = 5 - exponent;
12966 return (exponent >= 0 && exponent <= 7);
12969 char*
12970 aarch64_output_simd_mov_immediate (rtx const_vector,
12971 machine_mode mode,
12972 unsigned width)
12974 bool is_valid;
12975 static char templ[40];
12976 const char *mnemonic;
12977 const char *shift_op;
12978 unsigned int lane_count = 0;
12979 char element_char;
12981 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12983 /* This will return true to show const_vector is legal for use as either
12984 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12985 also update INFO to show how the immediate should be generated. */
12986 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12987 gcc_assert (is_valid);
12989 element_char = sizetochar (info.element_width);
12990 lane_count = width / info.element_width;
12992 mode = GET_MODE_INNER (mode);
12993 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12995 gcc_assert (info.shift == 0 && ! info.mvn);
12996 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12997 move immediate path. */
12998 if (aarch64_float_const_zero_rtx_p (info.value))
12999 info.value = GEN_INT (0);
13000 else
13002 const unsigned int buf_size = 20;
13003 char float_buf[buf_size] = {'\0'};
13004 real_to_decimal_for_mode (float_buf,
13005 CONST_DOUBLE_REAL_VALUE (info.value),
13006 buf_size, buf_size, 1, mode);
13008 if (lane_count == 1)
13009 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13010 else
13011 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13012 lane_count, element_char, float_buf);
13013 return templ;
13017 mnemonic = info.mvn ? "mvni" : "movi";
13018 shift_op = info.msl ? "msl" : "lsl";
13020 gcc_assert (CONST_INT_P (info.value));
13021 if (lane_count == 1)
13022 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13023 mnemonic, UINTVAL (info.value));
13024 else if (info.shift)
13025 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13026 ", %s %d", mnemonic, lane_count, element_char,
13027 UINTVAL (info.value), shift_op, info.shift);
13028 else
13029 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13030 mnemonic, lane_count, element_char, UINTVAL (info.value));
13031 return templ;
13034 char*
13035 aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
13038 /* If a floating point number was passed and we desire to use it in an
13039 integer mode, do the conversion to integer. */
13040 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13042 unsigned HOST_WIDE_INT ival;
13043 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13044 gcc_unreachable ();
13045 immediate = gen_int_mode (ival, mode);
13048 machine_mode vmode;
13049 /* Use a 64-bit mode for everything except DI/DF mode, where we use
13050 a 128-bit vector mode. */
13051 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13053 gcc_assert (!VECTOR_MODE_P (mode));
13054 vmode = aarch64_simd_container_mode (mode, width);
13055 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13056 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13059 /* Split operands into moves from op[1] + op[2] into op[0]. */
13061 void
13062 aarch64_split_combinev16qi (rtx operands[3])
13064 unsigned int dest = REGNO (operands[0]);
13065 unsigned int src1 = REGNO (operands[1]);
13066 unsigned int src2 = REGNO (operands[2]);
13067 machine_mode halfmode = GET_MODE (operands[1]);
13068 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13069 rtx destlo, desthi;
13071 gcc_assert (halfmode == V16QImode);
13073 if (src1 == dest && src2 == dest + halfregs)
13075 /* No-op move. Can't split to nothing; emit something. */
13076 emit_note (NOTE_INSN_DELETED);
13077 return;
13080 /* Preserve register attributes for variable tracking. */
13081 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13082 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13083 GET_MODE_SIZE (halfmode));
13085 /* Special case of reversed high/low parts. */
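/* The three EORs emitted for this case perform the classic XOR swap of the
   two source registers in place, so no scratch register is needed even
   though each source overlaps the opposite destination half.  */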
13086 if (reg_overlap_mentioned_p (operands[2], destlo)
13087 && reg_overlap_mentioned_p (operands[1], desthi))
13089 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13090 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13091 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13093 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13095 /* Try to avoid unnecessary moves if part of the result
13096 is in the right place already. */
13097 if (src1 != dest)
13098 emit_move_insn (destlo, operands[1]);
13099 if (src2 != dest + halfregs)
13100 emit_move_insn (desthi, operands[2]);
13102 else
13104 if (src2 != dest + halfregs)
13105 emit_move_insn (desthi, operands[2]);
13106 if (src1 != dest)
13107 emit_move_insn (destlo, operands[1]);
13111 /* vec_perm support. */
13113 #define MAX_VECT_LEN 16
13115 struct expand_vec_perm_d
13117 rtx target, op0, op1;
13118 unsigned char perm[MAX_VECT_LEN];
13119 machine_mode vmode;
13120 unsigned char nelt;
13121 bool one_vector_p;
13122 bool testing_p;
13125 /* Generate a variable permutation. */
13127 static void
13128 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13130 machine_mode vmode = GET_MODE (target);
13131 bool one_vector_p = rtx_equal_p (op0, op1);
13133 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13134 gcc_checking_assert (GET_MODE (op0) == vmode);
13135 gcc_checking_assert (GET_MODE (op1) == vmode);
13136 gcc_checking_assert (GET_MODE (sel) == vmode);
13137 gcc_checking_assert (TARGET_SIMD);
13139 if (one_vector_p)
13141 if (vmode == V8QImode)
13143 /* Expand the argument to a V16QI mode by duplicating it. */
13144 rtx pair = gen_reg_rtx (V16QImode);
13145 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13146 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13148 else
13150 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13153 else
13155 rtx pair;
13157 if (vmode == V8QImode)
13159 pair = gen_reg_rtx (V16QImode);
13160 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13161 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13163 else
13165 pair = gen_reg_rtx (OImode);
13166 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13167 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13172 void
13173 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13175 machine_mode vmode = GET_MODE (target);
13176 unsigned int nelt = GET_MODE_NUNITS (vmode);
13177 bool one_vector_p = rtx_equal_p (op0, op1);
13178 rtx mask;
13180 /* The TBL instruction does not use a modulo index, so we must take care
13181 of that ourselves. */
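/* E.g. for two V16QI inputs the mask is 31, so a selector value of 35 is
   reduced to 35 & 31 == 3 before it reaches TBL.  */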
13182 mask = aarch64_simd_gen_const_vector_dup (vmode,
13183 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13184 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13186 /* For big-endian, we also need to reverse the index within the vector
13187 (but not which vector). */
13188 if (BYTES_BIG_ENDIAN)
13190 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13191 if (!one_vector_p)
13192 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13193 sel = expand_simple_binop (vmode, XOR, sel, mask,
13194 NULL, 0, OPTAB_LIB_WIDEN);
13196 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13199 /* Recognize patterns suitable for the TRN instructions. */
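/* For V4SI, TRN1 selects elements {0, 4, 2, 6} of the two-vector
   concatenation and TRN2 selects {1, 5, 3, 7}; the loop below checks the
   permutation against exactly that pattern (modulo endian correction).  */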
13200 static bool
13201 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13203 unsigned int i, odd, mask, nelt = d->nelt;
13204 rtx out, in0, in1, x;
13205 rtx (*gen) (rtx, rtx, rtx);
13206 machine_mode vmode = d->vmode;
13208 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13209 return false;
13211 /* Note that these are little-endian tests.
13212 We correct for big-endian later. */
13213 if (d->perm[0] == 0)
13214 odd = 0;
13215 else if (d->perm[0] == 1)
13216 odd = 1;
13217 else
13218 return false;
13219 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13221 for (i = 0; i < nelt; i += 2)
13223 if (d->perm[i] != i + odd)
13224 return false;
13225 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13226 return false;
13229 /* Success! */
13230 if (d->testing_p)
13231 return true;
13233 in0 = d->op0;
13234 in1 = d->op1;
13235 if (BYTES_BIG_ENDIAN)
13237 x = in0, in0 = in1, in1 = x;
13238 odd = !odd;
13240 out = d->target;
13242 if (odd)
13244 switch (vmode)
13246 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13247 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13248 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13249 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13250 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13251 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13252 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13253 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13254 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13255 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13256 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13257 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13258 default:
13259 return false;
13262 else
13264 switch (vmode)
13266 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13267 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13268 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13269 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13270 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13271 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13272 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13273 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13274 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13275 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13276 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13277 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13278 default:
13279 return false;
13283 emit_insn (gen (out, in0, in1));
13284 return true;
13287 /* Recognize patterns suitable for the UZP instructions. */
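/* For V4SI, UZP1 selects the even elements {0, 2, 4, 6} of the two-vector
   concatenation and UZP2 the odd elements {1, 3, 5, 7}, matching the
   (i * 2 + odd) check below.  */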
13288 static bool
13289 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13291 unsigned int i, odd, mask, nelt = d->nelt;
13292 rtx out, in0, in1, x;
13293 rtx (*gen) (rtx, rtx, rtx);
13294 machine_mode vmode = d->vmode;
13296 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13297 return false;
13299 /* Note that these are little-endian tests.
13300 We correct for big-endian later. */
13301 if (d->perm[0] == 0)
13302 odd = 0;
13303 else if (d->perm[0] == 1)
13304 odd = 1;
13305 else
13306 return false;
13307 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13309 for (i = 0; i < nelt; i++)
13311 unsigned elt = (i * 2 + odd) & mask;
13312 if (d->perm[i] != elt)
13313 return false;
13316 /* Success! */
13317 if (d->testing_p)
13318 return true;
13320 in0 = d->op0;
13321 in1 = d->op1;
13322 if (BYTES_BIG_ENDIAN)
13324 x = in0, in0 = in1, in1 = x;
13325 odd = !odd;
13327 out = d->target;
13329 if (odd)
13331 switch (vmode)
13333 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13334 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13335 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13336 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13337 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13338 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13339 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13340 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13341 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13342 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13343 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13344 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13345 default:
13346 return false;
13349 else
13351 switch (vmode)
13353 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13354 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13355 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13356 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13357 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13358 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13359 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13360 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13361 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13362 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13363 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13364 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13365 default:
13366 return false;
13370 emit_insn (gen (out, in0, in1));
13371 return true;
13374 /* Recognize patterns suitable for the ZIP instructions. */
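/* For V4SI, ZIP1 interleaves the low halves of the two inputs, giving
   {0, 4, 1, 5}, while ZIP2 interleaves the high halves, giving
   {2, 6, 3, 7}; HIGH below distinguishes the two forms.  */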
13375 static bool
13376 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13378 unsigned int i, high, mask, nelt = d->nelt;
13379 rtx out, in0, in1, x;
13380 rtx (*gen) (rtx, rtx, rtx);
13381 machine_mode vmode = d->vmode;
13383 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13384 return false;
13386 /* Note that these are little-endian tests.
13387 We correct for big-endian later. */
13388 high = nelt / 2;
13389 if (d->perm[0] == high)
13390 /* Do Nothing. */
13392 else if (d->perm[0] == 0)
13393 high = 0;
13394 else
13395 return false;
13396 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13398 for (i = 0; i < nelt / 2; i++)
13400 unsigned elt = (i + high) & mask;
13401 if (d->perm[i * 2] != elt)
13402 return false;
13403 elt = (elt + nelt) & mask;
13404 if (d->perm[i * 2 + 1] != elt)
13405 return false;
13408 /* Success! */
13409 if (d->testing_p)
13410 return true;
13412 in0 = d->op0;
13413 in1 = d->op1;
13414 if (BYTES_BIG_ENDIAN)
13416 x = in0, in0 = in1, in1 = x;
13417 high = !high;
13419 out = d->target;
13421 if (high)
13423 switch (vmode)
13425 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13426 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13427 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13428 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13429 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13430 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13431 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13432 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13433 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13434 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13435 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13436 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13437 default:
13438 return false;
13441 else
13443 switch (vmode)
13445 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13446 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13447 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13448 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13449 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13450 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13451 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13452 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13453 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13454 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13455 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13456 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13457 default:
13458 return false;
13462 emit_insn (gen (out, in0, in1));
13463 return true;
13466 /* Recognize patterns for the EXT insn. */
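/* EXT extracts a contiguous run of elements starting at an immediate
   offset, e.g. for V8QI an offset of 3 yields indices {3, 4, ..., 10};
   the loop below checks that the permutation is such a run.  */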
13468 static bool
13469 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13471 unsigned int i, nelt = d->nelt;
13472 rtx (*gen) (rtx, rtx, rtx, rtx);
13473 rtx offset;
13475 unsigned int location = d->perm[0]; /* Always < nelt. */
13477 /* Check if the extracted indices are increasing by one. */
13478 for (i = 1; i < nelt; i++)
13480 unsigned int required = location + i;
13481 if (d->one_vector_p)
13483 /* We'll pass the same vector in twice, so allow indices to wrap. */
13484 required &= (nelt - 1);
13486 if (d->perm[i] != required)
13487 return false;
13490 switch (d->vmode)
13492 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13493 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13494 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13495 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13496 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13497 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13498 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13499 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13500 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13501 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13502 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13503 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13504 default:
13505 return false;
13508 /* Success! */
13509 if (d->testing_p)
13510 return true;
13512 /* The case where (location == 0) is a no-op for both big- and little-endian,
13513 and is removed by the mid-end at optimization levels -O1 and higher. */
13515 if (BYTES_BIG_ENDIAN && (location != 0))
13517 /* After setup, we want the high elements of the first vector (stored
13518 at the LSB end of the register), and the low elements of the second
13519 vector (stored at the MSB end of the register). So swap. */
13520 std::swap (d->op0, d->op1);
13521 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13522 location = nelt - location;
13525 offset = GEN_INT (location);
13526 emit_insn (gen (d->target, d->op0, d->op1, offset));
13527 return true;
13530 /* Recognize patterns for the REV insns. */
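/* DIFF encodes which REV variant applies: e.g. REV64 on V4SI reverses the
   two 32-bit elements inside each 64-bit chunk, giving the permutation
   {1, 0, 3, 2}, for which d->perm[0] == 1.  */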
13532 static bool
13533 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13535 unsigned int i, j, diff, nelt = d->nelt;
13536 rtx (*gen) (rtx, rtx);
13538 if (!d->one_vector_p)
13539 return false;
13541 diff = d->perm[0];
13542 switch (diff)
13544 case 7:
13545 switch (d->vmode)
13547 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13548 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13549 default:
13550 return false;
13552 break;
13553 case 3:
13554 switch (d->vmode)
13556 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13557 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13558 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13559 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13560 default:
13561 return false;
13563 break;
13564 case 1:
13565 switch (d->vmode)
13567 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13568 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13569 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13570 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13571 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13572 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13573 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13574 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13575 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13576 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13577 default:
13578 return false;
13580 break;
13581 default:
13582 return false;
13585 for (i = 0; i < nelt ; i += diff + 1)
13586 for (j = 0; j <= diff; j += 1)
13588 /* This is guaranteed to be true as the value of diff
13589 is 7, 3 or 1, and we should have enough elements in the
13590 queue to generate this. Getting a vector mask with a
13591 value of diff other than these values implies that
13592 something is wrong by the time we get here. */
13593 gcc_assert (i + j < nelt);
13594 if (d->perm[i + j] != i + diff - j)
13595 return false;
13598 /* Success! */
13599 if (d->testing_p)
13600 return true;
13602 emit_insn (gen (d->target, d->op0));
13603 return true;
13606 static bool
13607 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13609 rtx (*gen) (rtx, rtx, rtx);
13610 rtx out = d->target;
13611 rtx in0;
13612 machine_mode vmode = d->vmode;
13613 unsigned int i, elt, nelt = d->nelt;
13614 rtx lane;
13616 elt = d->perm[0];
13617 for (i = 1; i < nelt; i++)
13619 if (elt != d->perm[i])
13620 return false;
13623 /* The generic preparation in aarch64_expand_vec_perm_const_1
13624 swaps the operand order and the permute indices if it finds
13625 d->perm[0] to be in the second operand. Thus, we can always
13626 use d->op0 and need not do any extra arithmetic to get the
13627 correct lane number. */
13628 in0 = d->op0;
13629 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13631 switch (vmode)
13633 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13634 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13635 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13636 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13637 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13638 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13639 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13640 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13641 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13642 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13643 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13644 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13645 default:
13646 return false;
13649 emit_insn (gen (out, in0, lane));
13650 return true;
13653 static bool
13654 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13656 rtx rperm[MAX_VECT_LEN], sel;
13657 machine_mode vmode = d->vmode;
13658 unsigned int i, nelt = d->nelt;
13660 if (d->testing_p)
13661 return true;
13663 /* Generic code will try constant permutation twice: once with the
13664 original mode and again with the elements lowered to QImode.
13665 So wait and don't do the selector expansion ourselves. */
13666 if (vmode != V8QImode && vmode != V16QImode)
13667 return false;
13669 for (i = 0; i < nelt; ++i)
13671 int nunits = GET_MODE_NUNITS (vmode);
13673 /* If big-endian and two vectors we end up with a weird mixed-endian
13674 mode on NEON. Reverse the index within each word but not the word
13675 itself. */
13676 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13677 : d->perm[i]);
13679 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13680 sel = force_reg (vmode, sel);
13682 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13683 return true;
13686 static bool
13687 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13689 /* The pattern matching functions above are written to look for a small
13690 number to begin the sequence (0, 1, N/2). If we begin with an index
13691 from the second operand, we can swap the operands. */
13692 if (d->perm[0] >= d->nelt)
13694 unsigned i, nelt = d->nelt;
13696 gcc_assert (nelt == (nelt & -nelt));
13697 for (i = 0; i < nelt; ++i)
13698 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13700 std::swap (d->op0, d->op1);
13703 if (TARGET_SIMD)
13705 if (aarch64_evpc_rev (d))
13706 return true;
13707 else if (aarch64_evpc_ext (d))
13708 return true;
13709 else if (aarch64_evpc_dup (d))
13710 return true;
13711 else if (aarch64_evpc_zip (d))
13712 return true;
13713 else if (aarch64_evpc_uzp (d))
13714 return true;
13715 else if (aarch64_evpc_trn (d))
13716 return true;
13717 return aarch64_evpc_tbl (d);
13719 return false;
13722 /* Expand a vec_perm_const pattern. */
13724 bool
13725 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13727 struct expand_vec_perm_d d;
13728 int i, nelt, which;
13730 d.target = target;
13731 d.op0 = op0;
13732 d.op1 = op1;
13734 d.vmode = GET_MODE (target);
13735 gcc_assert (VECTOR_MODE_P (d.vmode));
13736 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13737 d.testing_p = false;
13739 for (i = which = 0; i < nelt; ++i)
13741 rtx e = XVECEXP (sel, 0, i);
13742 int ei = INTVAL (e) & (2 * nelt - 1);
13743 which |= (ei < nelt ? 1 : 2);
13744 d.perm[i] = ei;
13747 switch (which)
13749 default:
13750 gcc_unreachable ();
13752 case 3:
13753 d.one_vector_p = false;
13754 if (!rtx_equal_p (op0, op1))
13755 break;
13757 /* The elements of PERM do not suggest that only the first operand
13758 is used, but both operands are identical. Allow easier matching
13759 of the permutation by folding the permutation into the single
13760 input vector. */
13761 /* Fall Through. */
13762 case 2:
13763 for (i = 0; i < nelt; ++i)
13764 d.perm[i] &= nelt - 1;
13765 d.op0 = op1;
13766 d.one_vector_p = true;
13767 break;
13769 case 1:
13770 d.op1 = op0;
13771 d.one_vector_p = true;
13772 break;
13775 return aarch64_expand_vec_perm_const_1 (&d);
13778 static bool
13779 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13780 const unsigned char *sel)
13782 struct expand_vec_perm_d d;
13783 unsigned int i, nelt, which;
13784 bool ret;
13786 d.vmode = vmode;
13787 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13788 d.testing_p = true;
13789 memcpy (d.perm, sel, nelt);
13791 /* Calculate whether all elements are in one vector. */
13792 for (i = which = 0; i < nelt; ++i)
13794 unsigned char e = d.perm[i];
13795 gcc_assert (e < 2 * nelt);
13796 which |= (e < nelt ? 1 : 2);
13799 /* If all elements are from the second vector, reindex as if from the
13800 first vector. */
13801 if (which == 2)
13802 for (i = 0; i < nelt; ++i)
13803 d.perm[i] -= nelt;
13805 /* Check whether the mask can be applied to a single vector. */
13806 d.one_vector_p = (which != 3);
13808 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13809 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13810 if (!d.one_vector_p)
13811 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13813 start_sequence ();
13814 ret = aarch64_expand_vec_perm_const_1 (&d);
13815 end_sequence ();
13817 return ret;
13821 aarch64_reverse_mask (machine_mode mode)
13823 /* We have to reverse each vector because we don't have
13824 a permuted load that can reverse-load according to ABI rules. */
13825 rtx mask;
13826 rtvec v = rtvec_alloc (16);
13827 int i, j;
13828 int nunits = GET_MODE_NUNITS (mode);
13829 int usize = GET_MODE_UNIT_SIZE (mode);
13831 gcc_assert (BYTES_BIG_ENDIAN);
13832 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13834 for (i = 0; i < nunits; i++)
13835 for (j = 0; j < usize; j++)
13836 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13837 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13838 return force_reg (V16QImode, mask);
13841 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13842 However, due to issues with register allocation it is preferable to avoid
13843 tying integer scalar and FP scalar modes. Executing integer operations
13844 in general registers is better than treating them as scalar vector
13845 operations. This reduces latency and avoids redundant int<->FP moves.
13846 So tie modes if they are either the same class, or vector modes with
13847 other vector modes, vector structs or any scalar mode.
13850 bool
13851 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13853 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13854 return true;
13856 /* We specifically want to allow elements of "structure" modes to
13857 be tieable to the structure. This more general condition allows
13858 other rarer situations too. */
13859 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13860 return true;
13862 /* Also allow any scalar modes with vectors. */
13863 if (aarch64_vector_mode_supported_p (mode1)
13864 || aarch64_vector_mode_supported_p (mode2))
13865 return true;
13867 return false;
13870 /* Return a new RTX holding the result of moving POINTER forward by
13871 AMOUNT bytes. */
13873 static rtx
13874 aarch64_move_pointer (rtx pointer, int amount)
13876 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13878 return adjust_automodify_address (pointer, GET_MODE (pointer),
13879 next, amount);
13882 /* Return a new RTX holding the result of moving POINTER forward by the
13883 size of the mode it points to. */
13885 static rtx
13886 aarch64_progress_pointer (rtx pointer)
13888 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13890 return aarch64_move_pointer (pointer, amount);
13893 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13894 MODE bytes. */
13896 static void
13897 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13898 machine_mode mode)
13900 rtx reg = gen_reg_rtx (mode);
13902 /* "Cast" the pointers to the correct mode. */
13903 *src = adjust_address (*src, mode, 0);
13904 *dst = adjust_address (*dst, mode, 0);
13905 /* Emit the memcpy. */
13906 emit_move_insn (reg, *src);
13907 emit_move_insn (*dst, reg);
13908 /* Move the pointers forward. */
13909 *src = aarch64_progress_pointer (*src);
13910 *dst = aarch64_progress_pointer (*dst);
13913 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13914 we succeed, otherwise return false. */
13916 bool
13917 aarch64_expand_movmem (rtx *operands)
13919 unsigned int n;
13920 rtx dst = operands[0];
13921 rtx src = operands[1];
13922 rtx base;
13923 bool speed_p = !optimize_function_for_size_p (cfun);
13925 /* When optimizing for size, give a better estimate of the length of a
13926 memcpy call, but use the default otherwise. */
13927 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13929 /* We can't do anything smart if the amount to copy is not constant. */
13930 if (!CONST_INT_P (operands[2]))
13931 return false;
13933 n = UINTVAL (operands[2]);
13935 /* Try to keep the number of instructions low. For cases below 16 bytes we
13936 need to make at most two moves. For cases above 16 bytes it will be one
13937 move for each 16 byte chunk, then at most two additional moves. */
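/* For instance, for a 15-byte copy the estimate below is 15/16 + 2 == 2
moves, well under the speed limit of 15/2 == 7 instructions, so the copy
is expanded inline.  */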
13938 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13939 return false;
13941 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13942 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13944 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13945 src = adjust_automodify_address (src, VOIDmode, base, 0);
13947 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13948 1-byte chunk. */
13949 if (n < 4)
13951 if (n >= 2)
13953 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13954 n -= 2;
13957 if (n == 1)
13958 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13960 return true;
13963 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13964 4-byte chunk, partially overlapping with the previously copied chunk. */
13965 if (n < 8)
13967 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13968 n -= 4;
13969 if (n > 0)
13971 int move = n - 4;
13973 src = aarch64_move_pointer (src, move);
13974 dst = aarch64_move_pointer (dst, move);
13975 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13977 return true;
13980 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13981 them, then (if applicable) an 8-byte chunk. */
13982 while (n >= 8)
13984 if (n / 16)
13986 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13987 n -= 16;
13989 else
13991 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13992 n -= 8;
13996 /* Finish the final bytes of the copy. We can always do this in one
13997 instruction. We either copy the exact amount we need, or partially
13998 overlap with the previous chunk we copied and copy 8 bytes. */
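/* For example, with 7 bytes left we move both pointers back by one byte
and copy 8 bytes, overlapping the previously copied chunk by one byte.  */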
13999 if (n == 0)
14000 return true;
14001 else if (n == 1)
14002 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14003 else if (n == 2)
14004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14005 else if (n == 4)
14006 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14007 else
14009 if (n == 3)
14011 src = aarch64_move_pointer (src, -1);
14012 dst = aarch64_move_pointer (dst, -1);
14013 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14015 else
14017 int move = n - 8;
14019 src = aarch64_move_pointer (src, move);
14020 dst = aarch64_move_pointer (dst, move);
14021 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14025 return true;
14028 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14029 SImode stores. Handle the case when the constant has identical
14030 bottom and top halves. This is beneficial when the two stores can be
14031 merged into an STP and we avoid synthesising potentially expensive
14032 immediates twice. Return true if such a split is possible. */
14034 bool
14035 aarch64_split_dimode_const_store (rtx dst, rtx src)
14037 rtx lo = gen_lowpart (SImode, src);
14038 rtx hi = gen_highpart_mode (SImode, DImode, src);
14040 bool size_p = optimize_function_for_size_p (cfun);
14042 if (!rtx_equal_p (lo, hi))
14043 return false;
14045 unsigned int orig_cost
14046 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14047 unsigned int lo_cost
14048 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14050 /* We want to transform:
14051 MOV x1, 49370
14052 MOVK x1, 0x140, lsl 16
14053 MOVK x1, 0xc0da, lsl 32
14054 MOVK x1, 0x140, lsl 48
14055 STR x1, [x0]
14056 into:
14057 MOV w1, 49370
14058 MOVK w1, 0x140, lsl 16
14059 STP w1, w1, [x0]
14060 So we want to perform this only when we save two instructions
14061 or more. When optimizing for size, however, accept any code size
14062 savings we can. */
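/* In the example above, aarch64_internal_mov_immediate should report four
instructions for the full DImode constant and two for its SImode low half
(orig_cost == 4, lo_cost == 2), so the check below (4 > 2 + 1) allows the
split even when optimizing for speed.  */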
14063 if (size_p && orig_cost <= lo_cost)
14064 return false;
14066 if (!size_p
14067 && (orig_cost <= lo_cost + 1))
14068 return false;
14070 rtx mem_lo = adjust_address (dst, SImode, 0);
14071 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14072 return false;
14074 rtx tmp_reg = gen_reg_rtx (SImode);
14075 aarch64_expand_mov_immediate (tmp_reg, lo);
14076 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14077 /* Don't emit an explicit store pair as this may not always be profitable.
14078 Let the sched-fusion logic decide whether to merge them. */
14079 emit_move_insn (mem_lo, tmp_reg);
14080 emit_move_insn (mem_hi, tmp_reg);
14082 return true;
14085 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14087 static unsigned HOST_WIDE_INT
14088 aarch64_asan_shadow_offset (void)
14090 return (HOST_WIDE_INT_1 << 36);
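/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */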
14093 static bool
14094 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14095 unsigned int align,
14096 enum by_pieces_operation op,
14097 bool speed_p)
14099 /* STORE_BY_PIECES can be used when copying a constant string, but
14100 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14101 For now we always fail this and let the move_by_pieces code copy
14102 the string from read-only memory. */
14103 if (op == STORE_BY_PIECES)
14104 return false;
14106 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
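/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
conditional-compare chain: the operand setup is emitted into *PREP_SEQ and
the compare instruction into *GEN_SEQ.  Return the comparison of the CC
register against zero, or NULL_RTX if the operand mode is not handled.  */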
14109 static rtx
14110 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14111 int code, tree treeop0, tree treeop1)
14113 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14114 rtx op0, op1;
14115 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14116 insn_code icode;
14117 struct expand_operand ops[4];
14119 start_sequence ();
14120 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14122 op_mode = GET_MODE (op0);
14123 if (op_mode == VOIDmode)
14124 op_mode = GET_MODE (op1);
14126 switch (op_mode)
14128 case E_QImode:
14129 case E_HImode:
14130 case E_SImode:
14131 cmp_mode = SImode;
14132 icode = CODE_FOR_cmpsi;
14133 break;
14135 case E_DImode:
14136 cmp_mode = DImode;
14137 icode = CODE_FOR_cmpdi;
14138 break;
14140 case E_SFmode:
14141 cmp_mode = SFmode;
14142 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14143 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14144 break;
14146 case E_DFmode:
14147 cmp_mode = DFmode;
14148 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14149 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14150 break;
14152 default:
14153 end_sequence ();
14154 return NULL_RTX;
14157 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14158 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14159 if (!op0 || !op1)
14161 end_sequence ();
14162 return NULL_RTX;
14164 *prep_seq = get_insns ();
14165 end_sequence ();
14167 create_fixed_operand (&ops[0], op0);
14168 create_fixed_operand (&ops[1], op1);
14170 start_sequence ();
14171 if (!maybe_expand_insn (icode, 2, ops))
14173 end_sequence ();
14174 return NULL_RTX;
14176 *gen_seq = get_insns ();
14177 end_sequence ();
14179 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14180 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
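/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent conditional compare
(CCMP/FCCMP) of the chain, combining it with PREV, the result of the
previous comparison, under BIT_CODE.  Return the new comparison against
the CC register, or NULL_RTX if the operand mode is not handled.  */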
14183 static rtx
14184 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14185 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14187 rtx op0, op1, target;
14188 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14189 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14190 insn_code icode;
14191 struct expand_operand ops[6];
14192 int aarch64_cond;
14194 push_to_sequence (*prep_seq);
14195 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14197 op_mode = GET_MODE (op0);
14198 if (op_mode == VOIDmode)
14199 op_mode = GET_MODE (op1);
14201 switch (op_mode)
14203 case E_QImode:
14204 case E_HImode:
14205 case E_SImode:
14206 cmp_mode = SImode;
14207 icode = CODE_FOR_ccmpsi;
14208 break;
14210 case E_DImode:
14211 cmp_mode = DImode;
14212 icode = CODE_FOR_ccmpdi;
14213 break;
14215 case E_SFmode:
14216 cmp_mode = SFmode;
14217 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14218 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14219 break;
14221 case E_DFmode:
14222 cmp_mode = DFmode;
14223 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14224 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14225 break;
14227 default:
14228 end_sequence ();
14229 return NULL_RTX;
14232 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14233 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14234 if (!op0 || !op1)
14236 end_sequence ();
14237 return NULL_RTX;
14239 *prep_seq = get_insns ();
14240 end_sequence ();
14242 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14243 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14245 if (bit_code != AND)
14247 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14248 GET_MODE (XEXP (prev, 0))),
14249 VOIDmode, XEXP (prev, 0), const0_rtx);
14250 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14253 create_fixed_operand (&ops[0], XEXP (prev, 0));
14254 create_fixed_operand (&ops[1], target);
14255 create_fixed_operand (&ops[2], op0);
14256 create_fixed_operand (&ops[3], op1);
14257 create_fixed_operand (&ops[4], prev);
14258 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14260 push_to_sequence (*gen_seq);
14261 if (!maybe_expand_insn (icode, 6, ops))
14263 end_sequence ();
14264 return NULL_RTX;
14267 *gen_seq = get_insns ();
14268 end_sequence ();
14270 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14273 #undef TARGET_GEN_CCMP_FIRST
14274 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14276 #undef TARGET_GEN_CCMP_NEXT
14277 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14279 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14280 instruction fusion of some sort. */
14282 static bool
14283 aarch64_macro_fusion_p (void)
14285 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14289 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14290 should be kept together during scheduling. */
14292 static bool
14293 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14295 rtx set_dest;
14296 rtx prev_set = single_set (prev);
14297 rtx curr_set = single_set (curr);
14298 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
14299 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14301 if (!aarch64_macro_fusion_p ())
14302 return false;
14304 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14306 /* We are trying to match:
14307 prev (mov) == (set (reg r0) (const_int imm16))
14308 curr (movk) == (set (zero_extract (reg r0)
14309 (const_int 16)
14310 (const_int 16))
14311 (const_int imm16_1)) */
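/* In assembly this corresponds to a pair such as:
mov x1, 49370
movk x1, 0x140, lsl 16
i.e. the first two instructions of an immediate-building sequence.  */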
14313 set_dest = SET_DEST (curr_set);
14315 if (GET_CODE (set_dest) == ZERO_EXTRACT
14316 && CONST_INT_P (SET_SRC (curr_set))
14317 && CONST_INT_P (SET_SRC (prev_set))
14318 && CONST_INT_P (XEXP (set_dest, 2))
14319 && INTVAL (XEXP (set_dest, 2)) == 16
14320 && REG_P (XEXP (set_dest, 0))
14321 && REG_P (SET_DEST (prev_set))
14322 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14324 return true;
14328 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14331 /* We're trying to match:
14332 prev (adrp) == (set (reg r1)
14333 (high (symbol_ref ("SYM"))))
14334 curr (add) == (set (reg r0)
14335 (lo_sum (reg r1)
14336 (symbol_ref ("SYM"))))
14337 Note that r0 need not necessarily be the same as r1, especially
14338 during pre-regalloc scheduling. */
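/* In assembly this is roughly:
adrp x1, sym
add x0, x1, :lo12:sym  */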
14340 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14341 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14343 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14344 && REG_P (XEXP (SET_SRC (curr_set), 0))
14345 && REGNO (XEXP (SET_SRC (curr_set), 0))
14346 == REGNO (SET_DEST (prev_set))
14347 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14348 XEXP (SET_SRC (curr_set), 1)))
14349 return true;
14353 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14356 /* We're trying to match:
14357 prev (movk) == (set (zero_extract (reg r0)
14358 (const_int 16)
14359 (const_int 32))
14360 (const_int imm16_1))
14361 curr (movk) == (set (zero_extract (reg r0)
14362 (const_int 16)
14363 (const_int 48))
14364 (const_int imm16_2)) */
14366 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14367 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14368 && REG_P (XEXP (SET_DEST (prev_set), 0))
14369 && REG_P (XEXP (SET_DEST (curr_set), 0))
14370 && REGNO (XEXP (SET_DEST (prev_set), 0))
14371 == REGNO (XEXP (SET_DEST (curr_set), 0))
14372 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14373 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14374 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14375 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14376 && CONST_INT_P (SET_SRC (prev_set))
14377 && CONST_INT_P (SET_SRC (curr_set)))
14378 return true;
14381 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14383 /* We're trying to match:
14384 prev (adrp) == (set (reg r0)
14385 (high (symbol_ref ("SYM"))))
14386 curr (ldr) == (set (reg r1)
14387 (mem (lo_sum (reg r0)
14388 (symbol_ref ("SYM")))))
14390 curr (ldr) == (set (reg r1)
14391 (zero_extend (mem
14392 (lo_sum (reg r0)
14393 (symbol_ref ("SYM")))))) */
14394 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14395 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14397 rtx curr_src = SET_SRC (curr_set);
14399 if (GET_CODE (curr_src) == ZERO_EXTEND)
14400 curr_src = XEXP (curr_src, 0);
14402 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14403 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14404 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14405 == REGNO (SET_DEST (prev_set))
14406 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14407 XEXP (SET_SRC (prev_set), 0)))
14408 return true;
14412 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14413 && aarch_crypto_can_dual_issue (prev, curr))
14414 return true;
14416 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14417 && any_condjump_p (curr))
14419 enum attr_type prev_type = get_attr_type (prev);
14421 unsigned int condreg1, condreg2;
14422 rtx cc_reg_1;
14423 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14424 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14426 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14427 && prev
14428 && modified_in_p (cc_reg_1, prev))
14430 /* FIXME: this misses some instructions that are considered simple
14431 arithmetic for ThunderX. Simple shifts are missed here. */
14432 if (prev_type == TYPE_ALUS_SREG
14433 || prev_type == TYPE_ALUS_IMM
14434 || prev_type == TYPE_LOGICS_REG
14435 || prev_type == TYPE_LOGICS_IMM)
14436 return true;
14440 if (prev_set
14441 && curr_set
14442 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14443 && any_condjump_p (curr))
14445 /* We're trying to match:
14446 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14447 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14448 (const_int 0))
14449 (label_ref ("SYM"))
14450 (pc)) */
14451 if (SET_DEST (curr_set) == (pc_rtx)
14452 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14453 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14454 && REG_P (SET_DEST (prev_set))
14455 && REGNO (SET_DEST (prev_set))
14456 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14458 /* Fuse ALU operations followed by a conditional branch instruction. */
14459 switch (get_attr_type (prev))
14461 case TYPE_ALU_IMM:
14462 case TYPE_ALU_SREG:
14463 case TYPE_ADC_REG:
14464 case TYPE_ADC_IMM:
14465 case TYPE_ADCS_REG:
14466 case TYPE_ADCS_IMM:
14467 case TYPE_LOGIC_REG:
14468 case TYPE_LOGIC_IMM:
14469 case TYPE_CSEL:
14470 case TYPE_ADR:
14471 case TYPE_MOV_IMM:
14472 case TYPE_SHIFT_REG:
14473 case TYPE_SHIFT_IMM:
14474 case TYPE_BFM:
14475 case TYPE_RBIT:
14476 case TYPE_REV:
14477 case TYPE_EXTEND:
14478 return true;
14480 default:;
14485 return false;
14488 /* Return true iff the instruction fusion described by OP is enabled. */
14490 bool
14491 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14493 return (aarch64_tune_params.fusible_ops & op) != 0;
14496 /* If MEM is in the form of [base+offset], extract the two parts
14497 of the address and store them in BASE and OFFSET; otherwise return false
14498 after clearing BASE and OFFSET. */
14500 bool
14501 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14503 rtx addr;
14505 gcc_assert (MEM_P (mem));
14507 addr = XEXP (mem, 0);
14509 if (REG_P (addr))
14511 *base = addr;
14512 *offset = const0_rtx;
14513 return true;
14516 if (GET_CODE (addr) == PLUS
14517 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14519 *base = XEXP (addr, 0);
14520 *offset = XEXP (addr, 1);
14521 return true;
14524 *base = NULL_RTX;
14525 *offset = NULL_RTX;
14527 return false;
14530 /* Types for scheduling fusion. */
14531 enum sched_fusion_type
14533 SCHED_FUSION_NONE = 0,
14534 SCHED_FUSION_LD_SIGN_EXTEND,
14535 SCHED_FUSION_LD_ZERO_EXTEND,
14536 SCHED_FUSION_LD,
14537 SCHED_FUSION_ST,
14538 SCHED_FUSION_NUM
14541 /* If INSN is a load or store with an address in the form of [base+offset],
14542 extract the two parts into BASE and OFFSET. Return the scheduling
14543 fusion type of this INSN. */
14545 static enum sched_fusion_type
14546 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14548 rtx x, dest, src;
14549 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14551 gcc_assert (INSN_P (insn));
14552 x = PATTERN (insn);
14553 if (GET_CODE (x) != SET)
14554 return SCHED_FUSION_NONE;
14556 src = SET_SRC (x);
14557 dest = SET_DEST (x);
14559 machine_mode dest_mode = GET_MODE (dest);
14561 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14562 return SCHED_FUSION_NONE;
14564 if (GET_CODE (src) == SIGN_EXTEND)
14566 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14567 src = XEXP (src, 0);
14568 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14569 return SCHED_FUSION_NONE;
14571 else if (GET_CODE (src) == ZERO_EXTEND)
14573 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14574 src = XEXP (src, 0);
14575 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14576 return SCHED_FUSION_NONE;
14579 if (GET_CODE (src) == MEM && REG_P (dest))
14580 extract_base_offset_in_addr (src, base, offset);
14581 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14583 fusion = SCHED_FUSION_ST;
14584 extract_base_offset_in_addr (dest, base, offset);
14586 else
14587 return SCHED_FUSION_NONE;
14589 if (*base == NULL_RTX || *offset == NULL_RTX)
14590 fusion = SCHED_FUSION_NONE;
14592 return fusion;
14595 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14597 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14598 and PRI are only calculated for these instructions. For other instructions,
14599 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14600 types of instruction fusion can be added by returning different priorities.
14602 It's important that irrelevant instructions get the largest FUSION_PRI. */
14604 static void
14605 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14606 int *fusion_pri, int *pri)
14608 int tmp, off_val;
14609 rtx base, offset;
14610 enum sched_fusion_type fusion;
14612 gcc_assert (INSN_P (insn));
14614 tmp = max_pri - 1;
14615 fusion = fusion_load_store (insn, &base, &offset);
14616 if (fusion == SCHED_FUSION_NONE)
14618 *pri = tmp;
14619 *fusion_pri = tmp;
14620 return;
14623 /* Set FUSION_PRI according to fusion type and base register. */
14624 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
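/* Loads/stores with the same fusion type and base register therefore get
the same FUSION_PRI, so the scheduler keeps them together; PRI computed
below then orders them by increasing offset.  */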
14626 /* Calculate PRI. */
14627 tmp /= 2;
14629 /* An INSN with a smaller offset goes first. */
14630 off_val = (int)(INTVAL (offset));
14631 if (off_val >= 0)
14632 tmp -= (off_val & 0xfffff);
14633 else
14634 tmp += ((- off_val) & 0xfffff);
14636 *pri = tmp;
14637 return;
14640 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14641 Adjust priority of sha1h instructions so they are scheduled before
14642 other SHA1 instructions. */
14644 static int
14645 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14647 rtx x = PATTERN (insn);
14649 if (GET_CODE (x) == SET)
14651 x = SET_SRC (x);
14653 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14654 return priority + 10;
14657 return priority;
14660 /* Given OPERANDS of consecutive load/store, check if we can merge
14661 them into ldp/stp. LOAD is true if they are load instructions.
14662 MODE is the mode of memory operands. */
14664 bool
14665 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14666 machine_mode mode)
14668 HOST_WIDE_INT offval_1, offval_2, msize;
14669 enum reg_class rclass_1, rclass_2;
14670 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14672 if (load)
14674 mem_1 = operands[1];
14675 mem_2 = operands[3];
14676 reg_1 = operands[0];
14677 reg_2 = operands[2];
14678 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14679 if (REGNO (reg_1) == REGNO (reg_2))
14680 return false;
14682 else
14684 mem_1 = operands[0];
14685 mem_2 = operands[2];
14686 reg_1 = operands[1];
14687 reg_2 = operands[3];
14690 /* The mems cannot be volatile. */
14691 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14692 return false;
14694 /* If we have SImode and slow unaligned ldp,
14695 check that the alignment is at least 8 bytes. */
14696 if (mode == SImode
14697 && (aarch64_tune_params.extra_tuning_flags
14698 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14699 && !optimize_size
14700 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14701 return false;
14703 /* Check if the addresses are in the form of [base+offset]. */
14704 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14705 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14706 return false;
14707 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14708 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14709 return false;
14711 /* Check if the bases are the same. */
14712 if (!rtx_equal_p (base_1, base_2))
14713 return false;
14715 offval_1 = INTVAL (offset_1);
14716 offval_2 = INTVAL (offset_2);
14717 msize = GET_MODE_SIZE (mode);
14718 /* Check if the offsets are consecutive. */
14719 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14720 return false;
14722 /* Check if the addresses are clobbered by load. */
14723 if (load)
14725 if (reg_mentioned_p (reg_1, mem_1))
14726 return false;
14728 /* In increasing order, the last load can clobber the address. */
14729 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14730 return false;
14733 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14734 rclass_1 = FP_REGS;
14735 else
14736 rclass_1 = GENERAL_REGS;
14738 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14739 rclass_2 = FP_REGS;
14740 else
14741 rclass_2 = GENERAL_REGS;
14743 /* Check if the registers are of same class. */
14744 if (rclass_1 != rclass_2)
14745 return false;
14747 return true;
14750 /* Given OPERANDS of consecutive load/store, check if we can merge
14751 them into ldp/stp by adjusting the offset. LOAD is true if they
14752 are load instructions. MODE is the mode of memory operands.
14754 Given the consecutive stores below:
14756 str w1, [xb, 0x100]
14757 str w1, [xb, 0x104]
14758 str w1, [xb, 0x108]
14759 str w1, [xb, 0x10c]
14761 Though the offsets are out of the range supported by stp, we can
14762 still pair them after adjusting the offset, like:
14764 add scratch, xb, 0x100
14765 stp w1, w1, [scratch]
14766 stp w1, w1, [scratch, 0x8]
14768 The peephole patterns detecting this opportunity should guarantee
14769 the scratch register is available. */
14771 bool
14772 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14773 machine_mode mode)
14775 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14776 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14777 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14778 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14780 if (load)
14782 reg_1 = operands[0];
14783 mem_1 = operands[1];
14784 reg_2 = operands[2];
14785 mem_2 = operands[3];
14786 reg_3 = operands[4];
14787 mem_3 = operands[5];
14788 reg_4 = operands[6];
14789 mem_4 = operands[7];
14790 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14791 && REG_P (reg_3) && REG_P (reg_4));
14792 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14793 return false;
14795 else
14797 mem_1 = operands[0];
14798 reg_1 = operands[1];
14799 mem_2 = operands[2];
14800 reg_2 = operands[3];
14801 mem_3 = operands[4];
14802 reg_3 = operands[5];
14803 mem_4 = operands[6];
14804 reg_4 = operands[7];
14806 /* Skip if the memory operand is by itself valid for ldp/stp. */
14807 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14808 return false;
14810 /* The mems cannot be volatile. */
14811 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14812 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4)
14813 return false;
14815 /* Check if the addresses are in the form of [base+offset]. */
14816 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14817 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14818 return false;
14819 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14820 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14821 return false;
14822 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14823 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14824 return false;
14825 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14826 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14827 return false;
14829 /* Check if the bases are the same. */
14830 if (!rtx_equal_p (base_1, base_2)
14831 || !rtx_equal_p (base_2, base_3)
14832 || !rtx_equal_p (base_3, base_4))
14833 return false;
14835 offval_1 = INTVAL (offset_1);
14836 offval_2 = INTVAL (offset_2);
14837 offval_3 = INTVAL (offset_3);
14838 offval_4 = INTVAL (offset_4);
14839 msize = GET_MODE_SIZE (mode);
14840 /* Check if the offsets are consecutive. */
14841 if ((offval_1 != (offval_2 + msize)
14842 || offval_1 != (offval_3 + msize * 2)
14843 || offval_1 != (offval_4 + msize * 3))
14844 && (offval_4 != (offval_3 + msize)
14845 || offval_4 != (offval_2 + msize * 2)
14846 || offval_4 != (offval_1 + msize * 3)))
14847 return false;
14849 /* Check if the addresses are clobbered by load. */
14850 if (load)
14852 if (reg_mentioned_p (reg_1, mem_1)
14853 || reg_mentioned_p (reg_2, mem_2)
14854 || reg_mentioned_p (reg_3, mem_3))
14855 return false;
14857 /* In increasing order, the last load can clobber the address. */
14858 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14859 return false;
14862 /* If we have SImode and slow unaligned ldp,
14863 check that the alignment is at least 8 bytes. */
14864 if (mode == SImode
14865 && (aarch64_tune_params.extra_tuning_flags
14866 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14867 && !optimize_size
14868 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14869 return false;
14871 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14872 rclass_1 = FP_REGS;
14873 else
14874 rclass_1 = GENERAL_REGS;
14876 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14877 rclass_2 = FP_REGS;
14878 else
14879 rclass_2 = GENERAL_REGS;
14881 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14882 rclass_3 = FP_REGS;
14883 else
14884 rclass_3 = GENERAL_REGS;
14886 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14887 rclass_4 = FP_REGS;
14888 else
14889 rclass_4 = GENERAL_REGS;
14891 /* Check if the registers are of same class. */
14892 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14893 return false;
14895 return true;
14898 /* Given OPERANDS of consecutive load/store, this function pairs them
14899 into ldp/stp after adjusting the offset. It depends on the fact
14900 that addresses of load/store instructions are in increasing order.
14901 MODE is the mode of memory operands. CODE is the rtl operator
14902 which should be applied to all memory operands; it is SIGN_EXTEND,
14903 ZERO_EXTEND or UNKNOWN. */
14905 bool
14906 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14907 machine_mode mode, RTX_CODE code)
14909 rtx base, offset, t1, t2;
14910 rtx mem_1, mem_2, mem_3, mem_4;
14911 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14913 if (load)
14915 mem_1 = operands[1];
14916 mem_2 = operands[3];
14917 mem_3 = operands[5];
14918 mem_4 = operands[7];
14920 else
14922 mem_1 = operands[0];
14923 mem_2 = operands[2];
14924 mem_3 = operands[4];
14925 mem_4 = operands[6];
14926 gcc_assert (code == UNKNOWN);
14929 extract_base_offset_in_addr (mem_1, &base, &offset);
14930 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14932 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14933 msize = GET_MODE_SIZE (mode);
14934 stp_off_limit = msize * 0x40;
14935 off_val = INTVAL (offset);
14936 abs_off = (off_val < 0) ? -off_val : off_val;
14937 new_off = abs_off % stp_off_limit;
14938 adj_off = abs_off - new_off;
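/* For the SImode example in the comment above (stores at xb+0x100..0x10c):
msize == 4, stp_off_limit == 0x100 and off_val == 0x100, so new_off == 0
and adj_off == 0x100; the scratch register is set to xb + 0x100 and the
two stp instructions use offsets 0 and 8.  */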
14940 /* Further adjust to make sure all offsets are OK. */
14941 if ((new_off + msize * 2) >= stp_off_limit)
14943 adj_off += stp_off_limit;
14944 new_off -= stp_off_limit;
14947 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14948 if (adj_off >= 0x1000)
14949 return false;
14951 if (off_val < 0)
14953 adj_off = -adj_off;
14954 new_off = -new_off;
14957 /* Create new memory references. */
14958 mem_1 = change_address (mem_1, VOIDmode,
14959 plus_constant (DImode, operands[8], new_off));
14961 /* Check if the adjusted address is OK for ldp/stp. */
14962 if (!aarch64_mem_pair_operand (mem_1, mode))
14963 return false;
14965 msize = GET_MODE_SIZE (mode);
14966 mem_2 = change_address (mem_2, VOIDmode,
14967 plus_constant (DImode,
14968 operands[8],
14969 new_off + msize));
14970 mem_3 = change_address (mem_3, VOIDmode,
14971 plus_constant (DImode,
14972 operands[8],
14973 new_off + msize * 2));
14974 mem_4 = change_address (mem_4, VOIDmode,
14975 plus_constant (DImode,
14976 operands[8],
14977 new_off + msize * 3));
14979 if (code == ZERO_EXTEND)
14981 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14982 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14983 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14984 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14986 else if (code == SIGN_EXTEND)
14988 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14989 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14990 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14991 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14994 if (load)
14996 operands[1] = mem_1;
14997 operands[3] = mem_2;
14998 operands[5] = mem_3;
14999 operands[7] = mem_4;
15001 else
15003 operands[0] = mem_1;
15004 operands[2] = mem_2;
15005 operands[4] = mem_3;
15006 operands[6] = mem_4;
15009 /* Emit adjusting instruction. */
15010 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15011 /* Emit ldp/stp instructions. */
15012 t1 = gen_rtx_SET (operands[0], operands[1]);
15013 t2 = gen_rtx_SET (operands[2], operands[3]);
15014 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15015 t1 = gen_rtx_SET (operands[4], operands[5]);
15016 t2 = gen_rtx_SET (operands[6], operands[7]);
15017 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15018 return true;
15021 /* Return true if a pseudo register should be created and used to hold
15022 the GOT address for PIC code. */
15024 bool
15025 aarch64_use_pseudo_pic_reg (void)
15027 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15030 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15032 static int
15033 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15035 switch (XINT (x, 1))
15037 case UNSPEC_GOTSMALLPIC:
15038 case UNSPEC_GOTSMALLPIC28K:
15039 case UNSPEC_GOTTINYPIC:
15040 return 0;
15041 default:
15042 break;
15045 return default_unspec_may_trap_p (x, flags);
15049 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
15050 return the log2 of that value. Otherwise return -1. */
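/* For example, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and any
negative, NaN or infinite value yield -1.  */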
15053 aarch64_fpconst_pow_of_2 (rtx x)
15055 const REAL_VALUE_TYPE *r;
15057 if (!CONST_DOUBLE_P (x))
15058 return -1;
15060 r = CONST_DOUBLE_REAL_VALUE (x);
15062 if (REAL_VALUE_NEGATIVE (*r)
15063 || REAL_VALUE_ISNAN (*r)
15064 || REAL_VALUE_ISINF (*r)
15065 || !real_isinteger (r, DFmode))
15066 return -1;
15068 return exact_log2 (real_to_integer (r));
15071 /* If X is a vector of equal CONST_DOUBLE values and that value is
15072 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15075 aarch64_vec_fpconst_pow_of_2 (rtx x)
15077 if (GET_CODE (x) != CONST_VECTOR)
15078 return -1;
15080 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15081 return -1;
15083 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15084 if (firstval <= 0)
15085 return -1;
15087 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15088 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15089 return -1;
15091 return firstval;
15094 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15095 to float.
15097 __fp16 always promotes through this hook.
15098 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15099 through the generic excess precision logic rather than here. */
15101 static tree
15102 aarch64_promoted_type (const_tree t)
15104 if (SCALAR_FLOAT_TYPE_P (t)
15105 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15106 return float_type_node;
15108 return NULL_TREE;
15111 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15113 static bool
15114 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15115 optimization_type opt_type)
15117 switch (op)
15119 case rsqrt_optab:
15120 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15122 default:
15123 return true;
15127 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
15128 if MODE is HFmode, and punt to the generic implementation otherwise. */
15130 static bool
15131 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
15133 return (mode == HFmode
15134 ? true
15135 : default_libgcc_floating_mode_supported_p (mode));
15138 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15139 if MODE is HFmode, and punt to the generic implementation otherwise. */
15141 static bool
15142 aarch64_scalar_mode_supported_p (machine_mode mode)
15144 return (mode == HFmode
15145 ? true
15146 : default_scalar_mode_supported_p (mode));
15149 /* Set the value of FLT_EVAL_METHOD.
15150 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15152 0: evaluate all operations and constants, whose semantic type has at
15153 most the range and precision of type float, to the range and
15154 precision of float; evaluate all other operations and constants to
15155 the range and precision of the semantic type;
15157 N, where _FloatN is a supported interchange floating type:
15158 evaluate all operations and constants, whose semantic type has at
15159 most the range and precision of _FloatN type, to the range and
15160 precision of the _FloatN type; evaluate all other operations and
15161 constants to the range and precision of the semantic type;
15163 If we have the ARMv8.2-A extensions then we support _Float16 in native
15164 precision, so we should set this to 16. Otherwise, we support the type,
15165 but want to evaluate expressions in float precision, so set this to
15166 0. */
15168 static enum flt_eval_method
15169 aarch64_excess_precision (enum excess_precision_type type)
15171 switch (type)
15173 case EXCESS_PRECISION_TYPE_FAST:
15174 case EXCESS_PRECISION_TYPE_STANDARD:
15175 /* We can calculate either in 16-bit range and precision or
15176 32-bit range and precision. Make that decision based on whether
15177 we have native support for the ARMv8.2-A 16-bit floating-point
15178 instructions or not. */
15179 return (TARGET_FP_F16INST
15180 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15181 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15182 case EXCESS_PRECISION_TYPE_IMPLICIT:
15183 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15184 default:
15185 gcc_unreachable ();
15187 return FLT_EVAL_METHOD_UNPREDICTABLE;
15190 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15191 scheduled for speculative execution. Reject the long-running division
15192 and square-root instructions. */
15194 static bool
15195 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15197 switch (get_attr_type (insn))
15199 case TYPE_SDIV:
15200 case TYPE_UDIV:
15201 case TYPE_FDIVS:
15202 case TYPE_FDIVD:
15203 case TYPE_FSQRTS:
15204 case TYPE_FSQRTD:
15205 case TYPE_NEON_FP_SQRT_S:
15206 case TYPE_NEON_FP_SQRT_D:
15207 case TYPE_NEON_FP_SQRT_S_Q:
15208 case TYPE_NEON_FP_SQRT_D_Q:
15209 case TYPE_NEON_FP_DIV_S:
15210 case TYPE_NEON_FP_DIV_D:
15211 case TYPE_NEON_FP_DIV_S_Q:
15212 case TYPE_NEON_FP_DIV_D_Q:
15213 return false;
15214 default:
15215 return true;
15219 /* Target-specific selftests. */
15221 #if CHECKING_P
15223 namespace selftest {
15225 /* Selftest for the RTL loader.
15226 Verify that the RTL loader copes with a dump from
15227 print_rtx_function. This is essentially just a test that class
15228 function_reader can handle a real dump, but it also verifies
15229 that lookup_reg_by_dump_name correctly handles hard regs.
15230 The presence of hard reg names in the dump means that the test is
15231 target-specific, hence it is in this file. */
15233 static void
15234 aarch64_test_loading_full_dump ()
15236 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15238 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15240 rtx_insn *insn_1 = get_insn_by_uid (1);
15241 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15243 rtx_insn *insn_15 = get_insn_by_uid (15);
15244 ASSERT_EQ (INSN, GET_CODE (insn_15));
15245 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15247 /* Verify crtl->return_rtx. */
15248 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15249 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15250 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15253 /* Run all target-specific selftests. */
15255 static void
15256 aarch64_run_selftests (void)
15258 aarch64_test_loading_full_dump ();
15261 } // namespace selftest
15263 #endif /* #if CHECKING_P */
15265 #undef TARGET_ADDRESS_COST
15266 #define TARGET_ADDRESS_COST aarch64_address_cost
15268 /* This hook determines whether unnamed bitfields affect the alignment
15269 of the containing structure. The hook returns true if the structure
15270 should inherit the alignment requirements of an unnamed bitfield's
15271 type. */
15272 #undef TARGET_ALIGN_ANON_BITFIELD
15273 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15275 #undef TARGET_ASM_ALIGNED_DI_OP
15276 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15278 #undef TARGET_ASM_ALIGNED_HI_OP
15279 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15281 #undef TARGET_ASM_ALIGNED_SI_OP
15282 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15284 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15285 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15286 hook_bool_const_tree_hwi_hwi_const_tree_true
15288 #undef TARGET_ASM_FILE_START
15289 #define TARGET_ASM_FILE_START aarch64_start_file
15291 #undef TARGET_ASM_OUTPUT_MI_THUNK
15292 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15294 #undef TARGET_ASM_SELECT_RTX_SECTION
15295 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15297 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15298 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15300 #undef TARGET_BUILD_BUILTIN_VA_LIST
15301 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15303 #undef TARGET_CALLEE_COPIES
15304 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15306 #undef TARGET_CAN_ELIMINATE
15307 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15309 #undef TARGET_CAN_INLINE_P
15310 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15312 #undef TARGET_CANNOT_FORCE_CONST_MEM
15313 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15315 #undef TARGET_CASE_VALUES_THRESHOLD
15316 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15318 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15319 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15321 /* Only the least significant bit is used for initialization guard
15322 variables. */
15323 #undef TARGET_CXX_GUARD_MASK_BIT
15324 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15326 #undef TARGET_C_MODE_FOR_SUFFIX
15327 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15329 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15330 #undef TARGET_DEFAULT_TARGET_FLAGS
15331 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15332 #endif
15334 #undef TARGET_CLASS_MAX_NREGS
15335 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15337 #undef TARGET_BUILTIN_DECL
15338 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15340 #undef TARGET_BUILTIN_RECIPROCAL
15341 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15343 #undef TARGET_C_EXCESS_PRECISION
15344 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15346 #undef TARGET_EXPAND_BUILTIN
15347 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15349 #undef TARGET_EXPAND_BUILTIN_VA_START
15350 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15352 #undef TARGET_FOLD_BUILTIN
15353 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15355 #undef TARGET_FUNCTION_ARG
15356 #define TARGET_FUNCTION_ARG aarch64_function_arg
15358 #undef TARGET_FUNCTION_ARG_ADVANCE
15359 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15361 #undef TARGET_FUNCTION_ARG_BOUNDARY
15362 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15364 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15365 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15367 #undef TARGET_FUNCTION_VALUE
15368 #define TARGET_FUNCTION_VALUE aarch64_function_value
15370 #undef TARGET_FUNCTION_VALUE_REGNO_P
15371 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15373 #undef TARGET_FRAME_POINTER_REQUIRED
15374 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15376 #undef TARGET_GIMPLE_FOLD_BUILTIN
15377 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15379 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15380 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15382 #undef TARGET_INIT_BUILTINS
15383 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15385 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15386 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15387 aarch64_ira_change_pseudo_allocno_class
15389 #undef TARGET_LEGITIMATE_ADDRESS_P
15390 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15392 #undef TARGET_LEGITIMATE_CONSTANT_P
15393 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15395 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15396 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15397 aarch64_legitimize_address_displacement
15399 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15400 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15402 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15403 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15404 aarch64_libgcc_floating_mode_supported_p
15406 #undef TARGET_MANGLE_TYPE
15407 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15409 #undef TARGET_MEMORY_MOVE_COST
15410 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15412 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15413 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15415 #undef TARGET_MUST_PASS_IN_STACK
15416 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15418 /* This target hook should return true if accesses to volatile bitfields
15419 should use the narrowest mode possible. It should return false if these
15420 accesses should use the bitfield container type. */
15421 #undef TARGET_NARROW_VOLATILE_BITFIELD
15422 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15424 #undef TARGET_OPTION_OVERRIDE
15425 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15427 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15428 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15429 aarch64_override_options_after_change
15431 #undef TARGET_OPTION_SAVE
15432 #define TARGET_OPTION_SAVE aarch64_option_save
15434 #undef TARGET_OPTION_RESTORE
15435 #define TARGET_OPTION_RESTORE aarch64_option_restore
15437 #undef TARGET_OPTION_PRINT
15438 #define TARGET_OPTION_PRINT aarch64_option_print
15440 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15441 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15443 #undef TARGET_SET_CURRENT_FUNCTION
15444 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15446 #undef TARGET_PASS_BY_REFERENCE
15447 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15449 #undef TARGET_PREFERRED_RELOAD_CLASS
15450 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15452 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15453 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15455 #undef TARGET_PROMOTED_TYPE
15456 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15458 #undef TARGET_SECONDARY_RELOAD
15459 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15461 #undef TARGET_SHIFT_TRUNCATION_MASK
15462 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15464 #undef TARGET_SETUP_INCOMING_VARARGS
15465 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15467 #undef TARGET_STRUCT_VALUE_RTX
15468 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15470 #undef TARGET_REGISTER_MOVE_COST
15471 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15473 #undef TARGET_RETURN_IN_MEMORY
15474 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15476 #undef TARGET_RETURN_IN_MSB
15477 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15479 #undef TARGET_RTX_COSTS
15480 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15482 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15483 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15485 #undef TARGET_SCHED_ISSUE_RATE
15486 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15488 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15489 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15490 aarch64_sched_first_cycle_multipass_dfa_lookahead
15492 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15493 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15494 aarch64_first_cycle_multipass_dfa_lookahead_guard
15496 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15497 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15498 aarch64_get_separate_components
15500 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15501 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15502 aarch64_components_for_bb
15504 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15505 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15506 aarch64_disqualify_components
15508 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15509 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15510 aarch64_emit_prologue_components
15512 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15513 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15514 aarch64_emit_epilogue_components
15516 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15517 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15518 aarch64_set_handled_components
15520 #undef TARGET_TRAMPOLINE_INIT
15521 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15523 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15524 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15526 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15527 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15529 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15530 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15531 aarch64_builtin_support_vector_misalignment
15533 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15534 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15536 #undef TARGET_VECTORIZE_ADD_STMT_COST
15537 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15539 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15540 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15541 aarch64_builtin_vectorization_cost
15543 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15544 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15546 #undef TARGET_VECTORIZE_BUILTINS
15547 #define TARGET_VECTORIZE_BUILTINS
15549 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15550 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15551 aarch64_builtin_vectorized_function
15553 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15554 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15555 aarch64_autovectorize_vector_sizes
15557 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15558 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15559 aarch64_atomic_assign_expand_fenv
15561 /* Section anchor support. */
15563 #undef TARGET_MIN_ANCHOR_OFFSET
15564 #define TARGET_MIN_ANCHOR_OFFSET -256
15566 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15567 byte offset; we can do much more for larger data types, but have no way
15568 to determine the size of the access. We assume accesses are aligned. */
15569 #undef TARGET_MAX_ANCHOR_OFFSET
15570 #define TARGET_MAX_ANCHOR_OFFSET 4095
15572 #undef TARGET_VECTOR_ALIGNMENT
15573 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15575 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15576 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15577 aarch64_simd_vector_alignment_reachable
15579 /* vec_perm support. */
15581 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15582 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15583 aarch64_vectorize_vec_perm_const_ok
15585 #undef TARGET_INIT_LIBFUNCS
15586 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15588 #undef TARGET_FIXED_CONDITION_CODE_REGS
15589 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15591 #undef TARGET_FLAGS_REGNUM
15592 #define TARGET_FLAGS_REGNUM CC_REGNUM
15594 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15595 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15597 #undef TARGET_ASAN_SHADOW_OFFSET
15598 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15600 #undef TARGET_LEGITIMIZE_ADDRESS
15601 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15603 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15604 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15605 aarch64_use_by_pieces_infrastructure_p
15607 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15608 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15610 #undef TARGET_CAN_USE_DOLOOP_P
15611 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15613 #undef TARGET_SCHED_ADJUST_PRIORITY
15614 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15616 #undef TARGET_SCHED_MACRO_FUSION_P
15617 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15619 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15620 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15622 #undef TARGET_SCHED_FUSION_PRIORITY
15623 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15625 #undef TARGET_UNSPEC_MAY_TRAP_P
15626 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15628 #undef TARGET_USE_PSEUDO_PIC_REG
15629 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15631 #undef TARGET_PRINT_OPERAND
15632 #define TARGET_PRINT_OPERAND aarch64_print_operand
15634 #undef TARGET_PRINT_OPERAND_ADDRESS
15635 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15637 #undef TARGET_OPTAB_SUPPORTED_P
15638 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15640 #undef TARGET_OMIT_STRUCT_RETURN_REG
15641 #define TARGET_OMIT_STRUCT_RETURN_REG true
15643 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15644 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15645 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15647 #if CHECKING_P
15648 #undef TARGET_RUN_TARGET_SELFTESTS
15649 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15650 #endif /* #if CHECKING_P */
15652 struct gcc_target targetm = TARGET_INITIALIZER;
15654 #include "gt-aarch64.h"