2017-08-01 Tamar Christina <tamar.christina@arm.com>
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob5a2ad7e9156a6f0389c09470cf1414bff45d8099
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76 /* Classifies an address.
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96 ADDRESS_SYMBOLIC:
97 A constant symbolic address, in pc-relative literal pool. */
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
117 struct simd_immediate_info
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
150 static machine_mode
151 aarch64_simd_container_mode (machine_mode mode, unsigned width);
153 /* Major revision number of the ARM Architecture implemented by the target. */
154 unsigned aarch64_architecture_version;
156 /* The processor for which instructions should be scheduled. */
157 enum aarch64_processor aarch64_tune = cortexa53;
159 /* Mask to specify which instruction scheduling options should be used. */
160 unsigned long aarch64_tune_flags = 0;
162 /* Global flag for PC relative loads. */
163 bool aarch64_pcrelative_literal_loads;
165 /* Support for command line parsing of boolean flags in the tuning
166 structures. */
167 struct aarch64_flag_desc
169 const char* name;
170 unsigned int flag;
173 #define AARCH64_FUSION_PAIR(name, internal_name) \
174 { name, AARCH64_FUSE_##internal_name },
175 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
177 { "none", AARCH64_FUSE_NOTHING },
178 #include "aarch64-fusion-pairs.def"
179 { "all", AARCH64_FUSE_ALL },
180 { NULL, AARCH64_FUSE_NOTHING }
183 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
184 { name, AARCH64_EXTRA_TUNE_##internal_name },
185 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
187 { "none", AARCH64_EXTRA_TUNE_NONE },
188 #include "aarch64-tuning-flags.def"
189 { "all", AARCH64_EXTRA_TUNE_ALL },
190 { NULL, AARCH64_EXTRA_TUNE_NONE }
193 /* Tuning parameters. */
195 static const struct cpu_addrcost_table generic_addrcost_table =
198 1, /* hi */
199 0, /* si */
200 0, /* di */
201 1, /* ti */
203 0, /* pre_modify */
204 0, /* post_modify */
205 0, /* register_offset */
206 0, /* register_sextend */
207 0, /* register_zextend */
208 0 /* imm_offset */
211 static const struct cpu_addrcost_table exynosm1_addrcost_table =
214 0, /* hi */
215 0, /* si */
216 0, /* di */
217 2, /* ti */
219 0, /* pre_modify */
220 0, /* post_modify */
221 1, /* register_offset */
222 1, /* register_sextend */
223 2, /* register_zextend */
224 0, /* imm_offset */
227 static const struct cpu_addrcost_table xgene1_addrcost_table =
230 1, /* hi */
231 0, /* si */
232 0, /* di */
233 1, /* ti */
235 1, /* pre_modify */
236 0, /* post_modify */
237 0, /* register_offset */
238 1, /* register_sextend */
239 1, /* register_zextend */
240 0, /* imm_offset */
243 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
246 1, /* hi */
247 1, /* si */
248 1, /* di */
249 2, /* ti */
251 0, /* pre_modify */
252 0, /* post_modify */
253 2, /* register_offset */
254 3, /* register_sextend */
255 3, /* register_zextend */
256 0, /* imm_offset */
259 static const struct cpu_regmove_cost generic_regmove_cost =
261 1, /* GP2GP */
262 /* Avoid the use of slow int<->fp moves for spilling by setting
263 their cost higher than memmov_cost. */
264 5, /* GP2FP */
265 5, /* FP2GP */
266 2 /* FP2FP */
269 static const struct cpu_regmove_cost cortexa57_regmove_cost =
271 1, /* GP2GP */
272 /* Avoid the use of slow int<->fp moves for spilling by setting
273 their cost higher than memmov_cost. */
274 5, /* GP2FP */
275 5, /* FP2GP */
276 2 /* FP2FP */
279 static const struct cpu_regmove_cost cortexa53_regmove_cost =
281 1, /* GP2GP */
282 /* Avoid the use of slow int<->fp moves for spilling by setting
283 their cost higher than memmov_cost. */
284 5, /* GP2FP */
285 5, /* FP2GP */
286 2 /* FP2FP */
289 static const struct cpu_regmove_cost exynosm1_regmove_cost =
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost (actual, 4 and 9). */
294 9, /* GP2FP */
295 9, /* FP2GP */
296 1 /* FP2FP */
299 static const struct cpu_regmove_cost thunderx_regmove_cost =
301 2, /* GP2GP */
302 2, /* GP2FP */
303 6, /* FP2GP */
304 4 /* FP2FP */
307 static const struct cpu_regmove_cost xgene1_regmove_cost =
309 1, /* GP2GP */
310 /* Avoid the use of slow int<->fp moves for spilling by setting
311 their cost higher than memmov_cost. */
312 8, /* GP2FP */
313 8, /* FP2GP */
314 2 /* FP2FP */
317 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
319 2, /* GP2GP */
320 /* Avoid the use of int<->fp moves for spilling. */
321 6, /* GP2FP */
322 6, /* FP2GP */
323 4 /* FP2FP */
326 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
328 1, /* GP2GP */
329 /* Avoid the use of int<->fp moves for spilling. */
330 8, /* GP2FP */
331 8, /* FP2GP */
332 4 /* FP2FP */
335 /* Generic costs for vector insn classes. */
336 static const struct cpu_vector_cost generic_vector_cost =
338 1, /* scalar_int_stmt_cost */
339 1, /* scalar_fp_stmt_cost */
340 1, /* scalar_load_cost */
341 1, /* scalar_store_cost */
342 1, /* vec_int_stmt_cost */
343 1, /* vec_fp_stmt_cost */
344 2, /* vec_permute_cost */
345 1, /* vec_to_scalar_cost */
346 1, /* scalar_to_vec_cost */
347 1, /* vec_align_load_cost */
348 1, /* vec_unalign_load_cost */
349 1, /* vec_unalign_store_cost */
350 1, /* vec_store_cost */
351 3, /* cond_taken_branch_cost */
352 1 /* cond_not_taken_branch_cost */
355 /* ThunderX costs for vector insn classes. */
356 static const struct cpu_vector_cost thunderx_vector_cost =
358 1, /* scalar_int_stmt_cost */
359 1, /* scalar_fp_stmt_cost */
360 3, /* scalar_load_cost */
361 1, /* scalar_store_cost */
362 4, /* vec_int_stmt_cost */
363 1, /* vec_fp_stmt_cost */
364 4, /* vec_permute_cost */
365 2, /* vec_to_scalar_cost */
366 2, /* scalar_to_vec_cost */
367 3, /* vec_align_load_cost */
368 5, /* vec_unalign_load_cost */
369 5, /* vec_unalign_store_cost */
370 1, /* vec_store_cost */
371 3, /* cond_taken_branch_cost */
372 3 /* cond_not_taken_branch_cost */
375 /* Generic costs for vector insn classes. */
376 static const struct cpu_vector_cost cortexa57_vector_cost =
378 1, /* scalar_int_stmt_cost */
379 1, /* scalar_fp_stmt_cost */
380 4, /* scalar_load_cost */
381 1, /* scalar_store_cost */
382 2, /* vec_int_stmt_cost */
383 2, /* vec_fp_stmt_cost */
384 3, /* vec_permute_cost */
385 8, /* vec_to_scalar_cost */
386 8, /* scalar_to_vec_cost */
387 4, /* vec_align_load_cost */
388 4, /* vec_unalign_load_cost */
389 1, /* vec_unalign_store_cost */
390 1, /* vec_store_cost */
391 1, /* cond_taken_branch_cost */
392 1 /* cond_not_taken_branch_cost */
395 static const struct cpu_vector_cost exynosm1_vector_cost =
397 1, /* scalar_int_stmt_cost */
398 1, /* scalar_fp_stmt_cost */
399 5, /* scalar_load_cost */
400 1, /* scalar_store_cost */
401 3, /* vec_int_stmt_cost */
402 3, /* vec_fp_stmt_cost */
403 3, /* vec_permute_cost */
404 3, /* vec_to_scalar_cost */
405 3, /* scalar_to_vec_cost */
406 5, /* vec_align_load_cost */
407 5, /* vec_unalign_load_cost */
408 1, /* vec_unalign_store_cost */
409 1, /* vec_store_cost */
410 1, /* cond_taken_branch_cost */
411 1 /* cond_not_taken_branch_cost */
414 /* Generic costs for vector insn classes. */
415 static const struct cpu_vector_cost xgene1_vector_cost =
417 1, /* scalar_int_stmt_cost */
418 1, /* scalar_fp_stmt_cost */
419 5, /* scalar_load_cost */
420 1, /* scalar_store_cost */
421 2, /* vec_int_stmt_cost */
422 2, /* vec_fp_stmt_cost */
423 2, /* vec_permute_cost */
424 4, /* vec_to_scalar_cost */
425 4, /* scalar_to_vec_cost */
426 10, /* vec_align_load_cost */
427 10, /* vec_unalign_load_cost */
428 2, /* vec_unalign_store_cost */
429 2, /* vec_store_cost */
430 2, /* cond_taken_branch_cost */
431 1 /* cond_not_taken_branch_cost */
434 /* Costs for vector insn classes for Vulcan. */
435 static const struct cpu_vector_cost thunderx2t99_vector_cost =
437 1, /* scalar_int_stmt_cost */
438 6, /* scalar_fp_stmt_cost */
439 4, /* scalar_load_cost */
440 1, /* scalar_store_cost */
441 5, /* vec_int_stmt_cost */
442 6, /* vec_fp_stmt_cost */
443 3, /* vec_permute_cost */
444 6, /* vec_to_scalar_cost */
445 5, /* scalar_to_vec_cost */
446 8, /* vec_align_load_cost */
447 8, /* vec_unalign_load_cost */
448 4, /* vec_unalign_store_cost */
449 4, /* vec_store_cost */
450 2, /* cond_taken_branch_cost */
451 1 /* cond_not_taken_branch_cost */
454 /* Generic costs for branch instructions. */
455 static const struct cpu_branch_cost generic_branch_cost =
457 1, /* Predictable. */
458 3 /* Unpredictable. */
461 /* Generic approximation modes. */
462 static const cpu_approx_modes generic_approx_modes =
464 AARCH64_APPROX_NONE, /* division */
465 AARCH64_APPROX_NONE, /* sqrt */
466 AARCH64_APPROX_NONE /* recip_sqrt */
469 /* Approximation modes for Exynos M1. */
470 static const cpu_approx_modes exynosm1_approx_modes =
472 AARCH64_APPROX_NONE, /* division */
473 AARCH64_APPROX_ALL, /* sqrt */
474 AARCH64_APPROX_ALL /* recip_sqrt */
477 /* Approximation modes for X-Gene 1. */
478 static const cpu_approx_modes xgene1_approx_modes =
480 AARCH64_APPROX_NONE, /* division */
481 AARCH64_APPROX_NONE, /* sqrt */
482 AARCH64_APPROX_ALL /* recip_sqrt */
485 /* Generic prefetch settings (which disable prefetch). */
486 static const cpu_prefetch_tune generic_prefetch_tune =
488 0, /* num_slots */
489 -1, /* l1_cache_size */
490 -1, /* l1_cache_line_size */
491 -1, /* l2_cache_size */
492 -1 /* default_opt_level */
495 static const cpu_prefetch_tune exynosm1_prefetch_tune =
497 0, /* num_slots */
498 -1, /* l1_cache_size */
499 64, /* l1_cache_line_size */
500 -1, /* l2_cache_size */
501 -1 /* default_opt_level */
504 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
506 4, /* num_slots */
507 32, /* l1_cache_size */
508 64, /* l1_cache_line_size */
509 1024, /* l2_cache_size */
510 3 /* default_opt_level */
513 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
515 8, /* num_slots */
516 32, /* l1_cache_size */
517 128, /* l1_cache_line_size */
518 16*1024, /* l2_cache_size */
519 3 /* default_opt_level */
522 static const cpu_prefetch_tune thunderx_prefetch_tune =
524 8, /* num_slots */
525 32, /* l1_cache_size */
526 128, /* l1_cache_line_size */
527 -1, /* l2_cache_size */
528 -1 /* default_opt_level */
531 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
533 8, /* num_slots */
534 32, /* l1_cache_size */
535 64, /* l1_cache_line_size */
536 256, /* l2_cache_size */
537 -1 /* default_opt_level */
540 static const struct tune_params generic_tunings =
542 &cortexa57_extra_costs,
543 &generic_addrcost_table,
544 &generic_regmove_cost,
545 &generic_vector_cost,
546 &generic_branch_cost,
547 &generic_approx_modes,
548 4, /* memmov_cost */
549 2, /* issue_rate */
550 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
551 8, /* function_align. */
552 4, /* jump_align. */
553 8, /* loop_align. */
554 2, /* int_reassoc_width. */
555 4, /* fp_reassoc_width. */
556 1, /* vec_reassoc_width. */
557 2, /* min_div_recip_mul_sf. */
558 2, /* min_div_recip_mul_df. */
559 0, /* max_case_values. */
560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
562 &generic_prefetch_tune
565 static const struct tune_params cortexa35_tunings =
567 &cortexa53_extra_costs,
568 &generic_addrcost_table,
569 &cortexa53_regmove_cost,
570 &generic_vector_cost,
571 &generic_branch_cost,
572 &generic_approx_modes,
573 4, /* memmov_cost */
574 1, /* issue_rate */
575 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
576 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
577 16, /* function_align. */
578 4, /* jump_align. */
579 8, /* loop_align. */
580 2, /* int_reassoc_width. */
581 4, /* fp_reassoc_width. */
582 1, /* vec_reassoc_width. */
583 2, /* min_div_recip_mul_sf. */
584 2, /* min_div_recip_mul_df. */
585 0, /* max_case_values. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
588 &generic_prefetch_tune
591 static const struct tune_params cortexa53_tunings =
593 &cortexa53_extra_costs,
594 &generic_addrcost_table,
595 &cortexa53_regmove_cost,
596 &generic_vector_cost,
597 &generic_branch_cost,
598 &generic_approx_modes,
599 4, /* memmov_cost */
600 2, /* issue_rate */
601 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
602 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
603 16, /* function_align. */
604 4, /* jump_align. */
605 8, /* loop_align. */
606 2, /* int_reassoc_width. */
607 4, /* fp_reassoc_width. */
608 1, /* vec_reassoc_width. */
609 2, /* min_div_recip_mul_sf. */
610 2, /* min_div_recip_mul_df. */
611 0, /* max_case_values. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
614 &generic_prefetch_tune
617 static const struct tune_params cortexa57_tunings =
619 &cortexa57_extra_costs,
620 &generic_addrcost_table,
621 &cortexa57_regmove_cost,
622 &cortexa57_vector_cost,
623 &generic_branch_cost,
624 &generic_approx_modes,
625 4, /* memmov_cost */
626 3, /* issue_rate */
627 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
628 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
629 16, /* function_align. */
630 4, /* jump_align. */
631 8, /* loop_align. */
632 2, /* int_reassoc_width. */
633 4, /* fp_reassoc_width. */
634 1, /* vec_reassoc_width. */
635 2, /* min_div_recip_mul_sf. */
636 2, /* min_div_recip_mul_df. */
637 0, /* max_case_values. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
640 &generic_prefetch_tune
643 static const struct tune_params cortexa72_tunings =
645 &cortexa57_extra_costs,
646 &generic_addrcost_table,
647 &cortexa57_regmove_cost,
648 &cortexa57_vector_cost,
649 &generic_branch_cost,
650 &generic_approx_modes,
651 4, /* memmov_cost */
652 3, /* issue_rate */
653 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
654 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
655 16, /* function_align. */
656 4, /* jump_align. */
657 8, /* loop_align. */
658 2, /* int_reassoc_width. */
659 4, /* fp_reassoc_width. */
660 1, /* vec_reassoc_width. */
661 2, /* min_div_recip_mul_sf. */
662 2, /* min_div_recip_mul_df. */
663 0, /* max_case_values. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
666 &generic_prefetch_tune
669 static const struct tune_params cortexa73_tunings =
671 &cortexa57_extra_costs,
672 &generic_addrcost_table,
673 &cortexa57_regmove_cost,
674 &cortexa57_vector_cost,
675 &generic_branch_cost,
676 &generic_approx_modes,
677 4, /* memmov_cost. */
678 2, /* issue_rate. */
679 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
680 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
681 16, /* function_align. */
682 4, /* jump_align. */
683 8, /* loop_align. */
684 2, /* int_reassoc_width. */
685 4, /* fp_reassoc_width. */
686 1, /* vec_reassoc_width. */
687 2, /* min_div_recip_mul_sf. */
688 2, /* min_div_recip_mul_df. */
689 0, /* max_case_values. */
690 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
691 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
692 &generic_prefetch_tune
697 static const struct tune_params exynosm1_tunings =
699 &exynosm1_extra_costs,
700 &exynosm1_addrcost_table,
701 &exynosm1_regmove_cost,
702 &exynosm1_vector_cost,
703 &generic_branch_cost,
704 &exynosm1_approx_modes,
705 4, /* memmov_cost */
706 3, /* issue_rate */
707 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
708 4, /* function_align. */
709 4, /* jump_align. */
710 4, /* loop_align. */
711 2, /* int_reassoc_width. */
712 4, /* fp_reassoc_width. */
713 1, /* vec_reassoc_width. */
714 2, /* min_div_recip_mul_sf. */
715 2, /* min_div_recip_mul_df. */
716 48, /* max_case_values. */
717 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
718 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
719 &exynosm1_prefetch_tune
722 static const struct tune_params thunderxt88_tunings =
724 &thunderx_extra_costs,
725 &generic_addrcost_table,
726 &thunderx_regmove_cost,
727 &thunderx_vector_cost,
728 &generic_branch_cost,
729 &generic_approx_modes,
730 6, /* memmov_cost */
731 2, /* issue_rate */
732 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
733 8, /* function_align. */
734 8, /* jump_align. */
735 8, /* loop_align. */
736 2, /* int_reassoc_width. */
737 4, /* fp_reassoc_width. */
738 1, /* vec_reassoc_width. */
739 2, /* min_div_recip_mul_sf. */
740 2, /* min_div_recip_mul_df. */
741 0, /* max_case_values. */
742 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
743 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
744 &thunderxt88_prefetch_tune
747 static const struct tune_params thunderx_tunings =
749 &thunderx_extra_costs,
750 &generic_addrcost_table,
751 &thunderx_regmove_cost,
752 &thunderx_vector_cost,
753 &generic_branch_cost,
754 &generic_approx_modes,
755 6, /* memmov_cost */
756 2, /* issue_rate */
757 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
758 8, /* function_align. */
759 8, /* jump_align. */
760 8, /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
769 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
770 &thunderx_prefetch_tune
773 static const struct tune_params xgene1_tunings =
775 &xgene1_extra_costs,
776 &xgene1_addrcost_table,
777 &xgene1_regmove_cost,
778 &xgene1_vector_cost,
779 &generic_branch_cost,
780 &xgene1_approx_modes,
781 6, /* memmov_cost */
782 4, /* issue_rate */
783 AARCH64_FUSE_NOTHING, /* fusible_ops */
784 16, /* function_align. */
785 8, /* jump_align. */
786 16, /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
798 static const struct tune_params qdf24xx_tunings =
800 &qdf24xx_extra_costs,
801 &generic_addrcost_table,
802 &qdf24xx_regmove_cost,
803 &generic_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 4, /* memmov_cost */
807 4, /* issue_rate */
808 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
810 16, /* function_align. */
811 8, /* jump_align. */
812 16, /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
821 &qdf24xx_prefetch_tune
824 static const struct tune_params thunderx2t99_tunings =
826 &thunderx2t99_extra_costs,
827 &thunderx2t99_addrcost_table,
828 &thunderx2t99_regmove_cost,
829 &thunderx2t99_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 4, /* memmov_cost. */
833 4, /* issue_rate. */
834 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
835 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
836 16, /* function_align. */
837 8, /* jump_align. */
838 16, /* loop_align. */
839 3, /* int_reassoc_width. */
840 2, /* fp_reassoc_width. */
841 2, /* vec_reassoc_width. */
842 2, /* min_div_recip_mul_sf. */
843 2, /* min_div_recip_mul_df. */
844 0, /* max_case_values. */
845 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
846 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
847 &thunderx2t99_prefetch_tune
850 /* Support for fine-grained override of the tuning structures. */
851 struct aarch64_tuning_override_function
853 const char* name;
854 void (*parse_override)(const char*, struct tune_params*);
857 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
858 static void aarch64_parse_tune_string (const char*, struct tune_params*);
860 static const struct aarch64_tuning_override_function
861 aarch64_tuning_override_functions[] =
863 { "fuse", aarch64_parse_fuse_string },
864 { "tune", aarch64_parse_tune_string },
865 { NULL, NULL }
868 /* A processor implementing AArch64. */
869 struct processor
871 const char *const name;
872 enum aarch64_processor ident;
873 enum aarch64_processor sched_core;
874 enum aarch64_arch arch;
875 unsigned architecture_version;
876 const unsigned long flags;
877 const struct tune_params *const tune;
880 /* Architectures implementing AArch64. */
881 static const struct processor all_architectures[] =
883 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
884 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
885 #include "aarch64-arches.def"
886 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
889 /* Processor cores implementing AArch64. */
890 static const struct processor all_cores[] =
892 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
893 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
894 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
895 FLAGS, &COSTS##_tunings},
896 #include "aarch64-cores.def"
897 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
898 AARCH64_FL_FOR_ARCH8, &generic_tunings},
899 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
903 /* Target specification. These are populated by the -march, -mtune, -mcpu
904 handling code or by target attributes. */
905 static const struct processor *selected_arch;
906 static const struct processor *selected_cpu;
907 static const struct processor *selected_tune;
909 /* The current tuning set. */
910 struct tune_params aarch64_tune_params = generic_tunings;
912 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
914 /* An ISA extension in the co-processor and main instruction set space. */
915 struct aarch64_option_extension
917 const char *const name;
918 const unsigned long flags_on;
919 const unsigned long flags_off;
922 typedef enum aarch64_cond_code
924 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
925 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
926 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
928 aarch64_cc;
930 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
932 /* The condition codes of the processor, and the inverse function. */
933 static const char * const aarch64_condition_codes[] =
935 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
936 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
939 /* Generate code to enable conditional branches in functions over 1 MiB. */
940 const char *
941 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
942 const char * branch_format)
944 rtx_code_label * tmp_label = gen_label_rtx ();
945 char label_buf[256];
946 char buffer[128];
947 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
948 CODE_LABEL_NUMBER (tmp_label));
949 const char *label_ptr = targetm.strip_name_encoding (label_buf);
950 rtx dest_label = operands[pos_label];
951 operands[pos_label] = tmp_label;
953 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
954 output_asm_insn (buffer, operands);
956 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
957 operands[pos_label] = dest_label;
958 output_asm_insn (buffer, operands);
959 return "";
962 void
963 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
965 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
966 if (TARGET_GENERAL_REGS_ONLY)
967 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
968 else
969 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
972 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
973 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
974 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
975 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
976 cost (in this case the best class is the lowest cost one). Using ALL_REGS
977 irrespectively of its cost results in bad allocations with many redundant
978 int<->FP moves which are expensive on various cores.
979 To avoid this we don't allow ALL_REGS as the allocno class, but force a
980 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
981 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
982 Otherwise set the allocno class depending on the mode.
983 The result of this is that it is no longer inefficient to have a higher
984 memory move cost than the register move cost.
987 static reg_class_t
988 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
989 reg_class_t best_class)
991 machine_mode mode;
993 if (allocno_class != ALL_REGS)
994 return allocno_class;
996 if (best_class != ALL_REGS)
997 return best_class;
999 mode = PSEUDO_REGNO_MODE (regno);
1000 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1003 static unsigned int
1004 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 if (GET_MODE_UNIT_SIZE (mode) == 4)
1007 return aarch64_tune_params.min_div_recip_mul_sf;
1008 return aarch64_tune_params.min_div_recip_mul_df;
1011 static int
1012 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1013 machine_mode mode)
1015 if (VECTOR_MODE_P (mode))
1016 return aarch64_tune_params.vec_reassoc_width;
1017 if (INTEGRAL_MODE_P (mode))
1018 return aarch64_tune_params.int_reassoc_width;
1019 if (FLOAT_MODE_P (mode))
1020 return aarch64_tune_params.fp_reassoc_width;
1021 return 1;
1024 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1025 unsigned
1026 aarch64_dbx_register_number (unsigned regno)
1028 if (GP_REGNUM_P (regno))
1029 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1030 else if (regno == SP_REGNUM)
1031 return AARCH64_DWARF_SP;
1032 else if (FP_REGNUM_P (regno))
1033 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1036 equivalent DWARF register. */
1037 return DWARF_FRAME_REGISTERS;
1040 /* Return TRUE if MODE is any of the large INT modes. */
1041 static bool
1042 aarch64_vect_struct_mode_p (machine_mode mode)
1044 return mode == OImode || mode == CImode || mode == XImode;
1047 /* Return TRUE if MODE is any of the vector modes. */
1048 static bool
1049 aarch64_vector_mode_p (machine_mode mode)
1051 return aarch64_vector_mode_supported_p (mode)
1052 || aarch64_vect_struct_mode_p (mode);
1055 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1056 static bool
1057 aarch64_array_mode_supported_p (machine_mode mode,
1058 unsigned HOST_WIDE_INT nelems)
1060 if (TARGET_SIMD
1061 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1062 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1063 && (nelems >= 2 && nelems <= 4))
1064 return true;
1066 return false;
1069 /* Implement HARD_REGNO_NREGS. */
1072 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 switch (aarch64_regno_regclass (regno))
1076 case FP_REGS:
1077 case FP_LO_REGS:
1078 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1079 default:
1080 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082 gcc_unreachable ();
1085 /* Implement HARD_REGNO_MODE_OK. */
1088 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 if (GET_MODE_CLASS (mode) == MODE_CC)
1091 return regno == CC_REGNUM;
1093 if (regno == SP_REGNUM)
1094 /* The purpose of comparing with ptr_mode is to support the
1095 global register variable associated with the stack pointer
1096 register via the syntax of asm ("wsp") in ILP32. */
1097 return mode == Pmode || mode == ptr_mode;
1099 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1100 return mode == Pmode;
1102 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1103 return 1;
1105 if (FP_REGNUM_P (regno))
1107 if (aarch64_vect_struct_mode_p (mode))
1108 return
1109 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1110 else
1111 return 1;
1114 return 0;
1117 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1118 machine_mode
1119 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1120 machine_mode mode)
1122 /* Handle modes that fit within single registers. */
1123 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1125 if (GET_MODE_SIZE (mode) >= 4)
1126 return mode;
1127 else
1128 return SImode;
1130 /* Fall back to generic for multi-reg and very large modes. */
1131 else
1132 return choose_hard_reg_mode (regno, nregs, false);
1135 /* Return true if calls to DECL should be treated as
1136 long-calls (ie called via a register). */
1137 static bool
1138 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1140 return false;
1143 /* Return true if calls to symbol-ref SYM should be treated as
1144 long-calls (ie called via a register). */
1145 bool
1146 aarch64_is_long_call_p (rtx sym)
1148 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1151 /* Return true if calls to symbol-ref SYM should not go through
1152 plt stubs. */
1154 bool
1155 aarch64_is_noplt_call_p (rtx sym)
1157 const_tree decl = SYMBOL_REF_DECL (sym);
1159 if (flag_pic
1160 && decl
1161 && (!flag_plt
1162 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1163 && !targetm.binds_local_p (decl))
1164 return true;
1166 return false;
1169 /* Return true if the offsets to a zero/sign-extract operation
1170 represent an expression that matches an extend operation. The
1171 operands represent the paramters from
1173 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1174 bool
1175 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1176 rtx extract_imm)
1178 HOST_WIDE_INT mult_val, extract_val;
1180 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1181 return false;
1183 mult_val = INTVAL (mult_imm);
1184 extract_val = INTVAL (extract_imm);
1186 if (extract_val > 8
1187 && extract_val < GET_MODE_BITSIZE (mode)
1188 && exact_log2 (extract_val & ~7) > 0
1189 && (extract_val & 7) <= 4
1190 && mult_val == (1 << (extract_val & 7)))
1191 return true;
1193 return false;
1196 /* Emit an insn that's a simple single-set. Both the operands must be
1197 known to be valid. */
1198 inline static rtx_insn *
1199 emit_set_insn (rtx x, rtx y)
1201 return emit_insn (gen_rtx_SET (x, y));
1204 /* X and Y are two things to compare using CODE. Emit the compare insn and
1205 return the rtx for register 0 in the proper mode. */
1207 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1209 machine_mode mode = SELECT_CC_MODE (code, x, y);
1210 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1212 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1213 return cc_reg;
1216 /* Build the SYMBOL_REF for __tls_get_addr. */
1218 static GTY(()) rtx tls_get_addr_libfunc;
1221 aarch64_tls_get_addr (void)
1223 if (!tls_get_addr_libfunc)
1224 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1225 return tls_get_addr_libfunc;
1228 /* Return the TLS model to use for ADDR. */
1230 static enum tls_model
1231 tls_symbolic_operand_type (rtx addr)
1233 enum tls_model tls_kind = TLS_MODEL_NONE;
1234 rtx sym, addend;
1236 if (GET_CODE (addr) == CONST)
1238 split_const (addr, &sym, &addend);
1239 if (GET_CODE (sym) == SYMBOL_REF)
1240 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1242 else if (GET_CODE (addr) == SYMBOL_REF)
1243 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1245 return tls_kind;
1248 /* We'll allow lo_sum's in addresses in our legitimate addresses
1249 so that combine would take care of combining addresses where
1250 necessary, but for generation purposes, we'll generate the address
1251 as :
1252 RTL Absolute
1253 tmp = hi (symbol_ref); adrp x1, foo
1254 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1257 PIC TLS
1258 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1259 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1260 bl __tls_get_addr
1263 Load TLS symbol, depending on TLS mechanism and TLS access model.
1265 Global Dynamic - Traditional TLS:
1266 adrp tmp, :tlsgd:imm
1267 add dest, tmp, #:tlsgd_lo12:imm
1268 bl __tls_get_addr
1270 Global Dynamic - TLS Descriptors:
1271 adrp dest, :tlsdesc:imm
1272 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1273 add dest, dest, #:tlsdesc_lo12:imm
1274 blr tmp
1275 mrs tp, tpidr_el0
1276 add dest, dest, tp
1278 Initial Exec:
1279 mrs tp, tpidr_el0
1280 adrp tmp, :gottprel:imm
1281 ldr dest, [tmp, #:gottprel_lo12:imm]
1282 add dest, dest, tp
1284 Local Exec:
1285 mrs tp, tpidr_el0
1286 add t0, tp, #:tprel_hi12:imm, lsl #12
1287 add t0, t0, #:tprel_lo12_nc:imm
1290 static void
1291 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1292 enum aarch64_symbol_type type)
1294 switch (type)
1296 case SYMBOL_SMALL_ABSOLUTE:
1298 /* In ILP32, the mode of dest can be either SImode or DImode. */
1299 rtx tmp_reg = dest;
1300 machine_mode mode = GET_MODE (dest);
1302 gcc_assert (mode == Pmode || mode == ptr_mode);
1304 if (can_create_pseudo_p ())
1305 tmp_reg = gen_reg_rtx (mode);
1307 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1308 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1309 return;
1312 case SYMBOL_TINY_ABSOLUTE:
1313 emit_insn (gen_rtx_SET (dest, imm));
1314 return;
1316 case SYMBOL_SMALL_GOT_28K:
1318 machine_mode mode = GET_MODE (dest);
1319 rtx gp_rtx = pic_offset_table_rtx;
1320 rtx insn;
1321 rtx mem;
1323 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1324 here before rtl expand. Tree IVOPT will generate rtl pattern to
1325 decide rtx costs, in which case pic_offset_table_rtx is not
1326 initialized. For that case no need to generate the first adrp
1327 instruction as the final cost for global variable access is
1328 one instruction. */
1329 if (gp_rtx != NULL)
1331 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1332 using the page base as GOT base, the first page may be wasted,
1333 in the worst scenario, there is only 28K space for GOT).
1335 The generate instruction sequence for accessing global variable
1338 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1340 Only one instruction needed. But we must initialize
1341 pic_offset_table_rtx properly. We generate initialize insn for
1342 every global access, and allow CSE to remove all redundant.
1344 The final instruction sequences will look like the following
1345 for multiply global variables access.
1347 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1350 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1351 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1352 ... */
1354 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1355 crtl->uses_pic_offset_table = 1;
1356 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1358 if (mode != GET_MODE (gp_rtx))
1359 gp_rtx = gen_lowpart (mode, gp_rtx);
1363 if (mode == ptr_mode)
1365 if (mode == DImode)
1366 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1367 else
1368 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1370 mem = XVECEXP (SET_SRC (insn), 0, 0);
1372 else
1374 gcc_assert (mode == Pmode);
1376 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1377 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1380 /* The operand is expected to be MEM. Whenever the related insn
1381 pattern changed, above code which calculate mem should be
1382 updated. */
1383 gcc_assert (GET_CODE (mem) == MEM);
1384 MEM_READONLY_P (mem) = 1;
1385 MEM_NOTRAP_P (mem) = 1;
1386 emit_insn (insn);
1387 return;
1390 case SYMBOL_SMALL_GOT_4G:
1392 /* In ILP32, the mode of dest can be either SImode or DImode,
1393 while the got entry is always of SImode size. The mode of
1394 dest depends on how dest is used: if dest is assigned to a
1395 pointer (e.g. in the memory), it has SImode; it may have
1396 DImode if dest is dereferenced to access the memeory.
1397 This is why we have to handle three different ldr_got_small
1398 patterns here (two patterns for ILP32). */
1400 rtx insn;
1401 rtx mem;
1402 rtx tmp_reg = dest;
1403 machine_mode mode = GET_MODE (dest);
1405 if (can_create_pseudo_p ())
1406 tmp_reg = gen_reg_rtx (mode);
1408 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1409 if (mode == ptr_mode)
1411 if (mode == DImode)
1412 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1413 else
1414 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1416 mem = XVECEXP (SET_SRC (insn), 0, 0);
1418 else
1420 gcc_assert (mode == Pmode);
1422 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1423 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1426 gcc_assert (GET_CODE (mem) == MEM);
1427 MEM_READONLY_P (mem) = 1;
1428 MEM_NOTRAP_P (mem) = 1;
1429 emit_insn (insn);
1430 return;
1433 case SYMBOL_SMALL_TLSGD:
1435 rtx_insn *insns;
1436 machine_mode mode = GET_MODE (dest);
1437 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1439 start_sequence ();
1440 if (TARGET_ILP32)
1441 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1442 else
1443 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1444 insns = get_insns ();
1445 end_sequence ();
1447 RTL_CONST_CALL_P (insns) = 1;
1448 emit_libcall_block (insns, dest, result, imm);
1449 return;
1452 case SYMBOL_SMALL_TLSDESC:
1454 machine_mode mode = GET_MODE (dest);
1455 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1456 rtx tp;
1458 gcc_assert (mode == Pmode || mode == ptr_mode);
1460 /* In ILP32, the got entry is always of SImode size. Unlike
1461 small GOT, the dest is fixed at reg 0. */
1462 if (TARGET_ILP32)
1463 emit_insn (gen_tlsdesc_small_si (imm));
1464 else
1465 emit_insn (gen_tlsdesc_small_di (imm));
1466 tp = aarch64_load_tp (NULL);
1468 if (mode != Pmode)
1469 tp = gen_lowpart (mode, tp);
1471 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1472 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1473 return;
1476 case SYMBOL_SMALL_TLSIE:
1478 /* In ILP32, the mode of dest can be either SImode or DImode,
1479 while the got entry is always of SImode size. The mode of
1480 dest depends on how dest is used: if dest is assigned to a
1481 pointer (e.g. in the memory), it has SImode; it may have
1482 DImode if dest is dereferenced to access the memeory.
1483 This is why we have to handle three different tlsie_small
1484 patterns here (two patterns for ILP32). */
1485 machine_mode mode = GET_MODE (dest);
1486 rtx tmp_reg = gen_reg_rtx (mode);
1487 rtx tp = aarch64_load_tp (NULL);
1489 if (mode == ptr_mode)
1491 if (mode == DImode)
1492 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1493 else
1495 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1496 tp = gen_lowpart (mode, tp);
1499 else
1501 gcc_assert (mode == Pmode);
1502 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1505 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1506 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1507 return;
1510 case SYMBOL_TLSLE12:
1511 case SYMBOL_TLSLE24:
1512 case SYMBOL_TLSLE32:
1513 case SYMBOL_TLSLE48:
1515 machine_mode mode = GET_MODE (dest);
1516 rtx tp = aarch64_load_tp (NULL);
1518 if (mode != Pmode)
1519 tp = gen_lowpart (mode, tp);
1521 switch (type)
1523 case SYMBOL_TLSLE12:
1524 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1525 (dest, tp, imm));
1526 break;
1527 case SYMBOL_TLSLE24:
1528 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1529 (dest, tp, imm));
1530 break;
1531 case SYMBOL_TLSLE32:
1532 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1533 (dest, imm));
1534 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1535 (dest, dest, tp));
1536 break;
1537 case SYMBOL_TLSLE48:
1538 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1539 (dest, imm));
1540 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1541 (dest, dest, tp));
1542 break;
1543 default:
1544 gcc_unreachable ();
1547 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1548 return;
1551 case SYMBOL_TINY_GOT:
1552 emit_insn (gen_ldr_got_tiny (dest, imm));
1553 return;
1555 case SYMBOL_TINY_TLSIE:
1557 machine_mode mode = GET_MODE (dest);
1558 rtx tp = aarch64_load_tp (NULL);
1560 if (mode == ptr_mode)
1562 if (mode == DImode)
1563 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1564 else
1566 tp = gen_lowpart (mode, tp);
1567 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1570 else
1572 gcc_assert (mode == Pmode);
1573 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1576 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1577 return;
1580 default:
1581 gcc_unreachable ();
1585 /* Emit a move from SRC to DEST. Assume that the move expanders can
1586 handle all moves if !can_create_pseudo_p (). The distinction is
1587 important because, unlike emit_move_insn, the move expanders know
1588 how to force Pmode objects into the constant pool even when the
1589 constant pool address is not itself legitimate. */
1590 static rtx
1591 aarch64_emit_move (rtx dest, rtx src)
1593 return (can_create_pseudo_p ()
1594 ? emit_move_insn (dest, src)
1595 : emit_move_insn_1 (dest, src));
1598 /* Split a 128-bit move operation into two 64-bit move operations,
1599 taking care to handle partial overlap of register to register
1600 copies. Special cases are needed when moving between GP regs and
1601 FP regs. SRC can be a register, constant or memory; DST a register
1602 or memory. If either operand is memory it must not have any side
1603 effects. */
1604 void
1605 aarch64_split_128bit_move (rtx dst, rtx src)
1607 rtx dst_lo, dst_hi;
1608 rtx src_lo, src_hi;
1610 machine_mode mode = GET_MODE (dst);
1612 gcc_assert (mode == TImode || mode == TFmode);
1613 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1614 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1616 if (REG_P (dst) && REG_P (src))
1618 int src_regno = REGNO (src);
1619 int dst_regno = REGNO (dst);
1621 /* Handle FP <-> GP regs. */
1622 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1624 src_lo = gen_lowpart (word_mode, src);
1625 src_hi = gen_highpart (word_mode, src);
1627 if (mode == TImode)
1629 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1630 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1632 else
1634 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1635 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1637 return;
1639 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1641 dst_lo = gen_lowpart (word_mode, dst);
1642 dst_hi = gen_highpart (word_mode, dst);
1644 if (mode == TImode)
1646 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1647 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1649 else
1651 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1652 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1654 return;
1658 dst_lo = gen_lowpart (word_mode, dst);
1659 dst_hi = gen_highpart (word_mode, dst);
1660 src_lo = gen_lowpart (word_mode, src);
1661 src_hi = gen_highpart_mode (word_mode, mode, src);
1663 /* At most one pairing may overlap. */
1664 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1666 aarch64_emit_move (dst_hi, src_hi);
1667 aarch64_emit_move (dst_lo, src_lo);
1669 else
1671 aarch64_emit_move (dst_lo, src_lo);
1672 aarch64_emit_move (dst_hi, src_hi);
1676 bool
1677 aarch64_split_128bit_move_p (rtx dst, rtx src)
1679 return (! REG_P (src)
1680 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1683 /* Split a complex SIMD combine. */
1685 void
1686 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1688 machine_mode src_mode = GET_MODE (src1);
1689 machine_mode dst_mode = GET_MODE (dst);
1691 gcc_assert (VECTOR_MODE_P (dst_mode));
1692 gcc_assert (register_operand (dst, dst_mode)
1693 && register_operand (src1, src_mode)
1694 && register_operand (src2, src_mode));
1696 rtx (*gen) (rtx, rtx, rtx);
1698 switch (src_mode)
1700 case V8QImode:
1701 gen = gen_aarch64_simd_combinev8qi;
1702 break;
1703 case V4HImode:
1704 gen = gen_aarch64_simd_combinev4hi;
1705 break;
1706 case V2SImode:
1707 gen = gen_aarch64_simd_combinev2si;
1708 break;
1709 case V4HFmode:
1710 gen = gen_aarch64_simd_combinev4hf;
1711 break;
1712 case V2SFmode:
1713 gen = gen_aarch64_simd_combinev2sf;
1714 break;
1715 case DImode:
1716 gen = gen_aarch64_simd_combinedi;
1717 break;
1718 case DFmode:
1719 gen = gen_aarch64_simd_combinedf;
1720 break;
1721 default:
1722 gcc_unreachable ();
1725 emit_insn (gen (dst, src1, src2));
1726 return;
1729 /* Split a complex SIMD move. */
1731 void
1732 aarch64_split_simd_move (rtx dst, rtx src)
1734 machine_mode src_mode = GET_MODE (src);
1735 machine_mode dst_mode = GET_MODE (dst);
1737 gcc_assert (VECTOR_MODE_P (dst_mode));
1739 if (REG_P (dst) && REG_P (src))
1741 rtx (*gen) (rtx, rtx);
1743 gcc_assert (VECTOR_MODE_P (src_mode));
1745 switch (src_mode)
1747 case V16QImode:
1748 gen = gen_aarch64_split_simd_movv16qi;
1749 break;
1750 case V8HImode:
1751 gen = gen_aarch64_split_simd_movv8hi;
1752 break;
1753 case V4SImode:
1754 gen = gen_aarch64_split_simd_movv4si;
1755 break;
1756 case V2DImode:
1757 gen = gen_aarch64_split_simd_movv2di;
1758 break;
1759 case V8HFmode:
1760 gen = gen_aarch64_split_simd_movv8hf;
1761 break;
1762 case V4SFmode:
1763 gen = gen_aarch64_split_simd_movv4sf;
1764 break;
1765 case V2DFmode:
1766 gen = gen_aarch64_split_simd_movv2df;
1767 break;
1768 default:
1769 gcc_unreachable ();
1772 emit_insn (gen (dst, src));
1773 return;
1777 bool
1778 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1779 machine_mode ymode, rtx y)
1781 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1782 gcc_assert (r != NULL);
1783 return rtx_equal_p (x, r);
1787 static rtx
1788 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1790 if (can_create_pseudo_p ())
1791 return force_reg (mode, value);
1792 else
1794 x = aarch64_emit_move (x, value);
1795 return x;
1800 static rtx
1801 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1803 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1805 rtx high;
1806 /* Load the full offset into a register. This
1807 might be improvable in the future. */
1808 high = GEN_INT (offset);
1809 offset = 0;
1810 high = aarch64_force_temporary (mode, temp, high);
1811 reg = aarch64_force_temporary (mode, temp,
1812 gen_rtx_PLUS (mode, high, reg));
1814 return plus_constant (mode, reg, offset);
1817 static int
1818 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1819 machine_mode mode)
1821 int i;
1822 unsigned HOST_WIDE_INT val, val2, mask;
1823 int one_match, zero_match;
1824 int num_insns;
1826 val = INTVAL (imm);
1828 if (aarch64_move_imm (val, mode))
1830 if (generate)
1831 emit_insn (gen_rtx_SET (dest, imm));
1832 return 1;
1835 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1836 (with XXXX non-zero). In that case check to see if the move can be done in
1837 a smaller mode. */
1838 val2 = val & 0xffffffff;
1839 if (mode == DImode
1840 && aarch64_move_imm (val2, SImode)
1841 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1843 if (generate)
1844 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1846 /* Check if we have to emit a second instruction by checking to see
1847 if any of the upper 32 bits of the original DI mode value is set. */
1848 if (val == val2)
1849 return 1;
1851 i = (val >> 48) ? 48 : 32;
1853 if (generate)
1854 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1855 GEN_INT ((val >> i) & 0xffff)));
1857 return 2;
1860 if ((val >> 32) == 0 || mode == SImode)
1862 if (generate)
1864 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1865 if (mode == SImode)
1866 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1867 GEN_INT ((val >> 16) & 0xffff)));
1868 else
1869 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1870 GEN_INT ((val >> 16) & 0xffff)));
1872 return 2;
1875 /* Remaining cases are all for DImode. */
1877 mask = 0xffff;
1878 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1879 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1880 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1881 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1883 if (zero_match != 2 && one_match != 2)
1885 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1886 For a 64-bit bitmask try whether changing 16 bits to all ones or
1887 zeroes creates a valid bitmask. To check any repeated bitmask,
1888 try using 16 bits from the other 32-bit half of val. */
1890 for (i = 0; i < 64; i += 16, mask <<= 16)
1892 val2 = val & ~mask;
1893 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1894 break;
1895 val2 = val | mask;
1896 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1897 break;
1898 val2 = val2 & ~mask;
1899 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1900 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1901 break;
1903 if (i != 64)
1905 if (generate)
1907 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1908 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1909 GEN_INT ((val >> i) & 0xffff)));
1911 return 2;
1915 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1916 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1917 otherwise skip zero bits. */
1919 num_insns = 1;
1920 mask = 0xffff;
1921 val2 = one_match > zero_match ? ~val : val;
1922 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1924 if (generate)
1925 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1926 ? (val | ~(mask << i))
1927 : (val & (mask << i)))));
1928 for (i += 16; i < 64; i += 16)
1930 if ((val2 & (mask << i)) == 0)
1931 continue;
1932 if (generate)
1933 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1934 GEN_INT ((val >> i) & 0xffff)));
1935 num_insns ++;
1938 return num_insns;
1942 void
1943 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1945 machine_mode mode = GET_MODE (dest);
1947 gcc_assert (mode == SImode || mode == DImode);
1949 /* Check on what type of symbol it is. */
1950 if (GET_CODE (imm) == SYMBOL_REF
1951 || GET_CODE (imm) == LABEL_REF
1952 || GET_CODE (imm) == CONST)
1954 rtx mem, base, offset;
1955 enum aarch64_symbol_type sty;
1957 /* If we have (const (plus symbol offset)), separate out the offset
1958 before we start classifying the symbol. */
1959 split_const (imm, &base, &offset);
1961 sty = aarch64_classify_symbol (base, offset);
1962 switch (sty)
1964 case SYMBOL_FORCE_TO_MEM:
1965 if (offset != const0_rtx
1966 && targetm.cannot_force_const_mem (mode, imm))
1968 gcc_assert (can_create_pseudo_p ());
1969 base = aarch64_force_temporary (mode, dest, base);
1970 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1971 aarch64_emit_move (dest, base);
1972 return;
1975 mem = force_const_mem (ptr_mode, imm);
1976 gcc_assert (mem);
1978 /* If we aren't generating PC relative literals, then
1979 we need to expand the literal pool access carefully.
1980 This is something that needs to be done in a number
1981 of places, so could well live as a separate function. */
1982 if (!aarch64_pcrelative_literal_loads)
1984 gcc_assert (can_create_pseudo_p ());
1985 base = gen_reg_rtx (ptr_mode);
1986 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1987 if (ptr_mode != Pmode)
1988 base = convert_memory_address (Pmode, base);
1989 mem = gen_rtx_MEM (ptr_mode, base);
1992 if (mode != ptr_mode)
1993 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1995 emit_insn (gen_rtx_SET (dest, mem));
1997 return;
1999 case SYMBOL_SMALL_TLSGD:
2000 case SYMBOL_SMALL_TLSDESC:
2001 case SYMBOL_SMALL_TLSIE:
2002 case SYMBOL_SMALL_GOT_28K:
2003 case SYMBOL_SMALL_GOT_4G:
2004 case SYMBOL_TINY_GOT:
2005 case SYMBOL_TINY_TLSIE:
2006 if (offset != const0_rtx)
2008 gcc_assert(can_create_pseudo_p ());
2009 base = aarch64_force_temporary (mode, dest, base);
2010 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2011 aarch64_emit_move (dest, base);
2012 return;
2014 /* FALLTHRU */
2016 case SYMBOL_SMALL_ABSOLUTE:
2017 case SYMBOL_TINY_ABSOLUTE:
2018 case SYMBOL_TLSLE12:
2019 case SYMBOL_TLSLE24:
2020 case SYMBOL_TLSLE32:
2021 case SYMBOL_TLSLE48:
2022 aarch64_load_symref_appropriately (dest, imm, sty);
2023 return;
2025 default:
2026 gcc_unreachable ();
2030 if (!CONST_INT_P (imm))
2032 if (GET_CODE (imm) == HIGH)
2033 emit_insn (gen_rtx_SET (dest, imm));
2034 else
2036 rtx mem = force_const_mem (mode, imm);
2037 gcc_assert (mem);
2038 emit_insn (gen_rtx_SET (dest, mem));
2041 return;
2044 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2047 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2048 temporary value if necessary. FRAME_RELATED_P should be true if
2049 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2050 to the generated instructions. If SCRATCHREG is known to hold
2051 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2052 immediate again.
2054 Since this function may be used to adjust the stack pointer, we must
2055 ensure that it cannot cause transient stack deallocation (for example
2056 by first incrementing SP and then decrementing when adjusting by a
2057 large immediate). */
2059 static void
2060 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2061 HOST_WIDE_INT delta, bool frame_related_p,
2062 bool emit_move_imm)
2064 HOST_WIDE_INT mdelta = abs_hwi (delta);
2065 rtx this_rtx = gen_rtx_REG (mode, regnum);
2066 rtx_insn *insn;
2068 if (!mdelta)
2069 return;
2071 /* Single instruction adjustment. */
2072 if (aarch64_uimm12_shift (mdelta))
2074 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2075 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2076 return;
2079 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2080 Only do this if mdelta is not a 16-bit move as adjusting using a move
2081 is better. */
2082 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2084 HOST_WIDE_INT low_off = mdelta & 0xfff;
2086 low_off = delta < 0 ? -low_off : low_off;
2087 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2088 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2089 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2090 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2091 return;
2094 /* Emit a move immediate if required and an addition/subtraction. */
2095 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2096 if (emit_move_imm)
2097 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2098 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2099 : gen_add2_insn (this_rtx, scratch_rtx));
2100 if (frame_related_p)
2102 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2103 rtx adj = plus_constant (mode, this_rtx, delta);
2104 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2108 static inline void
2109 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2110 HOST_WIDE_INT delta)
2112 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2115 static inline void
2116 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2118 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2119 true, emit_move_imm);
2122 static inline void
2123 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2125 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2126 frame_related_p, true);
2129 static bool
2130 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2131 tree exp ATTRIBUTE_UNUSED)
2133 /* Currently, always true. */
2134 return true;
2137 /* Implement TARGET_PASS_BY_REFERENCE. */
2139 static bool
2140 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2141 machine_mode mode,
2142 const_tree type,
2143 bool named ATTRIBUTE_UNUSED)
2145 HOST_WIDE_INT size;
2146 machine_mode dummymode;
2147 int nregs;
2149 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2150 size = (mode == BLKmode && type)
2151 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2153 /* Aggregates are passed by reference based on their size. */
2154 if (type && AGGREGATE_TYPE_P (type))
2156 size = int_size_in_bytes (type);
2159 /* Variable sized arguments are always returned by reference. */
2160 if (size < 0)
2161 return true;
2163 /* Can this be a candidate to be passed in fp/simd register(s)? */
2164 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2165 &dummymode, &nregs,
2166 NULL))
2167 return false;
2169 /* Arguments which are variable sized or larger than 2 registers are
2170 passed by reference unless they are a homogenous floating point
2171 aggregate. */
2172 return size > 2 * UNITS_PER_WORD;
2175 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2176 static bool
2177 aarch64_return_in_msb (const_tree valtype)
2179 machine_mode dummy_mode;
2180 int dummy_int;
2182 /* Never happens in little-endian mode. */
2183 if (!BYTES_BIG_ENDIAN)
2184 return false;
2186 /* Only composite types smaller than or equal to 16 bytes can
2187 be potentially returned in registers. */
2188 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2189 || int_size_in_bytes (valtype) <= 0
2190 || int_size_in_bytes (valtype) > 16)
2191 return false;
2193 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2194 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2195 is always passed/returned in the least significant bits of fp/simd
2196 register(s). */
2197 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2198 &dummy_mode, &dummy_int, NULL))
2199 return false;
2201 return true;
2204 /* Implement TARGET_FUNCTION_VALUE.
2205 Define how to find the value returned by a function. */
2207 static rtx
2208 aarch64_function_value (const_tree type, const_tree func,
2209 bool outgoing ATTRIBUTE_UNUSED)
2211 machine_mode mode;
2212 int unsignedp;
2213 int count;
2214 machine_mode ag_mode;
2216 mode = TYPE_MODE (type);
2217 if (INTEGRAL_TYPE_P (type))
2218 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2220 if (aarch64_return_in_msb (type))
2222 HOST_WIDE_INT size = int_size_in_bytes (type);
2224 if (size % UNITS_PER_WORD != 0)
2226 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2227 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2231 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2232 &ag_mode, &count, NULL))
2234 if (!aarch64_composite_type_p (type, mode))
2236 gcc_assert (count == 1 && mode == ag_mode);
2237 return gen_rtx_REG (mode, V0_REGNUM);
2239 else
2241 int i;
2242 rtx par;
2244 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2245 for (i = 0; i < count; i++)
2247 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2248 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2249 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2250 XVECEXP (par, 0, i) = tmp;
2252 return par;
2255 else
2256 return gen_rtx_REG (mode, R0_REGNUM);
2259 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2260 Return true if REGNO is the number of a hard register in which the values
2261 of called function may come back. */
2263 static bool
2264 aarch64_function_value_regno_p (const unsigned int regno)
2266 /* Maximum of 16 bytes can be returned in the general registers. Examples
2267 of 16-byte return values are: 128-bit integers and 16-byte small
2268 structures (excluding homogeneous floating-point aggregates). */
2269 if (regno == R0_REGNUM || regno == R1_REGNUM)
2270 return true;
2272 /* Up to four fp/simd registers can return a function value, e.g. a
2273 homogeneous floating-point aggregate having four members. */
2274 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2275 return TARGET_FLOAT;
2277 return false;
2280 /* Implement TARGET_RETURN_IN_MEMORY.
2282 If the type T of the result of a function is such that
2283 void func (T arg)
2284 would require that arg be passed as a value in a register (or set of
2285 registers) according to the parameter passing rules, then the result
2286 is returned in the same registers as would be used for such an
2287 argument. */
2289 static bool
2290 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2292 HOST_WIDE_INT size;
2293 machine_mode ag_mode;
2294 int count;
2296 if (!AGGREGATE_TYPE_P (type)
2297 && TREE_CODE (type) != COMPLEX_TYPE
2298 && TREE_CODE (type) != VECTOR_TYPE)
2299 /* Simple scalar types always returned in registers. */
2300 return false;
2302 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2303 type,
2304 &ag_mode,
2305 &count,
2306 NULL))
2307 return false;
2309 /* Types larger than 2 registers returned in memory. */
2310 size = int_size_in_bytes (type);
2311 return (size < 0 || size > 2 * UNITS_PER_WORD);
2314 static bool
2315 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2316 const_tree type, int *nregs)
2318 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2319 return aarch64_vfp_is_call_or_return_candidate (mode,
2320 type,
2321 &pcum->aapcs_vfp_rmode,
2322 nregs,
2323 NULL);
2326 /* Given MODE and TYPE of a function argument, return the alignment in
2327 bits. The idea is to suppress any stronger alignment requested by
2328 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2329 This is a helper function for local use only. */
2331 static unsigned int
2332 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2334 if (!type)
2335 return GET_MODE_ALIGNMENT (mode);
2337 if (integer_zerop (TYPE_SIZE (type)))
2338 return 0;
2340 gcc_assert (TYPE_MODE (type) == mode);
2342 if (!AGGREGATE_TYPE_P (type))
2343 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2345 if (TREE_CODE (type) == ARRAY_TYPE)
2346 return TYPE_ALIGN (TREE_TYPE (type));
2348 unsigned int alignment = 0;
2349 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2350 if (TREE_CODE (field) == FIELD_DECL)
2351 alignment = std::max (alignment, DECL_ALIGN (field));
2353 return alignment;
2356 /* Layout a function argument according to the AAPCS64 rules. The rule
2357 numbers refer to the rule numbers in the AAPCS64. */
2359 static void
2360 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2361 const_tree type,
2362 bool named ATTRIBUTE_UNUSED)
2364 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2365 int ncrn, nvrn, nregs;
2366 bool allocate_ncrn, allocate_nvrn;
2367 HOST_WIDE_INT size;
2369 /* We need to do this once per argument. */
2370 if (pcum->aapcs_arg_processed)
2371 return;
2373 pcum->aapcs_arg_processed = true;
2375 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2376 size
2377 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2378 UNITS_PER_WORD);
2380 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2381 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2382 mode,
2383 type,
2384 &nregs);
2386 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2387 The following code thus handles passing by SIMD/FP registers first. */
2389 nvrn = pcum->aapcs_nvrn;
2391 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2392 and homogenous short-vector aggregates (HVA). */
2393 if (allocate_nvrn)
2395 if (!TARGET_FLOAT)
2396 aarch64_err_no_fpadvsimd (mode, "argument");
2398 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2400 pcum->aapcs_nextnvrn = nvrn + nregs;
2401 if (!aarch64_composite_type_p (type, mode))
2403 gcc_assert (nregs == 1);
2404 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2406 else
2408 rtx par;
2409 int i;
2410 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2411 for (i = 0; i < nregs; i++)
2413 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2414 V0_REGNUM + nvrn + i);
2415 tmp = gen_rtx_EXPR_LIST
2416 (VOIDmode, tmp,
2417 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2418 XVECEXP (par, 0, i) = tmp;
2420 pcum->aapcs_reg = par;
2422 return;
2424 else
2426 /* C.3 NSRN is set to 8. */
2427 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2428 goto on_stack;
2432 ncrn = pcum->aapcs_ncrn;
2433 nregs = size / UNITS_PER_WORD;
2435 /* C6 - C9. though the sign and zero extension semantics are
2436 handled elsewhere. This is the case where the argument fits
2437 entirely general registers. */
2438 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2441 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2443 /* C.8 if the argument has an alignment of 16 then the NGRN is
2444 rounded up to the next even number. */
2445 if (nregs == 2
2446 && ncrn % 2
2447 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2448 comparison is there because for > 16 * BITS_PER_UNIT
2449 alignment nregs should be > 2 and therefore it should be
2450 passed by reference rather than value. */
2451 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2453 ++ncrn;
2454 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2457 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2458 A reg is still generated for it, but the caller should be smart
2459 enough not to use it. */
2460 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2461 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2462 else
2464 rtx par;
2465 int i;
2467 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2468 for (i = 0; i < nregs; i++)
2470 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2471 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2472 GEN_INT (i * UNITS_PER_WORD));
2473 XVECEXP (par, 0, i) = tmp;
2475 pcum->aapcs_reg = par;
2478 pcum->aapcs_nextncrn = ncrn + nregs;
2479 return;
2482 /* C.11 */
2483 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2485 /* The argument is passed on stack; record the needed number of words for
2486 this argument and align the total size if necessary. */
2487 on_stack:
2488 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2490 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2491 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2492 16 / UNITS_PER_WORD);
2493 return;
2496 /* Implement TARGET_FUNCTION_ARG. */
2498 static rtx
2499 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2500 const_tree type, bool named)
2502 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2503 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2505 if (mode == VOIDmode)
2506 return NULL_RTX;
2508 aarch64_layout_arg (pcum_v, mode, type, named);
2509 return pcum->aapcs_reg;
2512 void
2513 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2514 const_tree fntype ATTRIBUTE_UNUSED,
2515 rtx libname ATTRIBUTE_UNUSED,
2516 const_tree fndecl ATTRIBUTE_UNUSED,
2517 unsigned n_named ATTRIBUTE_UNUSED)
2519 pcum->aapcs_ncrn = 0;
2520 pcum->aapcs_nvrn = 0;
2521 pcum->aapcs_nextncrn = 0;
2522 pcum->aapcs_nextnvrn = 0;
2523 pcum->pcs_variant = ARM_PCS_AAPCS64;
2524 pcum->aapcs_reg = NULL_RTX;
2525 pcum->aapcs_arg_processed = false;
2526 pcum->aapcs_stack_words = 0;
2527 pcum->aapcs_stack_size = 0;
2529 if (!TARGET_FLOAT
2530 && fndecl && TREE_PUBLIC (fndecl)
2531 && fntype && fntype != error_mark_node)
2533 const_tree type = TREE_TYPE (fntype);
2534 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2535 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2536 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2537 &mode, &nregs, NULL))
2538 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2540 return;
2543 static void
2544 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2545 machine_mode mode,
2546 const_tree type,
2547 bool named)
2549 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2550 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2552 aarch64_layout_arg (pcum_v, mode, type, named);
2553 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2554 != (pcum->aapcs_stack_words != 0));
2555 pcum->aapcs_arg_processed = false;
2556 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2557 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2558 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2559 pcum->aapcs_stack_words = 0;
2560 pcum->aapcs_reg = NULL_RTX;
2564 bool
2565 aarch64_function_arg_regno_p (unsigned regno)
2567 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2568 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2571 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2572 PARM_BOUNDARY bits of alignment, but will be given anything up
2573 to STACK_BOUNDARY bits if the type requires it. This makes sure
2574 that both before and after the layout of each argument, the Next
2575 Stacked Argument Address (NSAA) will have a minimum alignment of
2576 8 bytes. */
2578 static unsigned int
2579 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2581 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2582 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2585 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2587 Return true if an argument passed on the stack should be padded upwards,
2588 i.e. if the least-significant byte of the stack slot has useful data.
2590 Small aggregate types are placed in the lowest memory address.
2592 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2594 bool
2595 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2597 /* On little-endian targets, the least significant byte of every stack
2598 argument is passed at the lowest byte address of the stack slot. */
2599 if (!BYTES_BIG_ENDIAN)
2600 return true;
2602 /* Otherwise, integral, floating-point and pointer types are padded downward:
2603 the least significant byte of a stack argument is passed at the highest
2604 byte address of the stack slot. */
2605 if (type
2606 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2607 || POINTER_TYPE_P (type))
2608 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2609 return false;
2611 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2612 return true;
2615 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2617 It specifies padding for the last (may also be the only)
2618 element of a block move between registers and memory. If
2619 assuming the block is in the memory, padding upward means that
2620 the last element is padded after its highest significant byte,
2621 while in downward padding, the last element is padded at the
2622 its least significant byte side.
2624 Small aggregates and small complex types are always padded
2625 upwards.
2627 We don't need to worry about homogeneous floating-point or
2628 short-vector aggregates; their move is not affected by the
2629 padding direction determined here. Regardless of endianness,
2630 each element of such an aggregate is put in the least
2631 significant bits of a fp/simd register.
2633 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2634 register has useful data, and return the opposite if the most
2635 significant byte does. */
2637 bool
2638 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2639 bool first ATTRIBUTE_UNUSED)
2642 /* Small composite types are always padded upward. */
2643 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2645 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2646 : GET_MODE_SIZE (mode));
2647 if (size < 2 * UNITS_PER_WORD)
2648 return true;
2651 /* Otherwise, use the default padding. */
2652 return !BYTES_BIG_ENDIAN;
2655 static machine_mode
2656 aarch64_libgcc_cmp_return_mode (void)
2658 return SImode;
2661 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2663 /* We use the 12-bit shifted immediate arithmetic instructions so values
2664 must be multiple of (1 << 12), i.e. 4096. */
2665 #define ARITH_FACTOR 4096
2667 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2668 #error Cannot use simple address calculation for stack probing
2669 #endif
2671 /* The pair of scratch registers used for stack probing. */
2672 #define PROBE_STACK_FIRST_REG 9
2673 #define PROBE_STACK_SECOND_REG 10
2675 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2676 inclusive. These are offsets from the current stack pointer. */
2678 static void
2679 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2681 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2683 /* See the same assertion on PROBE_INTERVAL above. */
2684 gcc_assert ((first % ARITH_FACTOR) == 0);
2686 /* See if we have a constant small number of probes to generate. If so,
2687 that's the easy case. */
2688 if (size <= PROBE_INTERVAL)
2690 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2692 emit_set_insn (reg1,
2693 plus_constant (Pmode,
2694 stack_pointer_rtx, -(first + base)));
2695 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2698 /* The run-time loop is made up of 8 insns in the generic case while the
2699 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2700 else if (size <= 4 * PROBE_INTERVAL)
2702 HOST_WIDE_INT i, rem;
2704 emit_set_insn (reg1,
2705 plus_constant (Pmode,
2706 stack_pointer_rtx,
2707 -(first + PROBE_INTERVAL)));
2708 emit_stack_probe (reg1);
2710 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2711 it exceeds SIZE. If only two probes are needed, this will not
2712 generate any code. Then probe at FIRST + SIZE. */
2713 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2715 emit_set_insn (reg1,
2716 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2717 emit_stack_probe (reg1);
2720 rem = size - (i - PROBE_INTERVAL);
2721 if (rem > 256)
2723 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2725 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2726 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2728 else
2729 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2732 /* Otherwise, do the same as above, but in a loop. Note that we must be
2733 extra careful with variables wrapping around because we might be at
2734 the very top (or the very bottom) of the address space and we have
2735 to be able to handle this case properly; in particular, we use an
2736 equality test for the loop condition. */
2737 else
2739 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2741 /* Step 1: round SIZE to the previous multiple of the interval. */
2743 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2746 /* Step 2: compute initial and final value of the loop counter. */
2748 /* TEST_ADDR = SP + FIRST. */
2749 emit_set_insn (reg1,
2750 plus_constant (Pmode, stack_pointer_rtx, -first));
2752 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2753 HOST_WIDE_INT adjustment = - (first + rounded_size);
2754 if (! aarch64_uimm12_shift (adjustment))
2756 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2757 true, Pmode);
2758 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2760 else
2762 emit_set_insn (reg2,
2763 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2766 /* Step 3: the loop
2770 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2771 probe at TEST_ADDR
2773 while (TEST_ADDR != LAST_ADDR)
2775 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2776 until it is equal to ROUNDED_SIZE. */
2778 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2781 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2782 that SIZE is equal to ROUNDED_SIZE. */
2784 if (size != rounded_size)
2786 HOST_WIDE_INT rem = size - rounded_size;
2788 if (rem > 256)
2790 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2792 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2793 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2795 else
2796 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2800 /* Make sure nothing is scheduled before we are done. */
2801 emit_insn (gen_blockage ());
2804 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2805 absolute addresses. */
2807 const char *
2808 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2810 static int labelno = 0;
2811 char loop_lab[32];
2812 rtx xops[2];
2814 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2816 /* Loop. */
2817 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2819 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2820 xops[0] = reg1;
2821 xops[1] = GEN_INT (PROBE_INTERVAL);
2822 output_asm_insn ("sub\t%0, %0, %1", xops);
2824 /* Probe at TEST_ADDR. */
2825 output_asm_insn ("str\txzr, [%0]", xops);
2827 /* Test if TEST_ADDR == LAST_ADDR. */
2828 xops[1] = reg2;
2829 output_asm_insn ("cmp\t%0, %1", xops);
2831 /* Branch. */
2832 fputs ("\tb.ne\t", asm_out_file);
2833 assemble_name_raw (asm_out_file, loop_lab);
2834 fputc ('\n', asm_out_file);
2836 return "";
2839 static bool
2840 aarch64_frame_pointer_required (void)
2842 /* In aarch64_override_options_after_change
2843 flag_omit_leaf_frame_pointer turns off the frame pointer by
2844 default. Turn it back on now if we've not got a leaf
2845 function. */
2846 if (flag_omit_leaf_frame_pointer
2847 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2848 return true;
2850 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2851 if (crtl->calls_eh_return)
2852 return true;
2854 return false;
2857 /* Mark the registers that need to be saved by the callee and calculate
2858 the size of the callee-saved registers area and frame record (both FP
2859 and LR may be omitted). */
2860 static void
2861 aarch64_layout_frame (void)
2863 HOST_WIDE_INT offset = 0;
2864 int regno, last_fp_reg = INVALID_REGNUM;
2866 if (reload_completed && cfun->machine->frame.laid_out)
2867 return;
2869 #define SLOT_NOT_REQUIRED (-2)
2870 #define SLOT_REQUIRED (-1)
2872 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2873 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2875 /* First mark all the registers that really need to be saved... */
2876 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2877 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2879 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2880 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2882 /* ... that includes the eh data registers (if needed)... */
2883 if (crtl->calls_eh_return)
2884 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2885 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2886 = SLOT_REQUIRED;
2888 /* ... and any callee saved register that dataflow says is live. */
2889 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2890 if (df_regs_ever_live_p (regno)
2891 && (regno == R30_REGNUM
2892 || !call_used_regs[regno]))
2893 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2895 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2896 if (df_regs_ever_live_p (regno)
2897 && !call_used_regs[regno])
2899 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2900 last_fp_reg = regno;
2903 if (frame_pointer_needed)
2905 /* FP and LR are placed in the linkage record. */
2906 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2907 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2908 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2909 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2910 offset += 2 * UNITS_PER_WORD;
2913 /* Now assign stack slots for them. */
2914 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2915 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2917 cfun->machine->frame.reg_offset[regno] = offset;
2918 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2919 cfun->machine->frame.wb_candidate1 = regno;
2920 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2921 cfun->machine->frame.wb_candidate2 = regno;
2922 offset += UNITS_PER_WORD;
2925 HOST_WIDE_INT max_int_offset = offset;
2926 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2927 bool has_align_gap = offset != max_int_offset;
2929 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2930 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2932 /* If there is an alignment gap between integer and fp callee-saves,
2933 allocate the last fp register to it if possible. */
2934 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2936 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2937 break;
2940 cfun->machine->frame.reg_offset[regno] = offset;
2941 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2942 cfun->machine->frame.wb_candidate1 = regno;
2943 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2944 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2945 cfun->machine->frame.wb_candidate2 = regno;
2946 offset += UNITS_PER_WORD;
2949 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2951 cfun->machine->frame.saved_regs_size = offset;
2953 HOST_WIDE_INT varargs_and_saved_regs_size
2954 = offset + cfun->machine->frame.saved_varargs_size;
2956 cfun->machine->frame.hard_fp_offset
2957 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2958 STACK_BOUNDARY / BITS_PER_UNIT);
2960 cfun->machine->frame.frame_size
2961 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2962 + crtl->outgoing_args_size,
2963 STACK_BOUNDARY / BITS_PER_UNIT);
2965 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2967 cfun->machine->frame.initial_adjust = 0;
2968 cfun->machine->frame.final_adjust = 0;
2969 cfun->machine->frame.callee_adjust = 0;
2970 cfun->machine->frame.callee_offset = 0;
2972 HOST_WIDE_INT max_push_offset = 0;
2973 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2974 max_push_offset = 512;
2975 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2976 max_push_offset = 256;
2978 if (cfun->machine->frame.frame_size < max_push_offset
2979 && crtl->outgoing_args_size == 0)
2981 /* Simple, small frame with no outgoing arguments:
2982 stp reg1, reg2, [sp, -frame_size]!
2983 stp reg3, reg4, [sp, 16] */
2984 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2986 else if ((crtl->outgoing_args_size
2987 + cfun->machine->frame.saved_regs_size < 512)
2988 && !(cfun->calls_alloca
2989 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2991 /* Frame with small outgoing arguments:
2992 sub sp, sp, frame_size
2993 stp reg1, reg2, [sp, outgoing_args_size]
2994 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2995 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2996 cfun->machine->frame.callee_offset
2997 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2999 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3001 /* Frame with large outgoing arguments but a small local area:
3002 stp reg1, reg2, [sp, -hard_fp_offset]!
3003 stp reg3, reg4, [sp, 16]
3004 sub sp, sp, outgoing_args_size */
3005 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3006 cfun->machine->frame.final_adjust
3007 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3009 else if (!frame_pointer_needed
3010 && varargs_and_saved_regs_size < max_push_offset)
3012 /* Frame with large local area and outgoing arguments (this pushes the
3013 callee-saves first, followed by the locals and outgoing area):
3014 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3015 stp reg3, reg4, [sp, 16]
3016 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3017 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3018 cfun->machine->frame.final_adjust
3019 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3020 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3021 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3023 else
3025 /* Frame with large local area and outgoing arguments using frame pointer:
3026 sub sp, sp, hard_fp_offset
3027 stp x29, x30, [sp, 0]
3028 add x29, sp, 0
3029 stp reg3, reg4, [sp, 16]
3030 sub sp, sp, outgoing_args_size */
3031 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3032 cfun->machine->frame.final_adjust
3033 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3036 cfun->machine->frame.laid_out = true;
3039 /* Return true if the register REGNO is saved on entry to
3040 the current function. */
3042 static bool
3043 aarch64_register_saved_on_entry (int regno)
3045 return cfun->machine->frame.reg_offset[regno] >= 0;
3048 /* Return the next register up from REGNO up to LIMIT for the callee
3049 to save. */
3051 static unsigned
3052 aarch64_next_callee_save (unsigned regno, unsigned limit)
3054 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3055 regno ++;
3056 return regno;
3059 /* Push the register number REGNO of mode MODE to the stack with write-back
3060 adjusting the stack by ADJUSTMENT. */
3062 static void
3063 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3064 HOST_WIDE_INT adjustment)
3066 rtx base_rtx = stack_pointer_rtx;
3067 rtx insn, reg, mem;
3069 reg = gen_rtx_REG (mode, regno);
3070 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3071 plus_constant (Pmode, base_rtx, -adjustment));
3072 mem = gen_rtx_MEM (mode, mem);
3074 insn = emit_move_insn (mem, reg);
3075 RTX_FRAME_RELATED_P (insn) = 1;
3078 /* Generate and return an instruction to store the pair of registers
3079 REG and REG2 of mode MODE to location BASE with write-back adjusting
3080 the stack location BASE by ADJUSTMENT. */
3082 static rtx
3083 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3084 HOST_WIDE_INT adjustment)
3086 switch (mode)
3088 case DImode:
3089 return gen_storewb_pairdi_di (base, base, reg, reg2,
3090 GEN_INT (-adjustment),
3091 GEN_INT (UNITS_PER_WORD - adjustment));
3092 case DFmode:
3093 return gen_storewb_pairdf_di (base, base, reg, reg2,
3094 GEN_INT (-adjustment),
3095 GEN_INT (UNITS_PER_WORD - adjustment));
3096 default:
3097 gcc_unreachable ();
3101 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3102 stack pointer by ADJUSTMENT. */
3104 static void
3105 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3107 rtx_insn *insn;
3108 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3110 if (regno2 == INVALID_REGNUM)
3111 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3113 rtx reg1 = gen_rtx_REG (mode, regno1);
3114 rtx reg2 = gen_rtx_REG (mode, regno2);
3116 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3117 reg2, adjustment));
3118 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3119 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3120 RTX_FRAME_RELATED_P (insn) = 1;
3123 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3124 adjusting it by ADJUSTMENT afterwards. */
3126 static rtx
3127 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3128 HOST_WIDE_INT adjustment)
3130 switch (mode)
3132 case DImode:
3133 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3134 GEN_INT (UNITS_PER_WORD));
3135 case DFmode:
3136 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3137 GEN_INT (UNITS_PER_WORD));
3138 default:
3139 gcc_unreachable ();
3143 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3144 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3145 into CFI_OPS. */
3147 static void
3148 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3149 rtx *cfi_ops)
3151 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3152 rtx reg1 = gen_rtx_REG (mode, regno1);
3154 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3156 if (regno2 == INVALID_REGNUM)
3158 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3159 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3160 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3162 else
3164 rtx reg2 = gen_rtx_REG (mode, regno2);
3165 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3166 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3167 reg2, adjustment));
3171 /* Generate and return a store pair instruction of mode MODE to store
3172 register REG1 to MEM1 and register REG2 to MEM2. */
3174 static rtx
3175 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3176 rtx reg2)
3178 switch (mode)
3180 case DImode:
3181 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3183 case DFmode:
3184 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3186 default:
3187 gcc_unreachable ();
3191 /* Generate and regurn a load pair isntruction of mode MODE to load register
3192 REG1 from MEM1 and register REG2 from MEM2. */
3194 static rtx
3195 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3196 rtx mem2)
3198 switch (mode)
3200 case DImode:
3201 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3203 case DFmode:
3204 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3206 default:
3207 gcc_unreachable ();
3211 /* Return TRUE if return address signing should be enabled for the current
3212 function, otherwise return FALSE. */
3214 bool
3215 aarch64_return_address_signing_enabled (void)
3217 /* This function should only be called after frame laid out. */
3218 gcc_assert (cfun->machine->frame.laid_out);
3220 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3221 if it's LR is pushed onto stack. */
3222 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3223 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3224 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3227 /* Emit code to save the callee-saved registers from register number START
3228 to LIMIT to the stack at the location starting at offset START_OFFSET,
3229 skipping any write-back candidates if SKIP_WB is true. */
3231 static void
3232 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3233 unsigned start, unsigned limit, bool skip_wb)
3235 rtx_insn *insn;
3236 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3237 ? gen_frame_mem : gen_rtx_MEM);
3238 unsigned regno;
3239 unsigned regno2;
3241 for (regno = aarch64_next_callee_save (start, limit);
3242 regno <= limit;
3243 regno = aarch64_next_callee_save (regno + 1, limit))
3245 rtx reg, mem;
3246 HOST_WIDE_INT offset;
3248 if (skip_wb
3249 && (regno == cfun->machine->frame.wb_candidate1
3250 || regno == cfun->machine->frame.wb_candidate2))
3251 continue;
3253 if (cfun->machine->reg_is_wrapped_separately[regno])
3254 continue;
3256 reg = gen_rtx_REG (mode, regno);
3257 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3258 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3259 offset));
3261 regno2 = aarch64_next_callee_save (regno + 1, limit);
3263 if (regno2 <= limit
3264 && !cfun->machine->reg_is_wrapped_separately[regno2]
3265 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3266 == cfun->machine->frame.reg_offset[regno2]))
3269 rtx reg2 = gen_rtx_REG (mode, regno2);
3270 rtx mem2;
3272 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3273 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3274 offset));
3275 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3276 reg2));
3278 /* The first part of a frame-related parallel insn is
3279 always assumed to be relevant to the frame
3280 calculations; subsequent parts, are only
3281 frame-related if explicitly marked. */
3282 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3283 regno = regno2;
3285 else
3286 insn = emit_move_insn (mem, reg);
3288 RTX_FRAME_RELATED_P (insn) = 1;
3292 /* Emit code to restore the callee registers of mode MODE from register
3293 number START up to and including LIMIT. Restore from the stack offset
3294 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3295 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3297 static void
3298 aarch64_restore_callee_saves (machine_mode mode,
3299 HOST_WIDE_INT start_offset, unsigned start,
3300 unsigned limit, bool skip_wb, rtx *cfi_ops)
3302 rtx base_rtx = stack_pointer_rtx;
3303 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3304 ? gen_frame_mem : gen_rtx_MEM);
3305 unsigned regno;
3306 unsigned regno2;
3307 HOST_WIDE_INT offset;
3309 for (regno = aarch64_next_callee_save (start, limit);
3310 regno <= limit;
3311 regno = aarch64_next_callee_save (regno + 1, limit))
3313 if (cfun->machine->reg_is_wrapped_separately[regno])
3314 continue;
3316 rtx reg, mem;
3318 if (skip_wb
3319 && (regno == cfun->machine->frame.wb_candidate1
3320 || regno == cfun->machine->frame.wb_candidate2))
3321 continue;
3323 reg = gen_rtx_REG (mode, regno);
3324 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3325 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3327 regno2 = aarch64_next_callee_save (regno + 1, limit);
3329 if (regno2 <= limit
3330 && !cfun->machine->reg_is_wrapped_separately[regno2]
3331 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3332 == cfun->machine->frame.reg_offset[regno2]))
3334 rtx reg2 = gen_rtx_REG (mode, regno2);
3335 rtx mem2;
3337 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3338 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3339 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3341 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3342 regno = regno2;
3344 else
3345 emit_move_insn (reg, mem);
3346 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3350 static inline bool
3351 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3352 HOST_WIDE_INT offset)
3354 return offset >= -256 && offset < 256;
3357 static inline bool
3358 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3360 return (offset >= 0
3361 && offset < 4096 * GET_MODE_SIZE (mode)
3362 && offset % GET_MODE_SIZE (mode) == 0);
3365 bool
3366 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3368 return (offset >= -64 * GET_MODE_SIZE (mode)
3369 && offset < 64 * GET_MODE_SIZE (mode)
3370 && offset % GET_MODE_SIZE (mode) == 0);
3373 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3375 static sbitmap
3376 aarch64_get_separate_components (void)
3378 aarch64_layout_frame ();
3380 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3381 bitmap_clear (components);
3383 /* The registers we need saved to the frame. */
3384 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3385 if (aarch64_register_saved_on_entry (regno))
3387 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3388 if (!frame_pointer_needed)
3389 offset += cfun->machine->frame.frame_size
3390 - cfun->machine->frame.hard_fp_offset;
3391 /* Check that we can access the stack slot of the register with one
3392 direct load with no adjustments needed. */
3393 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3394 bitmap_set_bit (components, regno);
3397 /* Don't mess with the hard frame pointer. */
3398 if (frame_pointer_needed)
3399 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3401 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3402 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3403 /* If aarch64_layout_frame has chosen registers to store/restore with
3404 writeback don't interfere with them to avoid having to output explicit
3405 stack adjustment instructions. */
3406 if (reg2 != INVALID_REGNUM)
3407 bitmap_clear_bit (components, reg2);
3408 if (reg1 != INVALID_REGNUM)
3409 bitmap_clear_bit (components, reg1);
3411 bitmap_clear_bit (components, LR_REGNUM);
3412 bitmap_clear_bit (components, SP_REGNUM);
3414 return components;
3417 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3419 static sbitmap
3420 aarch64_components_for_bb (basic_block bb)
3422 bitmap in = DF_LIVE_IN (bb);
3423 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3424 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3426 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3427 bitmap_clear (components);
3429 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3430 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3431 if ((!call_used_regs[regno])
3432 && (bitmap_bit_p (in, regno)
3433 || bitmap_bit_p (gen, regno)
3434 || bitmap_bit_p (kill, regno)))
3435 bitmap_set_bit (components, regno);
3437 return components;
3440 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3441 Nothing to do for aarch64. */
3443 static void
3444 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3448 /* Return the next set bit in BMP from START onwards. Return the total number
3449 of bits in BMP if no set bit is found at or after START. */
3451 static unsigned int
3452 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3454 unsigned int nbits = SBITMAP_SIZE (bmp);
3455 if (start == nbits)
3456 return start;
3458 gcc_assert (start < nbits);
3459 for (unsigned int i = start; i < nbits; i++)
3460 if (bitmap_bit_p (bmp, i))
3461 return i;
3463 return nbits;
3466 /* Do the work for aarch64_emit_prologue_components and
3467 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3468 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3469 for these components or the epilogue sequence. That is, it determines
3470 whether we should emit stores or loads and what kind of CFA notes to attach
3471 to the insns. Otherwise the logic for the two sequences is very
3472 similar. */
3474 static void
3475 aarch64_process_components (sbitmap components, bool prologue_p)
3477 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3478 ? HARD_FRAME_POINTER_REGNUM
3479 : STACK_POINTER_REGNUM);
3481 unsigned last_regno = SBITMAP_SIZE (components);
3482 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3483 rtx_insn *insn = NULL;
3485 while (regno != last_regno)
3487 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3488 so DFmode for the vector registers is enough. */
3489 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3490 rtx reg = gen_rtx_REG (mode, regno);
3491 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3492 if (!frame_pointer_needed)
3493 offset += cfun->machine->frame.frame_size
3494 - cfun->machine->frame.hard_fp_offset;
3495 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3496 rtx mem = gen_frame_mem (mode, addr);
3498 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3499 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3500 /* No more registers to handle after REGNO.
3501 Emit a single save/restore and exit. */
3502 if (regno2 == last_regno)
3504 insn = emit_insn (set);
3505 RTX_FRAME_RELATED_P (insn) = 1;
3506 if (prologue_p)
3507 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3508 else
3509 add_reg_note (insn, REG_CFA_RESTORE, reg);
3510 break;
3513 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3514 /* The next register is not of the same class or its offset is not
3515 mergeable with the current one into a pair. */
3516 if (!satisfies_constraint_Ump (mem)
3517 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3518 || (offset2 - cfun->machine->frame.reg_offset[regno])
3519 != GET_MODE_SIZE (mode))
3521 insn = emit_insn (set);
3522 RTX_FRAME_RELATED_P (insn) = 1;
3523 if (prologue_p)
3524 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3525 else
3526 add_reg_note (insn, REG_CFA_RESTORE, reg);
3528 regno = regno2;
3529 continue;
3532 /* REGNO2 can be saved/restored in a pair with REGNO. */
3533 rtx reg2 = gen_rtx_REG (mode, regno2);
3534 if (!frame_pointer_needed)
3535 offset2 += cfun->machine->frame.frame_size
3536 - cfun->machine->frame.hard_fp_offset;
3537 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3538 rtx mem2 = gen_frame_mem (mode, addr2);
3539 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3540 : gen_rtx_SET (reg2, mem2);
3542 if (prologue_p)
3543 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3544 else
3545 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3547 RTX_FRAME_RELATED_P (insn) = 1;
3548 if (prologue_p)
3550 add_reg_note (insn, REG_CFA_OFFSET, set);
3551 add_reg_note (insn, REG_CFA_OFFSET, set2);
3553 else
3555 add_reg_note (insn, REG_CFA_RESTORE, reg);
3556 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3559 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3563 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3565 static void
3566 aarch64_emit_prologue_components (sbitmap components)
3568 aarch64_process_components (components, true);
3571 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3573 static void
3574 aarch64_emit_epilogue_components (sbitmap components)
3576 aarch64_process_components (components, false);
3579 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3581 static void
3582 aarch64_set_handled_components (sbitmap components)
3584 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3585 if (bitmap_bit_p (components, regno))
3586 cfun->machine->reg_is_wrapped_separately[regno] = true;
3589 /* AArch64 stack frames generated by this compiler look like:
3591 +-------------------------------+
3593 | incoming stack arguments |
3595 +-------------------------------+
3596 | | <-- incoming stack pointer (aligned)
3597 | callee-allocated save area |
3598 | for register varargs |
3600 +-------------------------------+
3601 | local variables | <-- frame_pointer_rtx
3603 +-------------------------------+
3604 | padding0 | \
3605 +-------------------------------+ |
3606 | callee-saved registers | | frame.saved_regs_size
3607 +-------------------------------+ |
3608 | LR' | |
3609 +-------------------------------+ |
3610 | FP' | / <- hard_frame_pointer_rtx (aligned)
3611 +-------------------------------+
3612 | dynamic allocation |
3613 +-------------------------------+
3614 | padding |
3615 +-------------------------------+
3616 | outgoing stack arguments | <-- arg_pointer
3618 +-------------------------------+
3619 | | <-- stack_pointer_rtx (aligned)
3621 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3622 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3623 unchanged. */
3625 /* Generate the prologue instructions for entry into a function.
3626 Establish the stack frame by decreasing the stack pointer with a
3627 properly calculated size and, if necessary, create a frame record
3628 filled with the values of LR and previous frame pointer. The
3629 current FP is also set up if it is in use. */
3631 void
3632 aarch64_expand_prologue (void)
3634 aarch64_layout_frame ();
3636 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3637 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3638 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3639 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3640 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3641 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3642 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3643 rtx_insn *insn;
3645 /* Sign return address for functions. */
3646 if (aarch64_return_address_signing_enabled ())
3648 insn = emit_insn (gen_pacisp ());
3649 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3650 RTX_FRAME_RELATED_P (insn) = 1;
3653 if (flag_stack_usage_info)
3654 current_function_static_stack_size = frame_size;
3656 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3658 if (crtl->is_leaf && !cfun->calls_alloca)
3660 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3661 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3662 frame_size - STACK_CHECK_PROTECT);
3664 else if (frame_size > 0)
3665 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3668 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3670 if (callee_adjust != 0)
3671 aarch64_push_regs (reg1, reg2, callee_adjust);
3673 if (frame_pointer_needed)
3675 if (callee_adjust == 0)
3676 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3677 R30_REGNUM, false);
3678 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3679 stack_pointer_rtx,
3680 GEN_INT (callee_offset)));
3681 RTX_FRAME_RELATED_P (insn) = 1;
3682 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3685 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3686 callee_adjust != 0 || frame_pointer_needed);
3687 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3688 callee_adjust != 0 || frame_pointer_needed);
3689 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3692 /* Return TRUE if we can use a simple_return insn.
3694 This function checks whether the callee saved stack is empty, which
3695 means no restore actions are need. The pro_and_epilogue will use
3696 this to check whether shrink-wrapping opt is feasible. */
3698 bool
3699 aarch64_use_return_insn_p (void)
3701 if (!reload_completed)
3702 return false;
3704 if (crtl->profile)
3705 return false;
3707 aarch64_layout_frame ();
3709 return cfun->machine->frame.frame_size == 0;
3712 /* Generate the epilogue instructions for returning from a function.
3713 This is almost exactly the reverse of the prolog sequence, except
3714 that we need to insert barriers to avoid scheduling loads that read
3715 from a deallocated stack, and we optimize the unwind records by
3716 emitting them all together if possible. */
3717 void
3718 aarch64_expand_epilogue (bool for_sibcall)
3720 aarch64_layout_frame ();
3722 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3723 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3724 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3725 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3726 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3727 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3728 rtx cfi_ops = NULL;
3729 rtx_insn *insn;
3731 /* We need to add memory barrier to prevent read from deallocated stack. */
3732 bool need_barrier_p = (get_frame_size ()
3733 + cfun->machine->frame.saved_varargs_size) != 0;
3735 /* Emit a barrier to prevent loads from a deallocated stack. */
3736 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3737 || crtl->calls_eh_return)
3739 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3740 need_barrier_p = false;
3743 /* Restore the stack pointer from the frame pointer if it may not
3744 be the same as the stack pointer. */
3745 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3747 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3748 hard_frame_pointer_rtx,
3749 GEN_INT (-callee_offset)));
3750 /* If writeback is used when restoring callee-saves, the CFA
3751 is restored on the instruction doing the writeback. */
3752 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3754 else
3755 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3757 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3758 callee_adjust != 0, &cfi_ops);
3759 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3760 callee_adjust != 0, &cfi_ops);
3762 if (need_barrier_p)
3763 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3765 if (callee_adjust != 0)
3766 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3768 if (callee_adjust != 0 || initial_adjust > 65536)
3770 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3771 insn = get_last_insn ();
3772 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3773 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3774 RTX_FRAME_RELATED_P (insn) = 1;
3775 cfi_ops = NULL;
3778 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3780 if (cfi_ops)
3782 /* Emit delayed restores and reset the CFA to be SP. */
3783 insn = get_last_insn ();
3784 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3785 REG_NOTES (insn) = cfi_ops;
3786 RTX_FRAME_RELATED_P (insn) = 1;
3789 /* We prefer to emit the combined return/authenticate instruction RETAA,
3790 however there are three cases in which we must instead emit an explicit
3791 authentication instruction.
3793 1) Sibcalls don't return in a normal way, so if we're about to call one
3794 we must authenticate.
3796 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3797 generating code for !TARGET_ARMV8_3 we can't use it and must
3798 explicitly authenticate.
3800 3) On an eh_return path we make extra stack adjustments to update the
3801 canonical frame address to be the exception handler's CFA. We want
3802 to authenticate using the CFA of the function which calls eh_return.
3804 if (aarch64_return_address_signing_enabled ()
3805 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3807 insn = emit_insn (gen_autisp ());
3808 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3809 RTX_FRAME_RELATED_P (insn) = 1;
3812 /* Stack adjustment for exception handler. */
3813 if (crtl->calls_eh_return)
3815 /* We need to unwind the stack by the offset computed by
3816 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3817 to be SP; letting the CFA move during this adjustment
3818 is just as correct as retaining the CFA from the body
3819 of the function. Therefore, do nothing special. */
3820 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3823 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3824 if (!for_sibcall)
3825 emit_jump_insn (ret_rtx);
3828 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3829 normally or return to a previous frame after unwinding.
3831 An EH return uses a single shared return sequence. The epilogue is
3832 exactly like a normal epilogue except that it has an extra input
3833 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3834 that must be applied after the frame has been destroyed. An extra label
3835 is inserted before the epilogue which initializes this register to zero,
3836 and this is the entry point for a normal return.
3838 An actual EH return updates the return address, initializes the stack
3839 adjustment and jumps directly into the epilogue (bypassing the zeroing
3840 of the adjustment). Since the return address is typically saved on the
3841 stack when a function makes a call, the saved LR must be updated outside
3842 the epilogue.
3844 This poses problems as the store is generated well before the epilogue,
3845 so the offset of LR is not known yet. Also optimizations will remove the
3846 store as it appears dead, even after the epilogue is generated (as the
3847 base or offset for loading LR is different in many cases).
3849 To avoid these problems this implementation forces the frame pointer
3850 in eh_return functions so that the location of LR is fixed and known early.
3851 It also marks the store volatile, so no optimization is permitted to
3852 remove the store. */
3854 aarch64_eh_return_handler_rtx (void)
3856 rtx tmp = gen_frame_mem (Pmode,
3857 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3859 /* Mark the store volatile, so no optimization is permitted to remove it. */
3860 MEM_VOLATILE_P (tmp) = true;
3861 return tmp;
3864 /* Output code to add DELTA to the first argument, and then jump
3865 to FUNCTION. Used for C++ multiple inheritance. */
3866 static void
3867 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3868 HOST_WIDE_INT delta,
3869 HOST_WIDE_INT vcall_offset,
3870 tree function)
3872 /* The this pointer is always in x0. Note that this differs from
3873 Arm where the this pointer maybe bumped to r1 if r0 is required
3874 to return a pointer to an aggregate. On AArch64 a result value
3875 pointer will be in x8. */
3876 int this_regno = R0_REGNUM;
3877 rtx this_rtx, temp0, temp1, addr, funexp;
3878 rtx_insn *insn;
3880 reload_completed = 1;
3881 emit_note (NOTE_INSN_PROLOGUE_END);
3883 if (vcall_offset == 0)
3884 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3885 else
3887 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3889 this_rtx = gen_rtx_REG (Pmode, this_regno);
3890 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3891 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3893 addr = this_rtx;
3894 if (delta != 0)
3896 if (delta >= -256 && delta < 256)
3897 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3898 plus_constant (Pmode, this_rtx, delta));
3899 else
3900 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3903 if (Pmode == ptr_mode)
3904 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3905 else
3906 aarch64_emit_move (temp0,
3907 gen_rtx_ZERO_EXTEND (Pmode,
3908 gen_rtx_MEM (ptr_mode, addr)));
3910 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3911 addr = plus_constant (Pmode, temp0, vcall_offset);
3912 else
3914 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3915 Pmode);
3916 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3919 if (Pmode == ptr_mode)
3920 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3921 else
3922 aarch64_emit_move (temp1,
3923 gen_rtx_SIGN_EXTEND (Pmode,
3924 gen_rtx_MEM (ptr_mode, addr)));
3926 emit_insn (gen_add2_insn (this_rtx, temp1));
3929 /* Generate a tail call to the target function. */
3930 if (!TREE_USED (function))
3932 assemble_external (function);
3933 TREE_USED (function) = 1;
3935 funexp = XEXP (DECL_RTL (function), 0);
3936 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3937 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3938 SIBLING_CALL_P (insn) = 1;
3940 insn = get_insns ();
3941 shorten_branches (insn);
3942 final_start_function (insn, file, 1);
3943 final (insn, file, 1);
3944 final_end_function ();
3946 /* Stop pretending to be a post-reload pass. */
3947 reload_completed = 0;
3950 static bool
3951 aarch64_tls_referenced_p (rtx x)
3953 if (!TARGET_HAVE_TLS)
3954 return false;
3955 subrtx_iterator::array_type array;
3956 FOR_EACH_SUBRTX (iter, array, x, ALL)
3958 const_rtx x = *iter;
3959 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3960 return true;
3961 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3962 TLS offsets, not real symbol references. */
3963 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3964 iter.skip_subrtxes ();
3966 return false;
3970 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3971 a left shift of 0 or 12 bits. */
3972 bool
3973 aarch64_uimm12_shift (HOST_WIDE_INT val)
3975 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3976 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3981 /* Return true if val is an immediate that can be loaded into a
3982 register by a MOVZ instruction. */
3983 static bool
3984 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3986 if (GET_MODE_SIZE (mode) > 4)
3988 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3989 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3990 return 1;
3992 else
3994 /* Ignore sign extension. */
3995 val &= (HOST_WIDE_INT) 0xffffffff;
3997 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3998 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4001 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4003 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4005 0x0000000100000001ull,
4006 0x0001000100010001ull,
4007 0x0101010101010101ull,
4008 0x1111111111111111ull,
4009 0x5555555555555555ull,
4013 /* Return true if val is a valid bitmask immediate. */
4015 bool
4016 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4018 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4019 int bits;
4021 /* Check for a single sequence of one bits and return quickly if so.
4022 The special cases of all ones and all zeroes returns false. */
4023 val = (unsigned HOST_WIDE_INT) val_in;
4024 tmp = val + (val & -val);
4026 if (tmp == (tmp & -tmp))
4027 return (val + 1) > 1;
4029 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4030 if (mode == SImode)
4031 val = (val << 32) | (val & 0xffffffff);
4033 /* Invert if the immediate doesn't start with a zero bit - this means we
4034 only need to search for sequences of one bits. */
4035 if (val & 1)
4036 val = ~val;
4038 /* Find the first set bit and set tmp to val with the first sequence of one
4039 bits removed. Return success if there is a single sequence of ones. */
4040 first_one = val & -val;
4041 tmp = val & (val + first_one);
4043 if (tmp == 0)
4044 return true;
4046 /* Find the next set bit and compute the difference in bit position. */
4047 next_one = tmp & -tmp;
4048 bits = clz_hwi (first_one) - clz_hwi (next_one);
4049 mask = val ^ tmp;
4051 /* Check the bit position difference is a power of 2, and that the first
4052 sequence of one bits fits within 'bits' bits. */
4053 if ((mask >> bits) != 0 || bits != (bits & -bits))
4054 return false;
4056 /* Check the sequence of one bits is repeated 64/bits times. */
4057 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4060 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4061 Assumed precondition: VAL_IN Is not zero. */
4063 unsigned HOST_WIDE_INT
4064 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4066 int lowest_bit_set = ctz_hwi (val_in);
4067 int highest_bit_set = floor_log2 (val_in);
4068 gcc_assert (val_in != 0);
4070 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4071 (HOST_WIDE_INT_1U << lowest_bit_set));
4074 /* Create constant where bits outside of lowest bit set to highest bit set
4075 are set to 1. */
4077 unsigned HOST_WIDE_INT
4078 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4080 return val_in | ~aarch64_and_split_imm1 (val_in);
4083 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4085 bool
4086 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4088 if (aarch64_bitmask_imm (val_in, mode))
4089 return false;
4091 if (aarch64_move_imm (val_in, mode))
4092 return false;
4094 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4096 return aarch64_bitmask_imm (imm2, mode);
4099 /* Return true if val is an immediate that can be loaded into a
4100 register in a single instruction. */
4101 bool
4102 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4104 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4105 return 1;
4106 return aarch64_bitmask_imm (val, mode);
4109 static bool
4110 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4112 rtx base, offset;
4114 if (GET_CODE (x) == HIGH)
4115 return true;
4117 split_const (x, &base, &offset);
4118 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4120 if (aarch64_classify_symbol (base, offset)
4121 != SYMBOL_FORCE_TO_MEM)
4122 return true;
4123 else
4124 /* Avoid generating a 64-bit relocation in ILP32; leave
4125 to aarch64_expand_mov_immediate to handle it properly. */
4126 return mode != ptr_mode;
4129 return aarch64_tls_referenced_p (x);
4132 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4133 The expansion for a table switch is quite expensive due to the number
4134 of instructions, the table lookup and hard to predict indirect jump.
4135 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4136 set, otherwise use tables for > 16 cases as a tradeoff between size and
4137 performance. When optimizing for size, use the default setting. */
4139 static unsigned int
4140 aarch64_case_values_threshold (void)
4142 /* Use the specified limit for the number of cases before using jump
4143 tables at higher optimization levels. */
4144 if (optimize > 2
4145 && selected_cpu->tune->max_case_values != 0)
4146 return selected_cpu->tune->max_case_values;
4147 else
4148 return optimize_size ? default_case_values_threshold () : 17;
4151 /* Return true if register REGNO is a valid index register.
4152 STRICT_P is true if REG_OK_STRICT is in effect. */
4154 bool
4155 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4157 if (!HARD_REGISTER_NUM_P (regno))
4159 if (!strict_p)
4160 return true;
4162 if (!reg_renumber)
4163 return false;
4165 regno = reg_renumber[regno];
4167 return GP_REGNUM_P (regno);
4170 /* Return true if register REGNO is a valid base register for mode MODE.
4171 STRICT_P is true if REG_OK_STRICT is in effect. */
4173 bool
4174 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4176 if (!HARD_REGISTER_NUM_P (regno))
4178 if (!strict_p)
4179 return true;
4181 if (!reg_renumber)
4182 return false;
4184 regno = reg_renumber[regno];
4187 /* The fake registers will be eliminated to either the stack or
4188 hard frame pointer, both of which are usually valid base registers.
4189 Reload deals with the cases where the eliminated form isn't valid. */
4190 return (GP_REGNUM_P (regno)
4191 || regno == SP_REGNUM
4192 || regno == FRAME_POINTER_REGNUM
4193 || regno == ARG_POINTER_REGNUM);
4196 /* Return true if X is a valid base register for mode MODE.
4197 STRICT_P is true if REG_OK_STRICT is in effect. */
4199 static bool
4200 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4202 if (!strict_p && GET_CODE (x) == SUBREG)
4203 x = SUBREG_REG (x);
4205 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4208 /* Return true if address offset is a valid index. If it is, fill in INFO
4209 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4211 static bool
4212 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4213 machine_mode mode, bool strict_p)
4215 enum aarch64_address_type type;
4216 rtx index;
4217 int shift;
4219 /* (reg:P) */
4220 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4221 && GET_MODE (x) == Pmode)
4223 type = ADDRESS_REG_REG;
4224 index = x;
4225 shift = 0;
4227 /* (sign_extend:DI (reg:SI)) */
4228 else if ((GET_CODE (x) == SIGN_EXTEND
4229 || GET_CODE (x) == ZERO_EXTEND)
4230 && GET_MODE (x) == DImode
4231 && GET_MODE (XEXP (x, 0)) == SImode)
4233 type = (GET_CODE (x) == SIGN_EXTEND)
4234 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4235 index = XEXP (x, 0);
4236 shift = 0;
4238 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4239 else if (GET_CODE (x) == MULT
4240 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4241 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4242 && GET_MODE (XEXP (x, 0)) == DImode
4243 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4244 && CONST_INT_P (XEXP (x, 1)))
4246 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4247 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4248 index = XEXP (XEXP (x, 0), 0);
4249 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4251 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4252 else if (GET_CODE (x) == ASHIFT
4253 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4254 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4255 && GET_MODE (XEXP (x, 0)) == DImode
4256 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4257 && CONST_INT_P (XEXP (x, 1)))
4259 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4260 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4261 index = XEXP (XEXP (x, 0), 0);
4262 shift = INTVAL (XEXP (x, 1));
4264 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4265 else if ((GET_CODE (x) == SIGN_EXTRACT
4266 || GET_CODE (x) == ZERO_EXTRACT)
4267 && GET_MODE (x) == DImode
4268 && GET_CODE (XEXP (x, 0)) == MULT
4269 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4270 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4272 type = (GET_CODE (x) == SIGN_EXTRACT)
4273 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4274 index = XEXP (XEXP (x, 0), 0);
4275 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4276 if (INTVAL (XEXP (x, 1)) != 32 + shift
4277 || INTVAL (XEXP (x, 2)) != 0)
4278 shift = -1;
4280 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4281 (const_int 0xffffffff<<shift)) */
4282 else if (GET_CODE (x) == AND
4283 && GET_MODE (x) == DImode
4284 && GET_CODE (XEXP (x, 0)) == MULT
4285 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4286 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4287 && CONST_INT_P (XEXP (x, 1)))
4289 type = ADDRESS_REG_UXTW;
4290 index = XEXP (XEXP (x, 0), 0);
4291 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4292 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4293 shift = -1;
4295 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4296 else if ((GET_CODE (x) == SIGN_EXTRACT
4297 || GET_CODE (x) == ZERO_EXTRACT)
4298 && GET_MODE (x) == DImode
4299 && GET_CODE (XEXP (x, 0)) == ASHIFT
4300 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4301 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4303 type = (GET_CODE (x) == SIGN_EXTRACT)
4304 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4305 index = XEXP (XEXP (x, 0), 0);
4306 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4307 if (INTVAL (XEXP (x, 1)) != 32 + shift
4308 || INTVAL (XEXP (x, 2)) != 0)
4309 shift = -1;
4311 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4312 (const_int 0xffffffff<<shift)) */
4313 else if (GET_CODE (x) == AND
4314 && GET_MODE (x) == DImode
4315 && GET_CODE (XEXP (x, 0)) == ASHIFT
4316 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4317 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4318 && CONST_INT_P (XEXP (x, 1)))
4320 type = ADDRESS_REG_UXTW;
4321 index = XEXP (XEXP (x, 0), 0);
4322 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4323 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4324 shift = -1;
4326 /* (mult:P (reg:P) (const_int scale)) */
4327 else if (GET_CODE (x) == MULT
4328 && GET_MODE (x) == Pmode
4329 && GET_MODE (XEXP (x, 0)) == Pmode
4330 && CONST_INT_P (XEXP (x, 1)))
4332 type = ADDRESS_REG_REG;
4333 index = XEXP (x, 0);
4334 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4336 /* (ashift:P (reg:P) (const_int shift)) */
4337 else if (GET_CODE (x) == ASHIFT
4338 && GET_MODE (x) == Pmode
4339 && GET_MODE (XEXP (x, 0)) == Pmode
4340 && CONST_INT_P (XEXP (x, 1)))
4342 type = ADDRESS_REG_REG;
4343 index = XEXP (x, 0);
4344 shift = INTVAL (XEXP (x, 1));
4346 else
4347 return false;
4349 if (GET_CODE (index) == SUBREG)
4350 index = SUBREG_REG (index);
4352 if ((shift == 0 ||
4353 (shift > 0 && shift <= 3
4354 && (1 << shift) == GET_MODE_SIZE (mode)))
4355 && REG_P (index)
4356 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4358 info->type = type;
4359 info->offset = index;
4360 info->shift = shift;
4361 return true;
4364 return false;
4367 /* Return true if MODE is one of the modes for which we
4368 support LDP/STP operations. */
4370 static bool
4371 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4373 return mode == SImode || mode == DImode
4374 || mode == SFmode || mode == DFmode
4375 || (aarch64_vector_mode_supported_p (mode)
4376 && GET_MODE_SIZE (mode) == 8);
4379 /* Return true if REGNO is a virtual pointer register, or an eliminable
4380 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4381 include stack_pointer or hard_frame_pointer. */
4382 static bool
4383 virt_or_elim_regno_p (unsigned regno)
4385 return ((regno >= FIRST_VIRTUAL_REGISTER
4386 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4387 || regno == FRAME_POINTER_REGNUM
4388 || regno == ARG_POINTER_REGNUM);
4391 /* Return true if X is a valid address for machine mode MODE. If it is,
4392 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4393 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4395 static bool
4396 aarch64_classify_address (struct aarch64_address_info *info,
4397 rtx x, machine_mode mode,
4398 RTX_CODE outer_code, bool strict_p)
4400 enum rtx_code code = GET_CODE (x);
4401 rtx op0, op1;
4403 /* On BE, we use load/store pair for all large int mode load/stores.
4404 TI/TFmode may also use a load/store pair. */
4405 bool load_store_pair_p = (outer_code == PARALLEL
4406 || mode == TImode
4407 || mode == TFmode
4408 || (BYTES_BIG_ENDIAN
4409 && aarch64_vect_struct_mode_p (mode)));
4411 bool allow_reg_index_p =
4412 !load_store_pair_p
4413 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4414 && !aarch64_vect_struct_mode_p (mode);
4416 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4417 REG addressing. */
4418 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4419 && (code != POST_INC && code != REG))
4420 return false;
4422 switch (code)
4424 case REG:
4425 case SUBREG:
4426 info->type = ADDRESS_REG_IMM;
4427 info->base = x;
4428 info->offset = const0_rtx;
4429 return aarch64_base_register_rtx_p (x, strict_p);
4431 case PLUS:
4432 op0 = XEXP (x, 0);
4433 op1 = XEXP (x, 1);
4435 if (! strict_p
4436 && REG_P (op0)
4437 && virt_or_elim_regno_p (REGNO (op0))
4438 && CONST_INT_P (op1))
4440 info->type = ADDRESS_REG_IMM;
4441 info->base = op0;
4442 info->offset = op1;
4444 return true;
4447 if (GET_MODE_SIZE (mode) != 0
4448 && CONST_INT_P (op1)
4449 && aarch64_base_register_rtx_p (op0, strict_p))
4451 HOST_WIDE_INT offset = INTVAL (op1);
4453 info->type = ADDRESS_REG_IMM;
4454 info->base = op0;
4455 info->offset = op1;
4457 /* TImode and TFmode values are allowed in both pairs of X
4458 registers and individual Q registers. The available
4459 address modes are:
4460 X,X: 7-bit signed scaled offset
4461 Q: 9-bit signed offset
4462 We conservatively require an offset representable in either mode.
4463 When performing the check for pairs of X registers i.e. LDP/STP
4464 pass down DImode since that is the natural size of the LDP/STP
4465 instruction memory accesses. */
4466 if (mode == TImode || mode == TFmode)
4467 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4468 && (offset_9bit_signed_unscaled_p (mode, offset)
4469 || offset_12bit_unsigned_scaled_p (mode, offset)));
4471 /* A 7bit offset check because OImode will emit a ldp/stp
4472 instruction (only big endian will get here).
4473 For ldp/stp instructions, the offset is scaled for the size of a
4474 single element of the pair. */
4475 if (mode == OImode)
4476 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4478 /* Three 9/12 bit offsets checks because CImode will emit three
4479 ldr/str instructions (only big endian will get here). */
4480 if (mode == CImode)
4481 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4482 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4483 || offset_12bit_unsigned_scaled_p (V16QImode,
4484 offset + 32)));
4486 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4487 instructions (only big endian will get here). */
4488 if (mode == XImode)
4489 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4490 && aarch64_offset_7bit_signed_scaled_p (TImode,
4491 offset + 32));
4493 if (load_store_pair_p)
4494 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4495 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4496 else
4497 return (offset_9bit_signed_unscaled_p (mode, offset)
4498 || offset_12bit_unsigned_scaled_p (mode, offset));
4501 if (allow_reg_index_p)
4503 /* Look for base + (scaled/extended) index register. */
4504 if (aarch64_base_register_rtx_p (op0, strict_p)
4505 && aarch64_classify_index (info, op1, mode, strict_p))
4507 info->base = op0;
4508 return true;
4510 if (aarch64_base_register_rtx_p (op1, strict_p)
4511 && aarch64_classify_index (info, op0, mode, strict_p))
4513 info->base = op1;
4514 return true;
4518 return false;
4520 case POST_INC:
4521 case POST_DEC:
4522 case PRE_INC:
4523 case PRE_DEC:
4524 info->type = ADDRESS_REG_WB;
4525 info->base = XEXP (x, 0);
4526 info->offset = NULL_RTX;
4527 return aarch64_base_register_rtx_p (info->base, strict_p);
4529 case POST_MODIFY:
4530 case PRE_MODIFY:
4531 info->type = ADDRESS_REG_WB;
4532 info->base = XEXP (x, 0);
4533 if (GET_CODE (XEXP (x, 1)) == PLUS
4534 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4535 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4536 && aarch64_base_register_rtx_p (info->base, strict_p))
4538 HOST_WIDE_INT offset;
4539 info->offset = XEXP (XEXP (x, 1), 1);
4540 offset = INTVAL (info->offset);
4542 /* TImode and TFmode values are allowed in both pairs of X
4543 registers and individual Q registers. The available
4544 address modes are:
4545 X,X: 7-bit signed scaled offset
4546 Q: 9-bit signed offset
4547 We conservatively require an offset representable in either mode.
4549 if (mode == TImode || mode == TFmode)
4550 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4551 && offset_9bit_signed_unscaled_p (mode, offset));
4553 if (load_store_pair_p)
4554 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4555 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4556 else
4557 return offset_9bit_signed_unscaled_p (mode, offset);
4559 return false;
4561 case CONST:
4562 case SYMBOL_REF:
4563 case LABEL_REF:
4564 /* load literal: pc-relative constant pool entry. Only supported
4565 for SI mode or larger. */
4566 info->type = ADDRESS_SYMBOLIC;
4568 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4570 rtx sym, addend;
4572 split_const (x, &sym, &addend);
4573 return ((GET_CODE (sym) == LABEL_REF
4574 || (GET_CODE (sym) == SYMBOL_REF
4575 && CONSTANT_POOL_ADDRESS_P (sym)
4576 && aarch64_pcrelative_literal_loads)));
4578 return false;
4580 case LO_SUM:
4581 info->type = ADDRESS_LO_SUM;
4582 info->base = XEXP (x, 0);
4583 info->offset = XEXP (x, 1);
4584 if (allow_reg_index_p
4585 && aarch64_base_register_rtx_p (info->base, strict_p))
4587 rtx sym, offs;
4588 split_const (info->offset, &sym, &offs);
4589 if (GET_CODE (sym) == SYMBOL_REF
4590 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4592 /* The symbol and offset must be aligned to the access size. */
4593 unsigned int align;
4594 unsigned int ref_size;
4596 if (CONSTANT_POOL_ADDRESS_P (sym))
4597 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4598 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4600 tree exp = SYMBOL_REF_DECL (sym);
4601 align = TYPE_ALIGN (TREE_TYPE (exp));
4602 align = CONSTANT_ALIGNMENT (exp, align);
4604 else if (SYMBOL_REF_DECL (sym))
4605 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4606 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4607 && SYMBOL_REF_BLOCK (sym) != NULL)
4608 align = SYMBOL_REF_BLOCK (sym)->alignment;
4609 else
4610 align = BITS_PER_UNIT;
4612 ref_size = GET_MODE_SIZE (mode);
4613 if (ref_size == 0)
4614 ref_size = GET_MODE_SIZE (DImode);
4616 return ((INTVAL (offs) & (ref_size - 1)) == 0
4617 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4620 return false;
4622 default:
4623 return false;
4627 /* Return true if the address X is valid for a PRFM instruction.
4628 STRICT_P is true if we should do strict checking with
4629 aarch64_classify_address. */
4631 bool
4632 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4634 struct aarch64_address_info addr;
4636 /* PRFM accepts the same addresses as DImode... */
4637 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4638 if (!res)
4639 return false;
4641 /* ... except writeback forms. */
4642 return addr.type != ADDRESS_REG_WB;
4645 bool
4646 aarch64_symbolic_address_p (rtx x)
4648 rtx offset;
4650 split_const (x, &x, &offset);
4651 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4654 /* Classify the base of symbolic expression X. */
4656 enum aarch64_symbol_type
4657 aarch64_classify_symbolic_expression (rtx x)
4659 rtx offset;
4661 split_const (x, &x, &offset);
4662 return aarch64_classify_symbol (x, offset);
4666 /* Return TRUE if X is a legitimate address for accessing memory in
4667 mode MODE. */
4668 static bool
4669 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4671 struct aarch64_address_info addr;
4673 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4676 /* Return TRUE if X is a legitimate address for accessing memory in
4677 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4678 pair operation. */
4679 bool
4680 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4681 RTX_CODE outer_code, bool strict_p)
4683 struct aarch64_address_info addr;
4685 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4688 /* Split an out-of-range address displacement into a base and offset.
4689 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4690 to increase opportunities for sharing the base address of different sizes.
4691 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4692 static bool
4693 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4695 HOST_WIDE_INT offset = INTVAL (*disp);
4696 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4698 if (mode == TImode || mode == TFmode
4699 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4700 base = (offset + 0x100) & ~0x1ff;
4702 *off = GEN_INT (base);
4703 *disp = GEN_INT (offset - base);
4704 return true;
4707 /* Return the binary representation of floating point constant VALUE in INTVAL.
4708 If the value cannot be converted, return false without setting INTVAL.
4709 The conversion is done in the given MODE. */
4710 bool
4711 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4714 /* We make a general exception for 0. */
4715 if (aarch64_float_const_zero_rtx_p (value))
4717 *intval = 0;
4718 return true;
4721 machine_mode mode = GET_MODE (value);
4722 if (GET_CODE (value) != CONST_DOUBLE
4723 || !SCALAR_FLOAT_MODE_P (mode)
4724 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4725 /* Only support up to DF mode. */
4726 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4727 return false;
4729 unsigned HOST_WIDE_INT ival = 0;
4731 long res[2];
4732 real_to_target (res,
4733 CONST_DOUBLE_REAL_VALUE (value),
4734 REAL_MODE_FORMAT (mode));
4736 ival = zext_hwi (res[0], 32);
4737 if (GET_MODE_BITSIZE (mode) == GET_MODE_BITSIZE (DFmode))
4738 ival |= (zext_hwi (res[1], 32) << 32);
4740 *intval = ival;
4741 return true;
4744 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4745 single MOV(+MOVK) followed by an FMOV. */
4746 bool
4747 aarch64_float_const_rtx_p (rtx x)
4749 machine_mode mode = GET_MODE (x);
4750 if (mode == VOIDmode)
4751 return false;
4753 /* Determine whether it's cheaper to write float constants as
4754 mov/movk pairs over ldr/adrp pairs. */
4755 unsigned HOST_WIDE_INT ival;
4757 if (GET_CODE (x) == CONST_DOUBLE
4758 && SCALAR_FLOAT_MODE_P (mode)
4759 && aarch64_reinterpret_float_as_int (x, &ival))
4761 machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
4762 int num_instr = aarch64_internal_mov_immediate
4763 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4764 return num_instr < 3;
4767 return false;
4770 /* Return TRUE if rtx X is immediate constant 0.0 */
4771 bool
4772 aarch64_float_const_zero_rtx_p (rtx x)
4774 if (GET_MODE (x) == VOIDmode)
4775 return false;
4777 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4778 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4779 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4782 /* Return TRUE if rtx X is immediate constant that fits in a single
4783 MOVI immediate operation. */
4784 bool
4785 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4787 if (!TARGET_SIMD)
4788 return false;
4790 machine_mode vmode, imode;
4791 unsigned HOST_WIDE_INT ival;
4793 if (GET_CODE (x) == CONST_DOUBLE
4794 && SCALAR_FLOAT_MODE_P (mode))
4796 if (!aarch64_reinterpret_float_as_int (x, &ival))
4797 return false;
4799 /* We make a general exception for 0. */
4800 if (aarch64_float_const_zero_rtx_p (x))
4801 return true;
4803 imode = int_mode_for_mode (mode);
4805 else if (GET_CODE (x) == CONST_INT
4806 && SCALAR_INT_MODE_P (mode))
4808 imode = mode;
4809 ival = INTVAL (x);
4811 else
4812 return false;
4814 /* use a 64 bit mode for everything except for DI/DF mode, where we use
4815 a 128 bit vector mode. */
4816 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4818 vmode = aarch64_simd_container_mode (imode, width);
4819 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4821 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4825 /* Return the fixed registers used for condition codes. */
4827 static bool
4828 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4830 *p1 = CC_REGNUM;
4831 *p2 = INVALID_REGNUM;
4832 return true;
4835 /* This function is used by the call expanders of the machine description.
4836 RESULT is the register in which the result is returned. It's NULL for
4837 "call" and "sibcall".
4838 MEM is the location of the function call.
4839 SIBCALL indicates whether this function call is normal call or sibling call.
4840 It will generate different pattern accordingly. */
4842 void
4843 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4845 rtx call, callee, tmp;
4846 rtvec vec;
4847 machine_mode mode;
4849 gcc_assert (MEM_P (mem));
4850 callee = XEXP (mem, 0);
4851 mode = GET_MODE (callee);
4852 gcc_assert (mode == Pmode);
4854 /* Decide if we should generate indirect calls by loading the
4855 address of the callee into a register before performing
4856 the branch-and-link. */
4857 if (SYMBOL_REF_P (callee)
4858 ? (aarch64_is_long_call_p (callee)
4859 || aarch64_is_noplt_call_p (callee))
4860 : !REG_P (callee))
4861 XEXP (mem, 0) = force_reg (mode, callee);
4863 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4865 if (result != NULL_RTX)
4866 call = gen_rtx_SET (result, call);
4868 if (sibcall)
4869 tmp = ret_rtx;
4870 else
4871 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4873 vec = gen_rtvec (2, call, tmp);
4874 call = gen_rtx_PARALLEL (VOIDmode, vec);
4876 aarch64_emit_call_insn (call);
4879 /* Emit call insn with PAT and do aarch64-specific handling. */
4881 void
4882 aarch64_emit_call_insn (rtx pat)
4884 rtx insn = emit_call_insn (pat);
4886 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4887 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4888 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4891 machine_mode
4892 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4894 /* All floating point compares return CCFP if it is an equality
4895 comparison, and CCFPE otherwise. */
4896 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4898 switch (code)
4900 case EQ:
4901 case NE:
4902 case UNORDERED:
4903 case ORDERED:
4904 case UNLT:
4905 case UNLE:
4906 case UNGT:
4907 case UNGE:
4908 case UNEQ:
4909 case LTGT:
4910 return CCFPmode;
4912 case LT:
4913 case LE:
4914 case GT:
4915 case GE:
4916 return CCFPEmode;
4918 default:
4919 gcc_unreachable ();
4923 /* Equality comparisons of short modes against zero can be performed
4924 using the TST instruction with the appropriate bitmask. */
4925 if (y == const0_rtx && REG_P (x)
4926 && (code == EQ || code == NE)
4927 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4928 return CC_NZmode;
4930 /* Similarly, comparisons of zero_extends from shorter modes can
4931 be performed using an ANDS with an immediate mask. */
4932 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4933 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4934 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4935 && (code == EQ || code == NE))
4936 return CC_NZmode;
4938 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4939 && y == const0_rtx
4940 && (code == EQ || code == NE || code == LT || code == GE)
4941 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4942 || GET_CODE (x) == NEG
4943 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4944 && CONST_INT_P (XEXP (x, 2)))))
4945 return CC_NZmode;
4947 /* A compare with a shifted operand. Because of canonicalization,
4948 the comparison will have to be swapped when we emit the assembly
4949 code. */
4950 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4951 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4952 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4953 || GET_CODE (x) == LSHIFTRT
4954 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4955 return CC_SWPmode;
4957 /* Similarly for a negated operand, but we can only do this for
4958 equalities. */
4959 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4960 && (REG_P (y) || GET_CODE (y) == SUBREG)
4961 && (code == EQ || code == NE)
4962 && GET_CODE (x) == NEG)
4963 return CC_Zmode;
4965 /* A test for unsigned overflow. */
4966 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4967 && code == NE
4968 && GET_CODE (x) == PLUS
4969 && GET_CODE (y) == ZERO_EXTEND)
4970 return CC_Cmode;
4972 /* For everything else, return CCmode. */
4973 return CCmode;
4976 static int
4977 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4980 aarch64_get_condition_code (rtx x)
4982 machine_mode mode = GET_MODE (XEXP (x, 0));
4983 enum rtx_code comp_code = GET_CODE (x);
4985 if (GET_MODE_CLASS (mode) != MODE_CC)
4986 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4987 return aarch64_get_condition_code_1 (mode, comp_code);
4990 static int
4991 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
4993 switch (mode)
4995 case CCFPmode:
4996 case CCFPEmode:
4997 switch (comp_code)
4999 case GE: return AARCH64_GE;
5000 case GT: return AARCH64_GT;
5001 case LE: return AARCH64_LS;
5002 case LT: return AARCH64_MI;
5003 case NE: return AARCH64_NE;
5004 case EQ: return AARCH64_EQ;
5005 case ORDERED: return AARCH64_VC;
5006 case UNORDERED: return AARCH64_VS;
5007 case UNLT: return AARCH64_LT;
5008 case UNLE: return AARCH64_LE;
5009 case UNGT: return AARCH64_HI;
5010 case UNGE: return AARCH64_PL;
5011 default: return -1;
5013 break;
5015 case CCmode:
5016 switch (comp_code)
5018 case NE: return AARCH64_NE;
5019 case EQ: return AARCH64_EQ;
5020 case GE: return AARCH64_GE;
5021 case GT: return AARCH64_GT;
5022 case LE: return AARCH64_LE;
5023 case LT: return AARCH64_LT;
5024 case GEU: return AARCH64_CS;
5025 case GTU: return AARCH64_HI;
5026 case LEU: return AARCH64_LS;
5027 case LTU: return AARCH64_CC;
5028 default: return -1;
5030 break;
5032 case CC_SWPmode:
5033 switch (comp_code)
5035 case NE: return AARCH64_NE;
5036 case EQ: return AARCH64_EQ;
5037 case GE: return AARCH64_LE;
5038 case GT: return AARCH64_LT;
5039 case LE: return AARCH64_GE;
5040 case LT: return AARCH64_GT;
5041 case GEU: return AARCH64_LS;
5042 case GTU: return AARCH64_CC;
5043 case LEU: return AARCH64_CS;
5044 case LTU: return AARCH64_HI;
5045 default: return -1;
5047 break;
5049 case CC_NZmode:
5050 switch (comp_code)
5052 case NE: return AARCH64_NE;
5053 case EQ: return AARCH64_EQ;
5054 case GE: return AARCH64_PL;
5055 case LT: return AARCH64_MI;
5056 default: return -1;
5058 break;
5060 case CC_Zmode:
5061 switch (comp_code)
5063 case NE: return AARCH64_NE;
5064 case EQ: return AARCH64_EQ;
5065 default: return -1;
5067 break;
5069 case CC_Cmode:
5070 switch (comp_code)
5072 case NE: return AARCH64_CS;
5073 case EQ: return AARCH64_CC;
5074 default: return -1;
5076 break;
5078 default:
5079 return -1;
5082 return -1;
5085 bool
5086 aarch64_const_vec_all_same_in_range_p (rtx x,
5087 HOST_WIDE_INT minval,
5088 HOST_WIDE_INT maxval)
5090 HOST_WIDE_INT firstval;
5091 int count, i;
5093 if (GET_CODE (x) != CONST_VECTOR
5094 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5095 return false;
5097 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5098 if (firstval < minval || firstval > maxval)
5099 return false;
5101 count = CONST_VECTOR_NUNITS (x);
5102 for (i = 1; i < count; i++)
5103 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5104 return false;
5106 return true;
5109 bool
5110 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5112 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5116 /* N Z C V. */
5117 #define AARCH64_CC_V 1
5118 #define AARCH64_CC_C (1 << 1)
5119 #define AARCH64_CC_Z (1 << 2)
5120 #define AARCH64_CC_N (1 << 3)
5122 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5123 static const int aarch64_nzcv_codes[] =
5125 0, /* EQ, Z == 1. */
5126 AARCH64_CC_Z, /* NE, Z == 0. */
5127 0, /* CS, C == 1. */
5128 AARCH64_CC_C, /* CC, C == 0. */
5129 0, /* MI, N == 1. */
5130 AARCH64_CC_N, /* PL, N == 0. */
5131 0, /* VS, V == 1. */
5132 AARCH64_CC_V, /* VC, V == 0. */
5133 0, /* HI, C ==1 && Z == 0. */
5134 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5135 AARCH64_CC_V, /* GE, N == V. */
5136 0, /* LT, N != V. */
5137 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5138 0, /* LE, !(Z == 0 && N == V). */
5139 0, /* AL, Any. */
5140 0 /* NV, Any. */
5143 /* Print operand X to file F in a target specific manner according to CODE.
5144 The acceptable formatting commands given by CODE are:
5145 'c': An integer or symbol address without a preceding #
5146 sign.
5147 'e': Print the sign/zero-extend size as a character 8->b,
5148 16->h, 32->w.
5149 'p': Prints N such that 2^N == X (X must be power of 2 and
5150 const int).
5151 'P': Print the number of non-zero bits in X (a const_int).
5152 'H': Print the higher numbered register of a pair (TImode)
5153 of regs.
5154 'm': Print a condition (eq, ne, etc).
5155 'M': Same as 'm', but invert condition.
5156 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5157 'S/T/U/V': Print a FP/SIMD register name for a register list.
5158 The register printed is the FP/SIMD register name
5159 of X + 0/1/2/3 for S/T/U/V.
5160 'R': Print a scalar FP/SIMD register name + 1.
5161 'X': Print bottom 16 bits of integer constant in hex.
5162 'w/x': Print a general register name or the zero register
5163 (32-bit or 64-bit).
5164 '0': Print a normal operand, if it's a general register,
5165 then we assume DImode.
5166 'k': Print NZCV for conditional compare instructions.
5167 'A': Output address constant representing the first
5168 argument of X, specifying a relocation offset
5169 if appropriate.
5170 'L': Output constant address specified by X
5171 with a relocation offset if appropriate.
5172 'G': Prints address of X, specifying a PC relative
5173 relocation mode if appropriate. */
5175 static void
5176 aarch64_print_operand (FILE *f, rtx x, int code)
5178 switch (code)
5180 case 'c':
5181 switch (GET_CODE (x))
5183 case CONST_INT:
5184 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5185 break;
5187 case SYMBOL_REF:
5188 output_addr_const (f, x);
5189 break;
5191 case CONST:
5192 if (GET_CODE (XEXP (x, 0)) == PLUS
5193 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5195 output_addr_const (f, x);
5196 break;
5198 /* Fall through. */
5200 default:
5201 output_operand_lossage ("Unsupported operand for code '%c'", code);
5203 break;
5205 case 'e':
5207 int n;
5209 if (!CONST_INT_P (x)
5210 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5212 output_operand_lossage ("invalid operand for '%%%c'", code);
5213 return;
5216 switch (n)
5218 case 3:
5219 fputc ('b', f);
5220 break;
5221 case 4:
5222 fputc ('h', f);
5223 break;
5224 case 5:
5225 fputc ('w', f);
5226 break;
5227 default:
5228 output_operand_lossage ("invalid operand for '%%%c'", code);
5229 return;
5232 break;
5234 case 'p':
5236 int n;
5238 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5240 output_operand_lossage ("invalid operand for '%%%c'", code);
5241 return;
5244 asm_fprintf (f, "%d", n);
5246 break;
5248 case 'P':
5249 if (!CONST_INT_P (x))
5251 output_operand_lossage ("invalid operand for '%%%c'", code);
5252 return;
5255 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5256 break;
5258 case 'H':
5259 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5261 output_operand_lossage ("invalid operand for '%%%c'", code);
5262 return;
5265 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5266 break;
5268 case 'M':
5269 case 'm':
5271 int cond_code;
5272 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5273 if (x == const_true_rtx)
5275 if (code == 'M')
5276 fputs ("nv", f);
5277 return;
5280 if (!COMPARISON_P (x))
5282 output_operand_lossage ("invalid operand for '%%%c'", code);
5283 return;
5286 cond_code = aarch64_get_condition_code (x);
5287 gcc_assert (cond_code >= 0);
5288 if (code == 'M')
5289 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5290 fputs (aarch64_condition_codes[cond_code], f);
5292 break;
5294 case 'b':
5295 case 'h':
5296 case 's':
5297 case 'd':
5298 case 'q':
5299 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5301 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5302 return;
5304 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5305 break;
5307 case 'S':
5308 case 'T':
5309 case 'U':
5310 case 'V':
5311 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5313 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5314 return;
5316 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5317 break;
5319 case 'R':
5320 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5322 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5323 return;
5325 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5326 break;
5328 case 'X':
5329 if (!CONST_INT_P (x))
5331 output_operand_lossage ("invalid operand for '%%%c'", code);
5332 return;
5334 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5335 break;
5337 case 'w':
5338 case 'x':
5339 if (x == const0_rtx
5340 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5342 asm_fprintf (f, "%czr", code);
5343 break;
5346 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5348 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5349 break;
5352 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5354 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5355 break;
5358 /* Fall through */
5360 case 0:
5361 if (x == NULL)
5363 output_operand_lossage ("missing operand");
5364 return;
5367 switch (GET_CODE (x))
5369 case REG:
5370 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5371 break;
5373 case MEM:
5374 output_address (GET_MODE (x), XEXP (x, 0));
5375 /* Check all memory references are Pmode - even with ILP32. */
5376 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5377 break;
5379 case CONST:
5380 case LABEL_REF:
5381 case SYMBOL_REF:
5382 output_addr_const (asm_out_file, x);
5383 break;
5385 case CONST_INT:
5386 asm_fprintf (f, "%wd", INTVAL (x));
5387 break;
5389 case CONST_VECTOR:
5390 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5392 gcc_assert (
5393 aarch64_const_vec_all_same_in_range_p (x,
5394 HOST_WIDE_INT_MIN,
5395 HOST_WIDE_INT_MAX));
5396 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5398 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5400 fputc ('0', f);
5402 else
5403 gcc_unreachable ();
5404 break;
5406 case CONST_DOUBLE:
5407 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5408 be getting CONST_DOUBLEs holding integers. */
5409 gcc_assert (GET_MODE (x) != VOIDmode);
5410 if (aarch64_float_const_zero_rtx_p (x))
5412 fputc ('0', f);
5413 break;
5415 else if (aarch64_float_const_representable_p (x))
5417 #define buf_size 20
5418 char float_buf[buf_size] = {'\0'};
5419 real_to_decimal_for_mode (float_buf,
5420 CONST_DOUBLE_REAL_VALUE (x),
5421 buf_size, buf_size,
5422 1, GET_MODE (x));
5423 asm_fprintf (asm_out_file, "%s", float_buf);
5424 break;
5425 #undef buf_size
5427 output_operand_lossage ("invalid constant");
5428 return;
5429 default:
5430 output_operand_lossage ("invalid operand");
5431 return;
5433 break;
5435 case 'A':
5436 if (GET_CODE (x) == HIGH)
5437 x = XEXP (x, 0);
5439 switch (aarch64_classify_symbolic_expression (x))
5441 case SYMBOL_SMALL_GOT_4G:
5442 asm_fprintf (asm_out_file, ":got:");
5443 break;
5445 case SYMBOL_SMALL_TLSGD:
5446 asm_fprintf (asm_out_file, ":tlsgd:");
5447 break;
5449 case SYMBOL_SMALL_TLSDESC:
5450 asm_fprintf (asm_out_file, ":tlsdesc:");
5451 break;
5453 case SYMBOL_SMALL_TLSIE:
5454 asm_fprintf (asm_out_file, ":gottprel:");
5455 break;
5457 case SYMBOL_TLSLE24:
5458 asm_fprintf (asm_out_file, ":tprel:");
5459 break;
5461 case SYMBOL_TINY_GOT:
5462 gcc_unreachable ();
5463 break;
5465 default:
5466 break;
5468 output_addr_const (asm_out_file, x);
5469 break;
5471 case 'L':
5472 switch (aarch64_classify_symbolic_expression (x))
5474 case SYMBOL_SMALL_GOT_4G:
5475 asm_fprintf (asm_out_file, ":lo12:");
5476 break;
5478 case SYMBOL_SMALL_TLSGD:
5479 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5480 break;
5482 case SYMBOL_SMALL_TLSDESC:
5483 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5484 break;
5486 case SYMBOL_SMALL_TLSIE:
5487 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5488 break;
5490 case SYMBOL_TLSLE12:
5491 asm_fprintf (asm_out_file, ":tprel_lo12:");
5492 break;
5494 case SYMBOL_TLSLE24:
5495 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5496 break;
5498 case SYMBOL_TINY_GOT:
5499 asm_fprintf (asm_out_file, ":got:");
5500 break;
5502 case SYMBOL_TINY_TLSIE:
5503 asm_fprintf (asm_out_file, ":gottprel:");
5504 break;
5506 default:
5507 break;
5509 output_addr_const (asm_out_file, x);
5510 break;
5512 case 'G':
5513 switch (aarch64_classify_symbolic_expression (x))
5515 case SYMBOL_TLSLE24:
5516 asm_fprintf (asm_out_file, ":tprel_hi12:");
5517 break;
5518 default:
5519 break;
5521 output_addr_const (asm_out_file, x);
5522 break;
5524 case 'k':
5526 HOST_WIDE_INT cond_code;
5528 if (!CONST_INT_P (x))
5530 output_operand_lossage ("invalid operand for '%%%c'", code);
5531 return;
5534 cond_code = INTVAL (x);
5535 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5536 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5538 break;
5540 default:
5541 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5542 return;
5546 static void
5547 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5549 struct aarch64_address_info addr;
5551 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5552 switch (addr.type)
5554 case ADDRESS_REG_IMM:
5555 if (addr.offset == const0_rtx)
5556 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5557 else
5558 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5559 INTVAL (addr.offset));
5560 return;
5562 case ADDRESS_REG_REG:
5563 if (addr.shift == 0)
5564 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5565 reg_names [REGNO (addr.offset)]);
5566 else
5567 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5568 reg_names [REGNO (addr.offset)], addr.shift);
5569 return;
5571 case ADDRESS_REG_UXTW:
5572 if (addr.shift == 0)
5573 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5574 REGNO (addr.offset) - R0_REGNUM);
5575 else
5576 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5577 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5578 return;
5580 case ADDRESS_REG_SXTW:
5581 if (addr.shift == 0)
5582 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5583 REGNO (addr.offset) - R0_REGNUM);
5584 else
5585 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5586 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5587 return;
5589 case ADDRESS_REG_WB:
5590 switch (GET_CODE (x))
5592 case PRE_INC:
5593 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5594 GET_MODE_SIZE (mode));
5595 return;
5596 case POST_INC:
5597 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5598 GET_MODE_SIZE (mode));
5599 return;
5600 case PRE_DEC:
5601 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5602 GET_MODE_SIZE (mode));
5603 return;
5604 case POST_DEC:
5605 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5606 GET_MODE_SIZE (mode));
5607 return;
5608 case PRE_MODIFY:
5609 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5610 INTVAL (addr.offset));
5611 return;
5612 case POST_MODIFY:
5613 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5614 INTVAL (addr.offset));
5615 return;
5616 default:
5617 break;
5619 break;
5621 case ADDRESS_LO_SUM:
5622 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5623 output_addr_const (f, addr.offset);
5624 asm_fprintf (f, "]");
5625 return;
5627 case ADDRESS_SYMBOLIC:
5628 break;
5631 output_addr_const (f, x);
5634 bool
5635 aarch64_label_mentioned_p (rtx x)
5637 const char *fmt;
5638 int i;
5640 if (GET_CODE (x) == LABEL_REF)
5641 return true;
5643 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5644 referencing instruction, but they are constant offsets, not
5645 symbols. */
5646 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5647 return false;
5649 fmt = GET_RTX_FORMAT (GET_CODE (x));
5650 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5652 if (fmt[i] == 'E')
5654 int j;
5656 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5657 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5658 return 1;
5660 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5661 return 1;
5664 return 0;
5667 /* Implement REGNO_REG_CLASS. */
5669 enum reg_class
5670 aarch64_regno_regclass (unsigned regno)
5672 if (GP_REGNUM_P (regno))
5673 return GENERAL_REGS;
5675 if (regno == SP_REGNUM)
5676 return STACK_REG;
5678 if (regno == FRAME_POINTER_REGNUM
5679 || regno == ARG_POINTER_REGNUM)
5680 return POINTER_REGS;
5682 if (FP_REGNUM_P (regno))
5683 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5685 return NO_REGS;
5688 static rtx
5689 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5691 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5692 where mask is selected by alignment and size of the offset.
5693 We try to pick as large a range for the offset as possible to
5694 maximize the chance of a CSE. However, for aligned addresses
5695 we limit the range to 4k so that structures with different sized
5696 elements are likely to use the same base. We need to be careful
5697 not to split a CONST for some forms of address expression, otherwise
5698 it will generate sub-optimal code. */
5700 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5702 rtx base = XEXP (x, 0);
5703 rtx offset_rtx = XEXP (x, 1);
5704 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5706 if (GET_CODE (base) == PLUS)
5708 rtx op0 = XEXP (base, 0);
5709 rtx op1 = XEXP (base, 1);
5711 /* Force any scaling into a temp for CSE. */
5712 op0 = force_reg (Pmode, op0);
5713 op1 = force_reg (Pmode, op1);
5715 /* Let the pointer register be in op0. */
5716 if (REG_POINTER (op1))
5717 std::swap (op0, op1);
5719 /* If the pointer is virtual or frame related, then we know that
5720 virtual register instantiation or register elimination is going
5721 to apply a second constant. We want the two constants folded
5722 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5723 if (virt_or_elim_regno_p (REGNO (op0)))
5725 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5726 NULL_RTX, true, OPTAB_DIRECT);
5727 return gen_rtx_PLUS (Pmode, base, op1);
5730 /* Otherwise, in order to encourage CSE (and thence loop strength
5731 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5732 base = expand_binop (Pmode, add_optab, op0, op1,
5733 NULL_RTX, true, OPTAB_DIRECT);
5734 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5737 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5738 HOST_WIDE_INT base_offset;
5739 if (GET_MODE_SIZE (mode) > 16)
5740 base_offset = (offset + 0x400) & ~0x7f0;
5741 /* For offsets aren't a multiple of the access size, the limit is
5742 -256...255. */
5743 else if (offset & (GET_MODE_SIZE (mode) - 1))
5745 base_offset = (offset + 0x100) & ~0x1ff;
5747 /* BLKmode typically uses LDP of X-registers. */
5748 if (mode == BLKmode)
5749 base_offset = (offset + 512) & ~0x3ff;
5751 /* Small negative offsets are supported. */
5752 else if (IN_RANGE (offset, -256, 0))
5753 base_offset = 0;
5754 else if (mode == TImode || mode == TFmode)
5755 base_offset = (offset + 0x100) & ~0x1ff;
5756 /* Use 12-bit offset by access size. */
5757 else
5758 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5760 if (base_offset != 0)
5762 base = plus_constant (Pmode, base, base_offset);
5763 base = force_operand (base, NULL_RTX);
5764 return plus_constant (Pmode, base, offset - base_offset);
5768 return x;
5771 /* Return the reload icode required for a constant pool in mode. */
5772 static enum insn_code
5773 aarch64_constant_pool_reload_icode (machine_mode mode)
5775 switch (mode)
5777 case SFmode:
5778 return CODE_FOR_aarch64_reload_movcpsfdi;
5780 case DFmode:
5781 return CODE_FOR_aarch64_reload_movcpdfdi;
5783 case TFmode:
5784 return CODE_FOR_aarch64_reload_movcptfdi;
5786 case V8QImode:
5787 return CODE_FOR_aarch64_reload_movcpv8qidi;
5789 case V16QImode:
5790 return CODE_FOR_aarch64_reload_movcpv16qidi;
5792 case V4HImode:
5793 return CODE_FOR_aarch64_reload_movcpv4hidi;
5795 case V8HImode:
5796 return CODE_FOR_aarch64_reload_movcpv8hidi;
5798 case V2SImode:
5799 return CODE_FOR_aarch64_reload_movcpv2sidi;
5801 case V4SImode:
5802 return CODE_FOR_aarch64_reload_movcpv4sidi;
5804 case V2DImode:
5805 return CODE_FOR_aarch64_reload_movcpv2didi;
5807 case V2DFmode:
5808 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5810 default:
5811 gcc_unreachable ();
5814 gcc_unreachable ();
5816 static reg_class_t
5817 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5818 reg_class_t rclass,
5819 machine_mode mode,
5820 secondary_reload_info *sri)
5823 /* If we have to disable direct literal pool loads and stores because the
5824 function is too big, then we need a scratch register. */
5825 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5826 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5827 || targetm.vector_mode_supported_p (GET_MODE (x)))
5828 && !aarch64_pcrelative_literal_loads)
5830 sri->icode = aarch64_constant_pool_reload_icode (mode);
5831 return NO_REGS;
5834 /* Without the TARGET_SIMD instructions we cannot move a Q register
5835 to a Q register directly. We need a scratch. */
5836 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5837 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5838 && reg_class_subset_p (rclass, FP_REGS))
5840 if (mode == TFmode)
5841 sri->icode = CODE_FOR_aarch64_reload_movtf;
5842 else if (mode == TImode)
5843 sri->icode = CODE_FOR_aarch64_reload_movti;
5844 return NO_REGS;
5847 /* A TFmode or TImode memory access should be handled via an FP_REGS
5848 because AArch64 has richer addressing modes for LDR/STR instructions
5849 than LDP/STP instructions. */
5850 if (TARGET_FLOAT && rclass == GENERAL_REGS
5851 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5852 return FP_REGS;
5854 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5855 return GENERAL_REGS;
5857 return NO_REGS;
5860 static bool
5861 aarch64_can_eliminate (const int from, const int to)
5863 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5864 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5866 if (frame_pointer_needed)
5868 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5869 return true;
5870 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5871 return false;
5872 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5873 && !cfun->calls_alloca)
5874 return true;
5875 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5876 return true;
5878 return false;
5880 else
5882 /* If we decided that we didn't need a leaf frame pointer but then used
5883 LR in the function, then we'll want a frame pointer after all, so
5884 prevent this elimination to ensure a frame pointer is used. */
5885 if (to == STACK_POINTER_REGNUM
5886 && flag_omit_leaf_frame_pointer
5887 && df_regs_ever_live_p (LR_REGNUM))
5888 return false;
5891 return true;
5894 HOST_WIDE_INT
5895 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5897 aarch64_layout_frame ();
5899 if (to == HARD_FRAME_POINTER_REGNUM)
5901 if (from == ARG_POINTER_REGNUM)
5902 return cfun->machine->frame.hard_fp_offset;
5904 if (from == FRAME_POINTER_REGNUM)
5905 return cfun->machine->frame.hard_fp_offset
5906 - cfun->machine->frame.locals_offset;
5909 if (to == STACK_POINTER_REGNUM)
5911 if (from == FRAME_POINTER_REGNUM)
5912 return cfun->machine->frame.frame_size
5913 - cfun->machine->frame.locals_offset;
5916 return cfun->machine->frame.frame_size;
5919 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5920 previous frame. */
5923 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5925 if (count != 0)
5926 return const0_rtx;
5927 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5931 static void
5932 aarch64_asm_trampoline_template (FILE *f)
5934 if (TARGET_ILP32)
5936 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5937 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5939 else
5941 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5942 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5944 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5945 assemble_aligned_integer (4, const0_rtx);
5946 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5947 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5950 static void
5951 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5953 rtx fnaddr, mem, a_tramp;
5954 const int tramp_code_sz = 16;
5956 /* Don't need to copy the trailing D-words, we fill those in below. */
5957 emit_block_move (m_tramp, assemble_trampoline_template (),
5958 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5959 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5960 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5961 if (GET_MODE (fnaddr) != ptr_mode)
5962 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5963 emit_move_insn (mem, fnaddr);
5965 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5966 emit_move_insn (mem, chain_value);
5968 /* XXX We should really define a "clear_cache" pattern and use
5969 gen_clear_cache(). */
5970 a_tramp = XEXP (m_tramp, 0);
5971 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5972 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5973 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5974 ptr_mode);
5977 static unsigned char
5978 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5980 switch (regclass)
5982 case CALLER_SAVE_REGS:
5983 case POINTER_REGS:
5984 case GENERAL_REGS:
5985 case ALL_REGS:
5986 case FP_REGS:
5987 case FP_LO_REGS:
5988 return
5989 aarch64_vector_mode_p (mode)
5990 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5991 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5992 case STACK_REG:
5993 return 1;
5995 case NO_REGS:
5996 return 0;
5998 default:
5999 break;
6001 gcc_unreachable ();
6004 static reg_class_t
6005 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6007 if (regclass == POINTER_REGS)
6008 return GENERAL_REGS;
6010 if (regclass == STACK_REG)
6012 if (REG_P(x)
6013 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6014 return regclass;
6016 return NO_REGS;
6019 /* Register eliminiation can result in a request for
6020 SP+constant->FP_REGS. We cannot support such operations which
6021 use SP as source and an FP_REG as destination, so reject out
6022 right now. */
6023 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6025 rtx lhs = XEXP (x, 0);
6027 /* Look through a possible SUBREG introduced by ILP32. */
6028 if (GET_CODE (lhs) == SUBREG)
6029 lhs = SUBREG_REG (lhs);
6031 gcc_assert (REG_P (lhs));
6032 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6033 POINTER_REGS));
6034 return NO_REGS;
6037 return regclass;
6040 void
6041 aarch64_asm_output_labelref (FILE* f, const char *name)
6043 asm_fprintf (f, "%U%s", name);
6046 static void
6047 aarch64_elf_asm_constructor (rtx symbol, int priority)
6049 if (priority == DEFAULT_INIT_PRIORITY)
6050 default_ctor_section_asm_out_constructor (symbol, priority);
6051 else
6053 section *s;
6054 /* While priority is known to be in range [0, 65535], so 18 bytes
6055 would be enough, the compiler might not know that. To avoid
6056 -Wformat-truncation false positive, use a larger size. */
6057 char buf[23];
6058 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6059 s = get_section (buf, SECTION_WRITE, NULL);
6060 switch_to_section (s);
6061 assemble_align (POINTER_SIZE);
6062 assemble_aligned_integer (POINTER_BYTES, symbol);
6066 static void
6067 aarch64_elf_asm_destructor (rtx symbol, int priority)
6069 if (priority == DEFAULT_INIT_PRIORITY)
6070 default_dtor_section_asm_out_destructor (symbol, priority);
6071 else
6073 section *s;
6074 /* While priority is known to be in range [0, 65535], so 18 bytes
6075 would be enough, the compiler might not know that. To avoid
6076 -Wformat-truncation false positive, use a larger size. */
6077 char buf[23];
6078 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6079 s = get_section (buf, SECTION_WRITE, NULL);
6080 switch_to_section (s);
6081 assemble_align (POINTER_SIZE);
6082 assemble_aligned_integer (POINTER_BYTES, symbol);
6086 const char*
6087 aarch64_output_casesi (rtx *operands)
6089 char buf[100];
6090 char label[100];
6091 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6092 int index;
6093 static const char *const patterns[4][2] =
6096 "ldrb\t%w3, [%0,%w1,uxtw]",
6097 "add\t%3, %4, %w3, sxtb #2"
6100 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6101 "add\t%3, %4, %w3, sxth #2"
6104 "ldr\t%w3, [%0,%w1,uxtw #2]",
6105 "add\t%3, %4, %w3, sxtw #2"
6107 /* We assume that DImode is only generated when not optimizing and
6108 that we don't really need 64-bit address offsets. That would
6109 imply an object file with 8GB of code in a single function! */
6111 "ldr\t%w3, [%0,%w1,uxtw #2]",
6112 "add\t%3, %4, %w3, sxtw #2"
6116 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6118 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6120 gcc_assert (index >= 0 && index <= 3);
6122 /* Need to implement table size reduction, by chaning the code below. */
6123 output_asm_insn (patterns[index][0], operands);
6124 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6125 snprintf (buf, sizeof (buf),
6126 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6127 output_asm_insn (buf, operands);
6128 output_asm_insn (patterns[index][1], operands);
6129 output_asm_insn ("br\t%3", operands);
6130 assemble_label (asm_out_file, label);
6131 return "";
6135 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6136 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6137 operator. */
6140 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6142 if (shift >= 0 && shift <= 3)
6144 int size;
6145 for (size = 8; size <= 32; size *= 2)
6147 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6148 if (mask == bits << shift)
6149 return size;
6152 return 0;
6155 /* Constant pools are per function only when PC relative
6156 literal loads are true or we are in the large memory
6157 model. */
6159 static inline bool
6160 aarch64_can_use_per_function_literal_pools_p (void)
6162 return (aarch64_pcrelative_literal_loads
6163 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6166 static bool
6167 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6169 /* Fixme:: In an ideal world this would work similar
6170 to the logic in aarch64_select_rtx_section but this
6171 breaks bootstrap in gcc go. For now we workaround
6172 this by returning false here. */
6173 return false;
6176 /* Select appropriate section for constants depending
6177 on where we place literal pools. */
6179 static section *
6180 aarch64_select_rtx_section (machine_mode mode,
6181 rtx x,
6182 unsigned HOST_WIDE_INT align)
6184 if (aarch64_can_use_per_function_literal_pools_p ())
6185 return function_section (current_function_decl);
6187 return default_elf_select_rtx_section (mode, x, align);
6190 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6191 void
6192 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6193 HOST_WIDE_INT offset)
6195 /* When using per-function literal pools, we must ensure that any code
6196 section is aligned to the minimal instruction length, lest we get
6197 errors from the assembler re "unaligned instructions". */
6198 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6199 ASM_OUTPUT_ALIGN (f, 2);
6202 /* Costs. */
6204 /* Helper function for rtx cost calculation. Strip a shift expression
6205 from X. Returns the inner operand if successful, or the original
6206 expression on failure. */
6207 static rtx
6208 aarch64_strip_shift (rtx x)
6210 rtx op = x;
6212 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6213 we can convert both to ROR during final output. */
6214 if ((GET_CODE (op) == ASHIFT
6215 || GET_CODE (op) == ASHIFTRT
6216 || GET_CODE (op) == LSHIFTRT
6217 || GET_CODE (op) == ROTATERT
6218 || GET_CODE (op) == ROTATE)
6219 && CONST_INT_P (XEXP (op, 1)))
6220 return XEXP (op, 0);
6222 if (GET_CODE (op) == MULT
6223 && CONST_INT_P (XEXP (op, 1))
6224 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6225 return XEXP (op, 0);
6227 return x;
6230 /* Helper function for rtx cost calculation. Strip an extend
6231 expression from X. Returns the inner operand if successful, or the
6232 original expression on failure. We deal with a number of possible
6233 canonicalization variations here. If STRIP_SHIFT is true, then
6234 we can strip off a shift also. */
6235 static rtx
6236 aarch64_strip_extend (rtx x, bool strip_shift)
6238 rtx op = x;
6240 /* Zero and sign extraction of a widened value. */
6241 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6242 && XEXP (op, 2) == const0_rtx
6243 && GET_CODE (XEXP (op, 0)) == MULT
6244 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6245 XEXP (op, 1)))
6246 return XEXP (XEXP (op, 0), 0);
6248 /* It can also be represented (for zero-extend) as an AND with an
6249 immediate. */
6250 if (GET_CODE (op) == AND
6251 && GET_CODE (XEXP (op, 0)) == MULT
6252 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6253 && CONST_INT_P (XEXP (op, 1))
6254 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6255 INTVAL (XEXP (op, 1))) != 0)
6256 return XEXP (XEXP (op, 0), 0);
6258 /* Now handle extended register, as this may also have an optional
6259 left shift by 1..4. */
6260 if (strip_shift
6261 && GET_CODE (op) == ASHIFT
6262 && CONST_INT_P (XEXP (op, 1))
6263 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6264 op = XEXP (op, 0);
6266 if (GET_CODE (op) == ZERO_EXTEND
6267 || GET_CODE (op) == SIGN_EXTEND)
6268 op = XEXP (op, 0);
6270 if (op != x)
6271 return op;
6273 return x;
6276 /* Return true iff CODE is a shift supported in combination
6277 with arithmetic instructions. */
6279 static bool
6280 aarch64_shift_p (enum rtx_code code)
6282 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6286 /* Return true iff X is a cheap shift without a sign extend. */
6288 static bool
6289 aarch64_cheap_mult_shift_p (rtx x)
6291 rtx op0, op1;
6293 op0 = XEXP (x, 0);
6294 op1 = XEXP (x, 1);
6296 if (!(aarch64_tune_params.extra_tuning_flags
6297 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6298 return false;
6300 if (GET_CODE (op0) == SIGN_EXTEND)
6301 return false;
6303 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6304 && UINTVAL (op1) <= 4)
6305 return true;
6307 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6308 return false;
6310 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6312 if (l2 > 0 && l2 <= 4)
6313 return true;
6315 return false;
6318 /* Helper function for rtx cost calculation. Calculate the cost of
6319 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6320 Return the calculated cost of the expression, recursing manually in to
6321 operands where needed. */
6323 static int
6324 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6326 rtx op0, op1;
6327 const struct cpu_cost_table *extra_cost
6328 = aarch64_tune_params.insn_extra_cost;
6329 int cost = 0;
6330 bool compound_p = (outer == PLUS || outer == MINUS);
6331 machine_mode mode = GET_MODE (x);
6333 gcc_checking_assert (code == MULT);
6335 op0 = XEXP (x, 0);
6336 op1 = XEXP (x, 1);
6338 if (VECTOR_MODE_P (mode))
6339 mode = GET_MODE_INNER (mode);
6341 /* Integer multiply/fma. */
6342 if (GET_MODE_CLASS (mode) == MODE_INT)
6344 /* The multiply will be canonicalized as a shift, cost it as such. */
6345 if (aarch64_shift_p (GET_CODE (x))
6346 || (CONST_INT_P (op1)
6347 && exact_log2 (INTVAL (op1)) > 0))
6349 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6350 || GET_CODE (op0) == SIGN_EXTEND;
6351 if (speed)
6353 if (compound_p)
6355 /* If the shift is considered cheap,
6356 then don't add any cost. */
6357 if (aarch64_cheap_mult_shift_p (x))
6359 else if (REG_P (op1))
6360 /* ARITH + shift-by-register. */
6361 cost += extra_cost->alu.arith_shift_reg;
6362 else if (is_extend)
6363 /* ARITH + extended register. We don't have a cost field
6364 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6365 cost += extra_cost->alu.extend_arith;
6366 else
6367 /* ARITH + shift-by-immediate. */
6368 cost += extra_cost->alu.arith_shift;
6370 else
6371 /* LSL (immediate). */
6372 cost += extra_cost->alu.shift;
6375 /* Strip extends as we will have costed them in the case above. */
6376 if (is_extend)
6377 op0 = aarch64_strip_extend (op0, true);
6379 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6381 return cost;
6384 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6385 compound and let the below cases handle it. After all, MNEG is a
6386 special-case alias of MSUB. */
6387 if (GET_CODE (op0) == NEG)
6389 op0 = XEXP (op0, 0);
6390 compound_p = true;
6393 /* Integer multiplies or FMAs have zero/sign extending variants. */
6394 if ((GET_CODE (op0) == ZERO_EXTEND
6395 && GET_CODE (op1) == ZERO_EXTEND)
6396 || (GET_CODE (op0) == SIGN_EXTEND
6397 && GET_CODE (op1) == SIGN_EXTEND))
6399 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6400 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6402 if (speed)
6404 if (compound_p)
6405 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6406 cost += extra_cost->mult[0].extend_add;
6407 else
6408 /* MUL/SMULL/UMULL. */
6409 cost += extra_cost->mult[0].extend;
6412 return cost;
6415 /* This is either an integer multiply or a MADD. In both cases
6416 we want to recurse and cost the operands. */
6417 cost += rtx_cost (op0, mode, MULT, 0, speed);
6418 cost += rtx_cost (op1, mode, MULT, 1, speed);
6420 if (speed)
6422 if (compound_p)
6423 /* MADD/MSUB. */
6424 cost += extra_cost->mult[mode == DImode].add;
6425 else
6426 /* MUL. */
6427 cost += extra_cost->mult[mode == DImode].simple;
6430 return cost;
6432 else
6434 if (speed)
6436 /* Floating-point FMA/FMUL can also support negations of the
6437 operands, unless the rounding mode is upward or downward in
6438 which case FNMUL is different than FMUL with operand negation. */
6439 bool neg0 = GET_CODE (op0) == NEG;
6440 bool neg1 = GET_CODE (op1) == NEG;
6441 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6443 if (neg0)
6444 op0 = XEXP (op0, 0);
6445 if (neg1)
6446 op1 = XEXP (op1, 0);
6449 if (compound_p)
6450 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6451 cost += extra_cost->fp[mode == DFmode].fma;
6452 else
6453 /* FMUL/FNMUL. */
6454 cost += extra_cost->fp[mode == DFmode].mult;
6457 cost += rtx_cost (op0, mode, MULT, 0, speed);
6458 cost += rtx_cost (op1, mode, MULT, 1, speed);
6459 return cost;
6463 static int
6464 aarch64_address_cost (rtx x,
6465 machine_mode mode,
6466 addr_space_t as ATTRIBUTE_UNUSED,
6467 bool speed)
6469 enum rtx_code c = GET_CODE (x);
6470 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6471 struct aarch64_address_info info;
6472 int cost = 0;
6473 info.shift = 0;
6475 if (!aarch64_classify_address (&info, x, mode, c, false))
6477 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6479 /* This is a CONST or SYMBOL ref which will be split
6480 in a different way depending on the code model in use.
6481 Cost it through the generic infrastructure. */
6482 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6483 /* Divide through by the cost of one instruction to
6484 bring it to the same units as the address costs. */
6485 cost_symbol_ref /= COSTS_N_INSNS (1);
6486 /* The cost is then the cost of preparing the address,
6487 followed by an immediate (possibly 0) offset. */
6488 return cost_symbol_ref + addr_cost->imm_offset;
6490 else
6492 /* This is most likely a jump table from a case
6493 statement. */
6494 return addr_cost->register_offset;
6498 switch (info.type)
6500 case ADDRESS_LO_SUM:
6501 case ADDRESS_SYMBOLIC:
6502 case ADDRESS_REG_IMM:
6503 cost += addr_cost->imm_offset;
6504 break;
6506 case ADDRESS_REG_WB:
6507 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6508 cost += addr_cost->pre_modify;
6509 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6510 cost += addr_cost->post_modify;
6511 else
6512 gcc_unreachable ();
6514 break;
6516 case ADDRESS_REG_REG:
6517 cost += addr_cost->register_offset;
6518 break;
6520 case ADDRESS_REG_SXTW:
6521 cost += addr_cost->register_sextend;
6522 break;
6524 case ADDRESS_REG_UXTW:
6525 cost += addr_cost->register_zextend;
6526 break;
6528 default:
6529 gcc_unreachable ();
6533 if (info.shift > 0)
6535 /* For the sake of calculating the cost of the shifted register
6536 component, we can treat same sized modes in the same way. */
6537 switch (GET_MODE_BITSIZE (mode))
6539 case 16:
6540 cost += addr_cost->addr_scale_costs.hi;
6541 break;
6543 case 32:
6544 cost += addr_cost->addr_scale_costs.si;
6545 break;
6547 case 64:
6548 cost += addr_cost->addr_scale_costs.di;
6549 break;
6551 /* We can't tell, or this is a 128-bit vector. */
6552 default:
6553 cost += addr_cost->addr_scale_costs.ti;
6554 break;
6558 return cost;
6561 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6562 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6563 to be taken. */
6566 aarch64_branch_cost (bool speed_p, bool predictable_p)
6568 /* When optimizing for speed, use the cost of unpredictable branches. */
6569 const struct cpu_branch_cost *branch_costs =
6570 aarch64_tune_params.branch_costs;
6572 if (!speed_p || predictable_p)
6573 return branch_costs->predictable;
6574 else
6575 return branch_costs->unpredictable;
6578 /* Return true if the RTX X in mode MODE is a zero or sign extract
6579 usable in an ADD or SUB (extended register) instruction. */
6580 static bool
6581 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6583 /* Catch add with a sign extract.
6584 This is add_<optab><mode>_multp2. */
6585 if (GET_CODE (x) == SIGN_EXTRACT
6586 || GET_CODE (x) == ZERO_EXTRACT)
6588 rtx op0 = XEXP (x, 0);
6589 rtx op1 = XEXP (x, 1);
6590 rtx op2 = XEXP (x, 2);
6592 if (GET_CODE (op0) == MULT
6593 && CONST_INT_P (op1)
6594 && op2 == const0_rtx
6595 && CONST_INT_P (XEXP (op0, 1))
6596 && aarch64_is_extend_from_extract (mode,
6597 XEXP (op0, 1),
6598 op1))
6600 return true;
6603 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6604 No shift. */
6605 else if (GET_CODE (x) == SIGN_EXTEND
6606 || GET_CODE (x) == ZERO_EXTEND)
6607 return REG_P (XEXP (x, 0));
6609 return false;
6612 static bool
6613 aarch64_frint_unspec_p (unsigned int u)
6615 switch (u)
6617 case UNSPEC_FRINTZ:
6618 case UNSPEC_FRINTP:
6619 case UNSPEC_FRINTM:
6620 case UNSPEC_FRINTA:
6621 case UNSPEC_FRINTN:
6622 case UNSPEC_FRINTX:
6623 case UNSPEC_FRINTI:
6624 return true;
6626 default:
6627 return false;
6631 /* Return true iff X is an rtx that will match an extr instruction
6632 i.e. as described in the *extr<mode>5_insn family of patterns.
6633 OP0 and OP1 will be set to the operands of the shifts involved
6634 on success and will be NULL_RTX otherwise. */
6636 static bool
6637 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6639 rtx op0, op1;
6640 machine_mode mode = GET_MODE (x);
6642 *res_op0 = NULL_RTX;
6643 *res_op1 = NULL_RTX;
6645 if (GET_CODE (x) != IOR)
6646 return false;
6648 op0 = XEXP (x, 0);
6649 op1 = XEXP (x, 1);
6651 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6652 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6654 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6655 if (GET_CODE (op1) == ASHIFT)
6656 std::swap (op0, op1);
6658 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6659 return false;
6661 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6662 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6664 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6665 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6667 *res_op0 = XEXP (op0, 0);
6668 *res_op1 = XEXP (op1, 0);
6669 return true;
6673 return false;
6676 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6677 storing it in *COST. Result is true if the total cost of the operation
6678 has now been calculated. */
6679 static bool
6680 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6682 rtx inner;
6683 rtx comparator;
6684 enum rtx_code cmpcode;
6686 if (COMPARISON_P (op0))
6688 inner = XEXP (op0, 0);
6689 comparator = XEXP (op0, 1);
6690 cmpcode = GET_CODE (op0);
6692 else
6694 inner = op0;
6695 comparator = const0_rtx;
6696 cmpcode = NE;
6699 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6701 /* Conditional branch. */
6702 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6703 return true;
6704 else
6706 if (cmpcode == NE || cmpcode == EQ)
6708 if (comparator == const0_rtx)
6710 /* TBZ/TBNZ/CBZ/CBNZ. */
6711 if (GET_CODE (inner) == ZERO_EXTRACT)
6712 /* TBZ/TBNZ. */
6713 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6714 ZERO_EXTRACT, 0, speed);
6715 else
6716 /* CBZ/CBNZ. */
6717 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6719 return true;
6722 else if (cmpcode == LT || cmpcode == GE)
6724 /* TBZ/TBNZ. */
6725 if (comparator == const0_rtx)
6726 return true;
6730 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6732 /* CCMP. */
6733 if (GET_CODE (op1) == COMPARE)
6735 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6736 if (XEXP (op1, 1) == const0_rtx)
6737 *cost += 1;
6738 if (speed)
6740 machine_mode mode = GET_MODE (XEXP (op1, 0));
6741 const struct cpu_cost_table *extra_cost
6742 = aarch64_tune_params.insn_extra_cost;
6744 if (GET_MODE_CLASS (mode) == MODE_INT)
6745 *cost += extra_cost->alu.arith;
6746 else
6747 *cost += extra_cost->fp[mode == DFmode].compare;
6749 return true;
6752 /* It's a conditional operation based on the status flags,
6753 so it must be some flavor of CSEL. */
6755 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6756 if (GET_CODE (op1) == NEG
6757 || GET_CODE (op1) == NOT
6758 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6759 op1 = XEXP (op1, 0);
6760 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6762 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6763 op1 = XEXP (op1, 0);
6764 op2 = XEXP (op2, 0);
6767 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6768 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6769 return true;
6772 /* We don't know what this is, cost all operands. */
6773 return false;
6776 /* Check whether X is a bitfield operation of the form shift + extend that
6777 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6778 operand to which the bitfield operation is applied. Otherwise return
6779 NULL_RTX. */
6781 static rtx
6782 aarch64_extend_bitfield_pattern_p (rtx x)
6784 rtx_code outer_code = GET_CODE (x);
6785 machine_mode outer_mode = GET_MODE (x);
6787 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6788 && outer_mode != SImode && outer_mode != DImode)
6789 return NULL_RTX;
6791 rtx inner = XEXP (x, 0);
6792 rtx_code inner_code = GET_CODE (inner);
6793 machine_mode inner_mode = GET_MODE (inner);
6794 rtx op = NULL_RTX;
6796 switch (inner_code)
6798 case ASHIFT:
6799 if (CONST_INT_P (XEXP (inner, 1))
6800 && (inner_mode == QImode || inner_mode == HImode))
6801 op = XEXP (inner, 0);
6802 break;
6803 case LSHIFTRT:
6804 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6805 && (inner_mode == QImode || inner_mode == HImode))
6806 op = XEXP (inner, 0);
6807 break;
6808 case ASHIFTRT:
6809 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6810 && (inner_mode == QImode || inner_mode == HImode))
6811 op = XEXP (inner, 0);
6812 break;
6813 default:
6814 break;
6817 return op;
6820 /* Return true if the mask and a shift amount from an RTX of the form
6821 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6822 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6824 bool
6825 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6827 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6828 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6829 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6830 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6833 /* Calculate the cost of calculating X, storing it in *COST. Result
6834 is true if the total cost of the operation has now been calculated. */
6835 static bool
6836 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6837 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6839 rtx op0, op1, op2;
6840 const struct cpu_cost_table *extra_cost
6841 = aarch64_tune_params.insn_extra_cost;
6842 int code = GET_CODE (x);
6844 /* By default, assume that everything has equivalent cost to the
6845 cheapest instruction. Any additional costs are applied as a delta
6846 above this default. */
6847 *cost = COSTS_N_INSNS (1);
6849 switch (code)
6851 case SET:
6852 /* The cost depends entirely on the operands to SET. */
6853 *cost = 0;
6854 op0 = SET_DEST (x);
6855 op1 = SET_SRC (x);
6857 switch (GET_CODE (op0))
6859 case MEM:
6860 if (speed)
6862 rtx address = XEXP (op0, 0);
6863 if (VECTOR_MODE_P (mode))
6864 *cost += extra_cost->ldst.storev;
6865 else if (GET_MODE_CLASS (mode) == MODE_INT)
6866 *cost += extra_cost->ldst.store;
6867 else if (mode == SFmode)
6868 *cost += extra_cost->ldst.storef;
6869 else if (mode == DFmode)
6870 *cost += extra_cost->ldst.stored;
6872 *cost +=
6873 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6874 0, speed));
6877 *cost += rtx_cost (op1, mode, SET, 1, speed);
6878 return true;
6880 case SUBREG:
6881 if (! REG_P (SUBREG_REG (op0)))
6882 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6884 /* Fall through. */
6885 case REG:
6886 /* The cost is one per vector-register copied. */
6887 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6889 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6890 / GET_MODE_SIZE (V4SImode);
6891 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6893 /* const0_rtx is in general free, but we will use an
6894 instruction to set a register to 0. */
6895 else if (REG_P (op1) || op1 == const0_rtx)
6897 /* The cost is 1 per register copied. */
6898 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6899 / UNITS_PER_WORD;
6900 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6902 else
6903 /* Cost is just the cost of the RHS of the set. */
6904 *cost += rtx_cost (op1, mode, SET, 1, speed);
6905 return true;
6907 case ZERO_EXTRACT:
6908 case SIGN_EXTRACT:
6909 /* Bit-field insertion. Strip any redundant widening of
6910 the RHS to meet the width of the target. */
6911 if (GET_CODE (op1) == SUBREG)
6912 op1 = SUBREG_REG (op1);
6913 if ((GET_CODE (op1) == ZERO_EXTEND
6914 || GET_CODE (op1) == SIGN_EXTEND)
6915 && CONST_INT_P (XEXP (op0, 1))
6916 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6917 >= INTVAL (XEXP (op0, 1))))
6918 op1 = XEXP (op1, 0);
6920 if (CONST_INT_P (op1))
6922 /* MOV immediate is assumed to always be cheap. */
6923 *cost = COSTS_N_INSNS (1);
6925 else
6927 /* BFM. */
6928 if (speed)
6929 *cost += extra_cost->alu.bfi;
6930 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6933 return true;
6935 default:
6936 /* We can't make sense of this, assume default cost. */
6937 *cost = COSTS_N_INSNS (1);
6938 return false;
6940 return false;
6942 case CONST_INT:
6943 /* If an instruction can incorporate a constant within the
6944 instruction, the instruction's expression avoids calling
6945 rtx_cost() on the constant. If rtx_cost() is called on a
6946 constant, then it is usually because the constant must be
6947 moved into a register by one or more instructions.
6949 The exception is constant 0, which can be expressed
6950 as XZR/WZR and is therefore free. The exception to this is
6951 if we have (set (reg) (const0_rtx)) in which case we must cost
6952 the move. However, we can catch that when we cost the SET, so
6953 we don't need to consider that here. */
6954 if (x == const0_rtx)
6955 *cost = 0;
6956 else
6958 /* To an approximation, building any other constant is
6959 proportionally expensive to the number of instructions
6960 required to build that constant. This is true whether we
6961 are compiling for SPEED or otherwise. */
6962 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6963 (NULL_RTX, x, false, mode));
6965 return true;
6967 case CONST_DOUBLE:
6969 /* First determine number of instructions to do the move
6970 as an integer constant. */
6971 if (!aarch64_float_const_representable_p (x)
6972 && !aarch64_can_const_movi_rtx_p (x, mode)
6973 && aarch64_float_const_rtx_p (x))
6975 unsigned HOST_WIDE_INT ival;
6976 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6977 gcc_assert (succeed);
6979 machine_mode imode = mode == HFmode ? SImode
6980 : int_mode_for_mode (mode);
6981 int ncost = aarch64_internal_mov_immediate
6982 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6983 *cost += COSTS_N_INSNS (ncost);
6984 return true;
6987 if (speed)
6989 /* mov[df,sf]_aarch64. */
6990 if (aarch64_float_const_representable_p (x))
6991 /* FMOV (scalar immediate). */
6992 *cost += extra_cost->fp[mode == DFmode].fpconst;
6993 else if (!aarch64_float_const_zero_rtx_p (x))
6995 /* This will be a load from memory. */
6996 if (mode == DFmode)
6997 *cost += extra_cost->ldst.loadd;
6998 else
6999 *cost += extra_cost->ldst.loadf;
7001 else
7002 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7003 or MOV v0.s[0], wzr - neither of which are modeled by the
7004 cost tables. Just use the default cost. */
7009 return true;
7011 case MEM:
7012 if (speed)
7014 /* For loads we want the base cost of a load, plus an
7015 approximation for the additional cost of the addressing
7016 mode. */
7017 rtx address = XEXP (x, 0);
7018 if (VECTOR_MODE_P (mode))
7019 *cost += extra_cost->ldst.loadv;
7020 else if (GET_MODE_CLASS (mode) == MODE_INT)
7021 *cost += extra_cost->ldst.load;
7022 else if (mode == SFmode)
7023 *cost += extra_cost->ldst.loadf;
7024 else if (mode == DFmode)
7025 *cost += extra_cost->ldst.loadd;
7027 *cost +=
7028 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7029 0, speed));
7032 return true;
7034 case NEG:
7035 op0 = XEXP (x, 0);
7037 if (VECTOR_MODE_P (mode))
7039 if (speed)
7041 /* FNEG. */
7042 *cost += extra_cost->vect.alu;
7044 return false;
7047 if (GET_MODE_CLASS (mode) == MODE_INT)
7049 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7050 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7052 /* CSETM. */
7053 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7054 return true;
7057 /* Cost this as SUB wzr, X. */
7058 op0 = CONST0_RTX (mode);
7059 op1 = XEXP (x, 0);
7060 goto cost_minus;
7063 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7065 /* Support (neg(fma...)) as a single instruction only if
7066 sign of zeros is unimportant. This matches the decision
7067 making in aarch64.md. */
7068 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7070 /* FNMADD. */
7071 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7072 return true;
7074 if (GET_CODE (op0) == MULT)
7076 /* FNMUL. */
7077 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7078 return true;
7080 if (speed)
7081 /* FNEG. */
7082 *cost += extra_cost->fp[mode == DFmode].neg;
7083 return false;
7086 return false;
7088 case CLRSB:
7089 case CLZ:
7090 if (speed)
7092 if (VECTOR_MODE_P (mode))
7093 *cost += extra_cost->vect.alu;
7094 else
7095 *cost += extra_cost->alu.clz;
7098 return false;
7100 case COMPARE:
7101 op0 = XEXP (x, 0);
7102 op1 = XEXP (x, 1);
7104 if (op1 == const0_rtx
7105 && GET_CODE (op0) == AND)
7107 x = op0;
7108 mode = GET_MODE (op0);
7109 goto cost_logic;
7112 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7114 /* TODO: A write to the CC flags possibly costs extra, this
7115 needs encoding in the cost tables. */
7117 mode = GET_MODE (op0);
7118 /* ANDS. */
7119 if (GET_CODE (op0) == AND)
7121 x = op0;
7122 goto cost_logic;
7125 if (GET_CODE (op0) == PLUS)
7127 /* ADDS (and CMN alias). */
7128 x = op0;
7129 goto cost_plus;
7132 if (GET_CODE (op0) == MINUS)
7134 /* SUBS. */
7135 x = op0;
7136 goto cost_minus;
7139 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7140 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7141 && CONST_INT_P (XEXP (op0, 2)))
7143 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7144 Handle it here directly rather than going to cost_logic
7145 since we know the immediate generated for the TST is valid
7146 so we can avoid creating an intermediate rtx for it only
7147 for costing purposes. */
7148 if (speed)
7149 *cost += extra_cost->alu.logical;
7151 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7152 ZERO_EXTRACT, 0, speed);
7153 return true;
7156 if (GET_CODE (op1) == NEG)
7158 /* CMN. */
7159 if (speed)
7160 *cost += extra_cost->alu.arith;
7162 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7163 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7164 return true;
7167 /* CMP.
7169 Compare can freely swap the order of operands, and
7170 canonicalization puts the more complex operation first.
7171 But the integer MINUS logic expects the shift/extend
7172 operation in op1. */
7173 if (! (REG_P (op0)
7174 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7176 op0 = XEXP (x, 1);
7177 op1 = XEXP (x, 0);
7179 goto cost_minus;
7182 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7184 /* FCMP. */
7185 if (speed)
7186 *cost += extra_cost->fp[mode == DFmode].compare;
7188 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7190 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7191 /* FCMP supports constant 0.0 for no extra cost. */
7192 return true;
7194 return false;
7197 if (VECTOR_MODE_P (mode))
7199 /* Vector compare. */
7200 if (speed)
7201 *cost += extra_cost->vect.alu;
7203 if (aarch64_float_const_zero_rtx_p (op1))
7205 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7206 cost. */
7207 return true;
7209 return false;
7211 return false;
7213 case MINUS:
7215 op0 = XEXP (x, 0);
7216 op1 = XEXP (x, 1);
7218 cost_minus:
7219 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7221 /* Detect valid immediates. */
7222 if ((GET_MODE_CLASS (mode) == MODE_INT
7223 || (GET_MODE_CLASS (mode) == MODE_CC
7224 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7225 && CONST_INT_P (op1)
7226 && aarch64_uimm12_shift (INTVAL (op1)))
7228 if (speed)
7229 /* SUB(S) (immediate). */
7230 *cost += extra_cost->alu.arith;
7231 return true;
7234 /* Look for SUB (extended register). */
7235 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7237 if (speed)
7238 *cost += extra_cost->alu.extend_arith;
7240 op1 = aarch64_strip_extend (op1, true);
7241 *cost += rtx_cost (op1, VOIDmode,
7242 (enum rtx_code) GET_CODE (op1), 0, speed);
7243 return true;
7246 rtx new_op1 = aarch64_strip_extend (op1, false);
7248 /* Cost this as an FMA-alike operation. */
7249 if ((GET_CODE (new_op1) == MULT
7250 || aarch64_shift_p (GET_CODE (new_op1)))
7251 && code != COMPARE)
7253 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7254 (enum rtx_code) code,
7255 speed);
7256 return true;
7259 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7261 if (speed)
7263 if (VECTOR_MODE_P (mode))
7265 /* Vector SUB. */
7266 *cost += extra_cost->vect.alu;
7268 else if (GET_MODE_CLASS (mode) == MODE_INT)
7270 /* SUB(S). */
7271 *cost += extra_cost->alu.arith;
7273 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7275 /* FSUB. */
7276 *cost += extra_cost->fp[mode == DFmode].addsub;
7279 return true;
7282 case PLUS:
7284 rtx new_op0;
7286 op0 = XEXP (x, 0);
7287 op1 = XEXP (x, 1);
7289 cost_plus:
7290 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7291 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7293 /* CSINC. */
7294 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7295 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7296 return true;
7299 if (GET_MODE_CLASS (mode) == MODE_INT
7300 && CONST_INT_P (op1)
7301 && aarch64_uimm12_shift (INTVAL (op1)))
7303 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7305 if (speed)
7306 /* ADD (immediate). */
7307 *cost += extra_cost->alu.arith;
7308 return true;
7311 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7313 /* Look for ADD (extended register). */
7314 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7316 if (speed)
7317 *cost += extra_cost->alu.extend_arith;
7319 op0 = aarch64_strip_extend (op0, true);
7320 *cost += rtx_cost (op0, VOIDmode,
7321 (enum rtx_code) GET_CODE (op0), 0, speed);
7322 return true;
7325 /* Strip any extend, leave shifts behind as we will
7326 cost them through mult_cost. */
7327 new_op0 = aarch64_strip_extend (op0, false);
7329 if (GET_CODE (new_op0) == MULT
7330 || aarch64_shift_p (GET_CODE (new_op0)))
7332 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7333 speed);
7334 return true;
7337 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7339 if (speed)
7341 if (VECTOR_MODE_P (mode))
7343 /* Vector ADD. */
7344 *cost += extra_cost->vect.alu;
7346 else if (GET_MODE_CLASS (mode) == MODE_INT)
7348 /* ADD. */
7349 *cost += extra_cost->alu.arith;
7351 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7353 /* FADD. */
7354 *cost += extra_cost->fp[mode == DFmode].addsub;
7357 return true;
7360 case BSWAP:
7361 *cost = COSTS_N_INSNS (1);
7363 if (speed)
7365 if (VECTOR_MODE_P (mode))
7366 *cost += extra_cost->vect.alu;
7367 else
7368 *cost += extra_cost->alu.rev;
7370 return false;
7372 case IOR:
7373 if (aarch_rev16_p (x))
7375 *cost = COSTS_N_INSNS (1);
7377 if (speed)
7379 if (VECTOR_MODE_P (mode))
7380 *cost += extra_cost->vect.alu;
7381 else
7382 *cost += extra_cost->alu.rev;
7384 return true;
7387 if (aarch64_extr_rtx_p (x, &op0, &op1))
7389 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7390 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7391 if (speed)
7392 *cost += extra_cost->alu.shift;
7394 return true;
7396 /* Fall through. */
7397 case XOR:
7398 case AND:
7399 cost_logic:
7400 op0 = XEXP (x, 0);
7401 op1 = XEXP (x, 1);
7403 if (VECTOR_MODE_P (mode))
7405 if (speed)
7406 *cost += extra_cost->vect.alu;
7407 return true;
7410 if (code == AND
7411 && GET_CODE (op0) == MULT
7412 && CONST_INT_P (XEXP (op0, 1))
7413 && CONST_INT_P (op1)
7414 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7415 INTVAL (op1)) != 0)
7417 /* This is a UBFM/SBFM. */
7418 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7419 if (speed)
7420 *cost += extra_cost->alu.bfx;
7421 return true;
7424 if (GET_MODE_CLASS (mode) == MODE_INT)
7426 if (CONST_INT_P (op1))
7428 /* We have a mask + shift version of a UBFIZ
7429 i.e. the *andim_ashift<mode>_bfiz pattern. */
7430 if (GET_CODE (op0) == ASHIFT
7431 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7432 XEXP (op0, 1)))
7434 *cost += rtx_cost (XEXP (op0, 0), mode,
7435 (enum rtx_code) code, 0, speed);
7436 if (speed)
7437 *cost += extra_cost->alu.bfx;
7439 return true;
7441 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7443 /* We possibly get the immediate for free, this is not
7444 modelled. */
7445 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7446 if (speed)
7447 *cost += extra_cost->alu.logical;
7449 return true;
7452 else
7454 rtx new_op0 = op0;
7456 /* Handle ORN, EON, or BIC. */
7457 if (GET_CODE (op0) == NOT)
7458 op0 = XEXP (op0, 0);
7460 new_op0 = aarch64_strip_shift (op0);
7462 /* If we had a shift on op0 then this is a logical-shift-
7463 by-register/immediate operation. Otherwise, this is just
7464 a logical operation. */
7465 if (speed)
7467 if (new_op0 != op0)
7469 /* Shift by immediate. */
7470 if (CONST_INT_P (XEXP (op0, 1)))
7471 *cost += extra_cost->alu.log_shift;
7472 else
7473 *cost += extra_cost->alu.log_shift_reg;
7475 else
7476 *cost += extra_cost->alu.logical;
7479 /* In both cases we want to cost both operands. */
7480 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7481 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7483 return true;
7486 return false;
7488 case NOT:
7489 x = XEXP (x, 0);
7490 op0 = aarch64_strip_shift (x);
7492 if (VECTOR_MODE_P (mode))
7494 /* Vector NOT. */
7495 *cost += extra_cost->vect.alu;
7496 return false;
7499 /* MVN-shifted-reg. */
7500 if (op0 != x)
7502 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7504 if (speed)
7505 *cost += extra_cost->alu.log_shift;
7507 return true;
7509 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7510 Handle the second form here taking care that 'a' in the above can
7511 be a shift. */
7512 else if (GET_CODE (op0) == XOR)
7514 rtx newop0 = XEXP (op0, 0);
7515 rtx newop1 = XEXP (op0, 1);
7516 rtx op0_stripped = aarch64_strip_shift (newop0);
7518 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7519 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7521 if (speed)
7523 if (op0_stripped != newop0)
7524 *cost += extra_cost->alu.log_shift;
7525 else
7526 *cost += extra_cost->alu.logical;
7529 return true;
7531 /* MVN. */
7532 if (speed)
7533 *cost += extra_cost->alu.logical;
7535 return false;
7537 case ZERO_EXTEND:
7539 op0 = XEXP (x, 0);
7540 /* If a value is written in SI mode, then zero extended to DI
7541 mode, the operation will in general be free as a write to
7542 a 'w' register implicitly zeroes the upper bits of an 'x'
7543 register. However, if this is
7545 (set (reg) (zero_extend (reg)))
7547 we must cost the explicit register move. */
7548 if (mode == DImode
7549 && GET_MODE (op0) == SImode
7550 && outer == SET)
7552 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7554 /* If OP_COST is non-zero, then the cost of the zero extend
7555 is effectively the cost of the inner operation. Otherwise
7556 we have a MOV instruction and we take the cost from the MOV
7557 itself. This is true independently of whether we are
7558 optimizing for space or time. */
7559 if (op_cost)
7560 *cost = op_cost;
7562 return true;
7564 else if (MEM_P (op0))
7566 /* All loads can zero extend to any size for free. */
7567 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7568 return true;
7571 op0 = aarch64_extend_bitfield_pattern_p (x);
7572 if (op0)
7574 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7575 if (speed)
7576 *cost += extra_cost->alu.bfx;
7577 return true;
7580 if (speed)
7582 if (VECTOR_MODE_P (mode))
7584 /* UMOV. */
7585 *cost += extra_cost->vect.alu;
7587 else
7589 /* We generate an AND instead of UXTB/UXTH. */
7590 *cost += extra_cost->alu.logical;
7593 return false;
7595 case SIGN_EXTEND:
7596 if (MEM_P (XEXP (x, 0)))
7598 /* LDRSH. */
7599 if (speed)
7601 rtx address = XEXP (XEXP (x, 0), 0);
7602 *cost += extra_cost->ldst.load_sign_extend;
7604 *cost +=
7605 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7606 0, speed));
7608 return true;
7611 op0 = aarch64_extend_bitfield_pattern_p (x);
7612 if (op0)
7614 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7615 if (speed)
7616 *cost += extra_cost->alu.bfx;
7617 return true;
7620 if (speed)
7622 if (VECTOR_MODE_P (mode))
7623 *cost += extra_cost->vect.alu;
7624 else
7625 *cost += extra_cost->alu.extend;
7627 return false;
7629 case ASHIFT:
7630 op0 = XEXP (x, 0);
7631 op1 = XEXP (x, 1);
7633 if (CONST_INT_P (op1))
7635 if (speed)
7637 if (VECTOR_MODE_P (mode))
7639 /* Vector shift (immediate). */
7640 *cost += extra_cost->vect.alu;
7642 else
7644 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7645 aliases. */
7646 *cost += extra_cost->alu.shift;
7650 /* We can incorporate zero/sign extend for free. */
7651 if (GET_CODE (op0) == ZERO_EXTEND
7652 || GET_CODE (op0) == SIGN_EXTEND)
7653 op0 = XEXP (op0, 0);
7655 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7656 return true;
7658 else
7660 if (VECTOR_MODE_P (mode))
7662 if (speed)
7663 /* Vector shift (register). */
7664 *cost += extra_cost->vect.alu;
7666 else
7668 if (speed)
7669 /* LSLV. */
7670 *cost += extra_cost->alu.shift_reg;
7672 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7673 && CONST_INT_P (XEXP (op1, 1))
7674 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7676 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7677 /* We already demanded XEXP (op1, 0) to be REG_P, so
7678 don't recurse into it. */
7679 return true;
7682 return false; /* All arguments need to be in registers. */
7685 case ROTATE:
7686 case ROTATERT:
7687 case LSHIFTRT:
7688 case ASHIFTRT:
7689 op0 = XEXP (x, 0);
7690 op1 = XEXP (x, 1);
7692 if (CONST_INT_P (op1))
7694 /* ASR (immediate) and friends. */
7695 if (speed)
7697 if (VECTOR_MODE_P (mode))
7698 *cost += extra_cost->vect.alu;
7699 else
7700 *cost += extra_cost->alu.shift;
7703 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7704 return true;
7706 else
7708 if (VECTOR_MODE_P (mode))
7710 if (speed)
7711 /* Vector shift (register). */
7712 *cost += extra_cost->vect.alu;
7714 else
7716 if (speed)
7717 /* ASR (register) and friends. */
7718 *cost += extra_cost->alu.shift_reg;
7720 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7721 && CONST_INT_P (XEXP (op1, 1))
7722 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7724 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7725 /* We already demanded XEXP (op1, 0) to be REG_P, so
7726 don't recurse into it. */
7727 return true;
7730 return false; /* All arguments need to be in registers. */
7733 case SYMBOL_REF:
7735 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7736 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7738 /* LDR. */
7739 if (speed)
7740 *cost += extra_cost->ldst.load;
7742 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7743 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7745 /* ADRP, followed by ADD. */
7746 *cost += COSTS_N_INSNS (1);
7747 if (speed)
7748 *cost += 2 * extra_cost->alu.arith;
7750 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7751 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7753 /* ADR. */
7754 if (speed)
7755 *cost += extra_cost->alu.arith;
7758 if (flag_pic)
7760 /* One extra load instruction, after accessing the GOT. */
7761 *cost += COSTS_N_INSNS (1);
7762 if (speed)
7763 *cost += extra_cost->ldst.load;
7765 return true;
7767 case HIGH:
7768 case LO_SUM:
7769 /* ADRP/ADD (immediate). */
7770 if (speed)
7771 *cost += extra_cost->alu.arith;
7772 return true;
7774 case ZERO_EXTRACT:
7775 case SIGN_EXTRACT:
7776 /* UBFX/SBFX. */
7777 if (speed)
7779 if (VECTOR_MODE_P (mode))
7780 *cost += extra_cost->vect.alu;
7781 else
7782 *cost += extra_cost->alu.bfx;
7785 /* We can trust that the immediates used will be correct (there
7786 are no by-register forms), so we need only cost op0. */
7787 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7788 return true;
7790 case MULT:
7791 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7792 /* aarch64_rtx_mult_cost always handles recursion to its
7793 operands. */
7794 return true;
7796 case MOD:
7797 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7798 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7799 an unconditional negate. This case should only ever be reached through
7800 the set_smod_pow2_cheap check in expmed.c. */
7801 if (CONST_INT_P (XEXP (x, 1))
7802 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7803 && (mode == SImode || mode == DImode))
7805 /* We expand to 4 instructions. Reset the baseline. */
7806 *cost = COSTS_N_INSNS (4);
7808 if (speed)
7809 *cost += 2 * extra_cost->alu.logical
7810 + 2 * extra_cost->alu.arith;
7812 return true;
7815 /* Fall-through. */
7816 case UMOD:
7817 if (speed)
7819 /* Slighly prefer UMOD over SMOD. */
7820 if (VECTOR_MODE_P (mode))
7821 *cost += extra_cost->vect.alu;
7822 else if (GET_MODE_CLASS (mode) == MODE_INT)
7823 *cost += (extra_cost->mult[mode == DImode].add
7824 + extra_cost->mult[mode == DImode].idiv
7825 + (code == MOD ? 1 : 0));
7827 return false; /* All arguments need to be in registers. */
7829 case DIV:
7830 case UDIV:
7831 case SQRT:
7832 if (speed)
7834 if (VECTOR_MODE_P (mode))
7835 *cost += extra_cost->vect.alu;
7836 else if (GET_MODE_CLASS (mode) == MODE_INT)
7837 /* There is no integer SQRT, so only DIV and UDIV can get
7838 here. */
7839 *cost += (extra_cost->mult[mode == DImode].idiv
7840 /* Slighly prefer UDIV over SDIV. */
7841 + (code == DIV ? 1 : 0));
7842 else
7843 *cost += extra_cost->fp[mode == DFmode].div;
7845 return false; /* All arguments need to be in registers. */
7847 case IF_THEN_ELSE:
7848 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7849 XEXP (x, 2), cost, speed);
7851 case EQ:
7852 case NE:
7853 case GT:
7854 case GTU:
7855 case LT:
7856 case LTU:
7857 case GE:
7858 case GEU:
7859 case LE:
7860 case LEU:
7862 return false; /* All arguments must be in registers. */
7864 case FMA:
7865 op0 = XEXP (x, 0);
7866 op1 = XEXP (x, 1);
7867 op2 = XEXP (x, 2);
7869 if (speed)
7871 if (VECTOR_MODE_P (mode))
7872 *cost += extra_cost->vect.alu;
7873 else
7874 *cost += extra_cost->fp[mode == DFmode].fma;
7877 /* FMSUB, FNMADD, and FNMSUB are free. */
7878 if (GET_CODE (op0) == NEG)
7879 op0 = XEXP (op0, 0);
7881 if (GET_CODE (op2) == NEG)
7882 op2 = XEXP (op2, 0);
7884 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7885 and the by-element operand as operand 0. */
7886 if (GET_CODE (op1) == NEG)
7887 op1 = XEXP (op1, 0);
7889 /* Catch vector-by-element operations. The by-element operand can
7890 either be (vec_duplicate (vec_select (x))) or just
7891 (vec_select (x)), depending on whether we are multiplying by
7892 a vector or a scalar.
7894 Canonicalization is not very good in these cases, FMA4 will put the
7895 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7896 if (GET_CODE (op0) == VEC_DUPLICATE)
7897 op0 = XEXP (op0, 0);
7898 else if (GET_CODE (op1) == VEC_DUPLICATE)
7899 op1 = XEXP (op1, 0);
7901 if (GET_CODE (op0) == VEC_SELECT)
7902 op0 = XEXP (op0, 0);
7903 else if (GET_CODE (op1) == VEC_SELECT)
7904 op1 = XEXP (op1, 0);
7906 /* If the remaining parameters are not registers,
7907 get the cost to put them into registers. */
7908 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7909 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7910 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7911 return true;
7913 case FLOAT:
7914 case UNSIGNED_FLOAT:
7915 if (speed)
7916 *cost += extra_cost->fp[mode == DFmode].fromint;
7917 return false;
7919 case FLOAT_EXTEND:
7920 if (speed)
7922 if (VECTOR_MODE_P (mode))
7924 /*Vector truncate. */
7925 *cost += extra_cost->vect.alu;
7927 else
7928 *cost += extra_cost->fp[mode == DFmode].widen;
7930 return false;
7932 case FLOAT_TRUNCATE:
7933 if (speed)
7935 if (VECTOR_MODE_P (mode))
7937 /*Vector conversion. */
7938 *cost += extra_cost->vect.alu;
7940 else
7941 *cost += extra_cost->fp[mode == DFmode].narrow;
7943 return false;
7945 case FIX:
7946 case UNSIGNED_FIX:
7947 x = XEXP (x, 0);
7948 /* Strip the rounding part. They will all be implemented
7949 by the fcvt* family of instructions anyway. */
7950 if (GET_CODE (x) == UNSPEC)
7952 unsigned int uns_code = XINT (x, 1);
7954 if (uns_code == UNSPEC_FRINTA
7955 || uns_code == UNSPEC_FRINTM
7956 || uns_code == UNSPEC_FRINTN
7957 || uns_code == UNSPEC_FRINTP
7958 || uns_code == UNSPEC_FRINTZ)
7959 x = XVECEXP (x, 0, 0);
7962 if (speed)
7964 if (VECTOR_MODE_P (mode))
7965 *cost += extra_cost->vect.alu;
7966 else
7967 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7970 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7971 fixed-point fcvt. */
7972 if (GET_CODE (x) == MULT
7973 && ((VECTOR_MODE_P (mode)
7974 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7975 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7977 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7978 0, speed);
7979 return true;
7982 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7983 return true;
7985 case ABS:
7986 if (VECTOR_MODE_P (mode))
7988 /* ABS (vector). */
7989 if (speed)
7990 *cost += extra_cost->vect.alu;
7992 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7994 op0 = XEXP (x, 0);
7996 /* FABD, which is analogous to FADD. */
7997 if (GET_CODE (op0) == MINUS)
7999 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8000 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8001 if (speed)
8002 *cost += extra_cost->fp[mode == DFmode].addsub;
8004 return true;
8006 /* Simple FABS is analogous to FNEG. */
8007 if (speed)
8008 *cost += extra_cost->fp[mode == DFmode].neg;
8010 else
8012 /* Integer ABS will either be split to
8013 two arithmetic instructions, or will be an ABS
8014 (scalar), which we don't model. */
8015 *cost = COSTS_N_INSNS (2);
8016 if (speed)
8017 *cost += 2 * extra_cost->alu.arith;
8019 return false;
8021 case SMAX:
8022 case SMIN:
8023 if (speed)
8025 if (VECTOR_MODE_P (mode))
8026 *cost += extra_cost->vect.alu;
8027 else
8029 /* FMAXNM/FMINNM/FMAX/FMIN.
8030 TODO: This may not be accurate for all implementations, but
8031 we do not model this in the cost tables. */
8032 *cost += extra_cost->fp[mode == DFmode].addsub;
8035 return false;
8037 case UNSPEC:
8038 /* The floating point round to integer frint* instructions. */
8039 if (aarch64_frint_unspec_p (XINT (x, 1)))
8041 if (speed)
8042 *cost += extra_cost->fp[mode == DFmode].roundint;
8044 return false;
8047 if (XINT (x, 1) == UNSPEC_RBIT)
8049 if (speed)
8050 *cost += extra_cost->alu.rev;
8052 return false;
8054 break;
8056 case TRUNCATE:
8058 /* Decompose <su>muldi3_highpart. */
8059 if (/* (truncate:DI */
8060 mode == DImode
8061 /* (lshiftrt:TI */
8062 && GET_MODE (XEXP (x, 0)) == TImode
8063 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8064 /* (mult:TI */
8065 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8066 /* (ANY_EXTEND:TI (reg:DI))
8067 (ANY_EXTEND:TI (reg:DI))) */
8068 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8069 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8070 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8071 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8072 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8073 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8074 /* (const_int 64) */
8075 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8076 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8078 /* UMULH/SMULH. */
8079 if (speed)
8080 *cost += extra_cost->mult[mode == DImode].extend;
8081 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8082 mode, MULT, 0, speed);
8083 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8084 mode, MULT, 1, speed);
8085 return true;
8088 /* Fall through. */
8089 default:
8090 break;
8093 if (dump_file
8094 && flag_aarch64_verbose_cost)
8095 fprintf (dump_file,
8096 "\nFailed to cost RTX. Assuming default cost.\n");
8098 return true;
8101 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8102 calculated for X. This cost is stored in *COST. Returns true
8103 if the total cost of X was calculated. */
8104 static bool
8105 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8106 int param, int *cost, bool speed)
8108 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8110 if (dump_file
8111 && flag_aarch64_verbose_cost)
8113 print_rtl_single (dump_file, x);
8114 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8115 speed ? "Hot" : "Cold",
8116 *cost, result ? "final" : "partial");
8119 return result;
8122 static int
8123 aarch64_register_move_cost (machine_mode mode,
8124 reg_class_t from_i, reg_class_t to_i)
8126 enum reg_class from = (enum reg_class) from_i;
8127 enum reg_class to = (enum reg_class) to_i;
8128 const struct cpu_regmove_cost *regmove_cost
8129 = aarch64_tune_params.regmove_cost;
8131 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8132 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8133 to = GENERAL_REGS;
8135 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8136 from = GENERAL_REGS;
8138 /* Moving between GPR and stack cost is the same as GP2GP. */
8139 if ((from == GENERAL_REGS && to == STACK_REG)
8140 || (to == GENERAL_REGS && from == STACK_REG))
8141 return regmove_cost->GP2GP;
8143 /* To/From the stack register, we move via the gprs. */
8144 if (to == STACK_REG || from == STACK_REG)
8145 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8146 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8148 if (GET_MODE_SIZE (mode) == 16)
8150 /* 128-bit operations on general registers require 2 instructions. */
8151 if (from == GENERAL_REGS && to == GENERAL_REGS)
8152 return regmove_cost->GP2GP * 2;
8153 else if (from == GENERAL_REGS)
8154 return regmove_cost->GP2FP * 2;
8155 else if (to == GENERAL_REGS)
8156 return regmove_cost->FP2GP * 2;
8158 /* When AdvSIMD instructions are disabled it is not possible to move
8159 a 128-bit value directly between Q registers. This is handled in
8160 secondary reload. A general register is used as a scratch to move
8161 the upper DI value and the lower DI value is moved directly,
8162 hence the cost is the sum of three moves. */
8163 if (! TARGET_SIMD)
8164 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8166 return regmove_cost->FP2FP;
8169 if (from == GENERAL_REGS && to == GENERAL_REGS)
8170 return regmove_cost->GP2GP;
8171 else if (from == GENERAL_REGS)
8172 return regmove_cost->GP2FP;
8173 else if (to == GENERAL_REGS)
8174 return regmove_cost->FP2GP;
8176 return regmove_cost->FP2FP;
8179 static int
8180 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8181 reg_class_t rclass ATTRIBUTE_UNUSED,
8182 bool in ATTRIBUTE_UNUSED)
8184 return aarch64_tune_params.memmov_cost;
8187 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8188 to optimize 1.0/sqrt. */
8190 static bool
8191 use_rsqrt_p (machine_mode mode)
8193 return (!flag_trapping_math
8194 && flag_unsafe_math_optimizations
8195 && ((aarch64_tune_params.approx_modes->recip_sqrt
8196 & AARCH64_APPROX_MODE (mode))
8197 || flag_mrecip_low_precision_sqrt));
8200 /* Function to decide when to use the approximate reciprocal square root
8201 builtin. */
8203 static tree
8204 aarch64_builtin_reciprocal (tree fndecl)
8206 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8208 if (!use_rsqrt_p (mode))
8209 return NULL_TREE;
8210 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8213 typedef rtx (*rsqrte_type) (rtx, rtx);
8215 /* Select reciprocal square root initial estimate insn depending on machine
8216 mode. */
8218 static rsqrte_type
8219 get_rsqrte_type (machine_mode mode)
8221 switch (mode)
8223 case DFmode: return gen_aarch64_rsqrtedf;
8224 case SFmode: return gen_aarch64_rsqrtesf;
8225 case V2DFmode: return gen_aarch64_rsqrtev2df;
8226 case V2SFmode: return gen_aarch64_rsqrtev2sf;
8227 case V4SFmode: return gen_aarch64_rsqrtev4sf;
8228 default: gcc_unreachable ();
8232 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8234 /* Select reciprocal square root series step insn depending on machine mode. */
8236 static rsqrts_type
8237 get_rsqrts_type (machine_mode mode)
8239 switch (mode)
8241 case DFmode: return gen_aarch64_rsqrtsdf;
8242 case SFmode: return gen_aarch64_rsqrtssf;
8243 case V2DFmode: return gen_aarch64_rsqrtsv2df;
8244 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
8245 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
8246 default: gcc_unreachable ();
8250 /* Emit instruction sequence to compute either the approximate square root
8251 or its approximate reciprocal, depending on the flag RECP, and return
8252 whether the sequence was emitted or not. */
8254 bool
8255 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8257 machine_mode mode = GET_MODE (dst);
8259 if (GET_MODE_INNER (mode) == HFmode)
8261 gcc_assert (!recp);
8262 return false;
8265 machine_mode mmsk
8266 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
8267 GET_MODE_NUNITS (mode));
8268 if (!recp)
8270 if (!(flag_mlow_precision_sqrt
8271 || (aarch64_tune_params.approx_modes->sqrt
8272 & AARCH64_APPROX_MODE (mode))))
8273 return false;
8275 if (flag_finite_math_only
8276 || flag_trapping_math
8277 || !flag_unsafe_math_optimizations
8278 || optimize_function_for_size_p (cfun))
8279 return false;
8281 else
8282 /* Caller assumes we cannot fail. */
8283 gcc_assert (use_rsqrt_p (mode));
8286 rtx xmsk = gen_reg_rtx (mmsk);
8287 if (!recp)
8288 /* When calculating the approximate square root, compare the
8289 argument with 0.0 and create a mask. */
8290 emit_insn (gen_rtx_SET (xmsk,
8291 gen_rtx_NEG (mmsk,
8292 gen_rtx_EQ (mmsk, src,
8293 CONST0_RTX (mode)))));
8295 /* Estimate the approximate reciprocal square root. */
8296 rtx xdst = gen_reg_rtx (mode);
8297 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8299 /* Iterate over the series twice for SF and thrice for DF. */
8300 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8302 /* Optionally iterate over the series once less for faster performance
8303 while sacrificing the accuracy. */
8304 if ((recp && flag_mrecip_low_precision_sqrt)
8305 || (!recp && flag_mlow_precision_sqrt))
8306 iterations--;
8308 /* Iterate over the series to calculate the approximate reciprocal square
8309 root. */
8310 rtx x1 = gen_reg_rtx (mode);
8311 while (iterations--)
8313 rtx x2 = gen_reg_rtx (mode);
8314 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8316 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8318 if (iterations > 0)
8319 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8322 if (!recp)
8324 /* Qualify the approximate reciprocal square root when the argument is
8325 0.0 by squashing the intermediary result to 0.0. */
8326 rtx xtmp = gen_reg_rtx (mmsk);
8327 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8328 gen_rtx_SUBREG (mmsk, xdst, 0)));
8329 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8331 /* Calculate the approximate square root. */
8332 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8335 /* Finalize the approximation. */
8336 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8338 return true;
8341 typedef rtx (*recpe_type) (rtx, rtx);
8343 /* Select reciprocal initial estimate insn depending on machine mode. */
8345 static recpe_type
8346 get_recpe_type (machine_mode mode)
8348 switch (mode)
8350 case SFmode: return (gen_aarch64_frecpesf);
8351 case V2SFmode: return (gen_aarch64_frecpev2sf);
8352 case V4SFmode: return (gen_aarch64_frecpev4sf);
8353 case DFmode: return (gen_aarch64_frecpedf);
8354 case V2DFmode: return (gen_aarch64_frecpev2df);
8355 default: gcc_unreachable ();
8359 typedef rtx (*recps_type) (rtx, rtx, rtx);
8361 /* Select reciprocal series step insn depending on machine mode. */
8363 static recps_type
8364 get_recps_type (machine_mode mode)
8366 switch (mode)
8368 case SFmode: return (gen_aarch64_frecpssf);
8369 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8370 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8371 case DFmode: return (gen_aarch64_frecpsdf);
8372 case V2DFmode: return (gen_aarch64_frecpsv2df);
8373 default: gcc_unreachable ();
8377 /* Emit the instruction sequence to compute the approximation for the division
8378 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8380 bool
8381 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8383 machine_mode mode = GET_MODE (quo);
8385 if (GET_MODE_INNER (mode) == HFmode)
8386 return false;
8388 bool use_approx_division_p = (flag_mlow_precision_div
8389 || (aarch64_tune_params.approx_modes->division
8390 & AARCH64_APPROX_MODE (mode)));
8392 if (!flag_finite_math_only
8393 || flag_trapping_math
8394 || !flag_unsafe_math_optimizations
8395 || optimize_function_for_size_p (cfun)
8396 || !use_approx_division_p)
8397 return false;
8399 /* Estimate the approximate reciprocal. */
8400 rtx xrcp = gen_reg_rtx (mode);
8401 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8403 /* Iterate over the series twice for SF and thrice for DF. */
8404 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8406 /* Optionally iterate over the series once less for faster performance,
8407 while sacrificing the accuracy. */
8408 if (flag_mlow_precision_div)
8409 iterations--;
8411 /* Iterate over the series to calculate the approximate reciprocal. */
8412 rtx xtmp = gen_reg_rtx (mode);
8413 while (iterations--)
8415 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8417 if (iterations > 0)
8418 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8421 if (num != CONST1_RTX (mode))
8423 /* As the approximate reciprocal of DEN is already calculated, only
8424 calculate the approximate division when NUM is not 1.0. */
8425 rtx xnum = force_reg (mode, num);
8426 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8429 /* Finalize the approximation. */
8430 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8431 return true;
8434 /* Return the number of instructions that can be issued per cycle. */
8435 static int
8436 aarch64_sched_issue_rate (void)
8438 return aarch64_tune_params.issue_rate;
8441 static int
8442 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8444 int issue_rate = aarch64_sched_issue_rate ();
8446 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8450 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8451 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8452 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8454 static int
8455 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8456 int ready_index)
8458 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8462 /* Vectorizer cost model target hooks. */
8464 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8465 static int
8466 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8467 tree vectype,
8468 int misalign ATTRIBUTE_UNUSED)
8470 unsigned elements;
8471 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8472 bool fp = false;
8474 if (vectype != NULL)
8475 fp = FLOAT_TYPE_P (vectype);
8477 switch (type_of_cost)
8479 case scalar_stmt:
8480 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8482 case scalar_load:
8483 return costs->scalar_load_cost;
8485 case scalar_store:
8486 return costs->scalar_store_cost;
8488 case vector_stmt:
8489 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8491 case vector_load:
8492 return costs->vec_align_load_cost;
8494 case vector_store:
8495 return costs->vec_store_cost;
8497 case vec_to_scalar:
8498 return costs->vec_to_scalar_cost;
8500 case scalar_to_vec:
8501 return costs->scalar_to_vec_cost;
8503 case unaligned_load:
8504 return costs->vec_unalign_load_cost;
8506 case unaligned_store:
8507 return costs->vec_unalign_store_cost;
8509 case cond_branch_taken:
8510 return costs->cond_taken_branch_cost;
8512 case cond_branch_not_taken:
8513 return costs->cond_not_taken_branch_cost;
8515 case vec_perm:
8516 return costs->vec_permute_cost;
8518 case vec_promote_demote:
8519 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8521 case vec_construct:
8522 elements = TYPE_VECTOR_SUBPARTS (vectype);
8523 return elements / 2 + 1;
8525 default:
8526 gcc_unreachable ();
8530 /* Implement targetm.vectorize.add_stmt_cost. */
8531 static unsigned
8532 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8533 struct _stmt_vec_info *stmt_info, int misalign,
8534 enum vect_cost_model_location where)
8536 unsigned *cost = (unsigned *) data;
8537 unsigned retval = 0;
8539 if (flag_vect_cost_model)
8541 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8542 int stmt_cost =
8543 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8545 /* Statements in an inner loop relative to the loop being
8546 vectorized are weighted more heavily. The value here is
8547 arbitrary and could potentially be improved with analysis. */
8548 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8549 count *= 50; /* FIXME */
8551 retval = (unsigned) (count * stmt_cost);
8552 cost[where] += retval;
8555 return retval;
8558 static void initialize_aarch64_code_model (struct gcc_options *);
8560 /* Parse the TO_PARSE string and put the architecture struct that it
8561 selects into RES and the architectural features into ISA_FLAGS.
8562 Return an aarch64_parse_opt_result describing the parse result.
8563 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8565 static enum aarch64_parse_opt_result
8566 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8567 unsigned long *isa_flags)
8569 char *ext;
8570 const struct processor *arch;
8571 char *str = (char *) alloca (strlen (to_parse) + 1);
8572 size_t len;
8574 strcpy (str, to_parse);
8576 ext = strchr (str, '+');
8578 if (ext != NULL)
8579 len = ext - str;
8580 else
8581 len = strlen (str);
8583 if (len == 0)
8584 return AARCH64_PARSE_MISSING_ARG;
8587 /* Loop through the list of supported ARCHes to find a match. */
8588 for (arch = all_architectures; arch->name != NULL; arch++)
8590 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8592 unsigned long isa_temp = arch->flags;
8594 if (ext != NULL)
8596 /* TO_PARSE string contains at least one extension. */
8597 enum aarch64_parse_opt_result ext_res
8598 = aarch64_parse_extension (ext, &isa_temp);
8600 if (ext_res != AARCH64_PARSE_OK)
8601 return ext_res;
8603 /* Extension parsing was successful. Confirm the result
8604 arch and ISA flags. */
8605 *res = arch;
8606 *isa_flags = isa_temp;
8607 return AARCH64_PARSE_OK;
8611 /* ARCH name not found in list. */
8612 return AARCH64_PARSE_INVALID_ARG;
8615 /* Parse the TO_PARSE string and put the result tuning in RES and the
8616 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8617 describing the parse result. If there is an error parsing, RES and
8618 ISA_FLAGS are left unchanged. */
8620 static enum aarch64_parse_opt_result
8621 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8622 unsigned long *isa_flags)
8624 char *ext;
8625 const struct processor *cpu;
8626 char *str = (char *) alloca (strlen (to_parse) + 1);
8627 size_t len;
8629 strcpy (str, to_parse);
8631 ext = strchr (str, '+');
8633 if (ext != NULL)
8634 len = ext - str;
8635 else
8636 len = strlen (str);
8638 if (len == 0)
8639 return AARCH64_PARSE_MISSING_ARG;
8642 /* Loop through the list of supported CPUs to find a match. */
8643 for (cpu = all_cores; cpu->name != NULL; cpu++)
8645 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8647 unsigned long isa_temp = cpu->flags;
8650 if (ext != NULL)
8652 /* TO_PARSE string contains at least one extension. */
8653 enum aarch64_parse_opt_result ext_res
8654 = aarch64_parse_extension (ext, &isa_temp);
8656 if (ext_res != AARCH64_PARSE_OK)
8657 return ext_res;
8659 /* Extension parsing was successfull. Confirm the result
8660 cpu and ISA flags. */
8661 *res = cpu;
8662 *isa_flags = isa_temp;
8663 return AARCH64_PARSE_OK;
8667 /* CPU name not found in list. */
8668 return AARCH64_PARSE_INVALID_ARG;
8671 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8672 Return an aarch64_parse_opt_result describing the parse result.
8673 If the parsing fails the RES does not change. */
8675 static enum aarch64_parse_opt_result
8676 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8678 const struct processor *cpu;
8679 char *str = (char *) alloca (strlen (to_parse) + 1);
8681 strcpy (str, to_parse);
8683 /* Loop through the list of supported CPUs to find a match. */
8684 for (cpu = all_cores; cpu->name != NULL; cpu++)
8686 if (strcmp (cpu->name, str) == 0)
8688 *res = cpu;
8689 return AARCH64_PARSE_OK;
8693 /* CPU name not found in list. */
8694 return AARCH64_PARSE_INVALID_ARG;
8697 /* Parse TOKEN, which has length LENGTH to see if it is an option
8698 described in FLAG. If it is, return the index bit for that fusion type.
8699 If not, error (printing OPTION_NAME) and return zero. */
8701 static unsigned int
8702 aarch64_parse_one_option_token (const char *token,
8703 size_t length,
8704 const struct aarch64_flag_desc *flag,
8705 const char *option_name)
8707 for (; flag->name != NULL; flag++)
8709 if (length == strlen (flag->name)
8710 && !strncmp (flag->name, token, length))
8711 return flag->flag;
8714 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8715 return 0;
8718 /* Parse OPTION which is a comma-separated list of flags to enable.
8719 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8720 default state we inherit from the CPU tuning structures. OPTION_NAME
8721 gives the top-level option we are parsing in the -moverride string,
8722 for use in error messages. */
8724 static unsigned int
8725 aarch64_parse_boolean_options (const char *option,
8726 const struct aarch64_flag_desc *flags,
8727 unsigned int initial_state,
8728 const char *option_name)
8730 const char separator = '.';
8731 const char* specs = option;
8732 const char* ntoken = option;
8733 unsigned int found_flags = initial_state;
8735 while ((ntoken = strchr (specs, separator)))
8737 size_t token_length = ntoken - specs;
8738 unsigned token_ops = aarch64_parse_one_option_token (specs,
8739 token_length,
8740 flags,
8741 option_name);
8742 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8743 in the token stream, reset the supported operations. So:
8745 adrp+add.cmp+branch.none.adrp+add
8747 would have the result of turning on only adrp+add fusion. */
8748 if (!token_ops)
8749 found_flags = 0;
8751 found_flags |= token_ops;
8752 specs = ++ntoken;
8755 /* We ended with a comma, print something. */
8756 if (!(*specs))
8758 error ("%s string ill-formed\n", option_name);
8759 return 0;
8762 /* We still have one more token to parse. */
8763 size_t token_length = strlen (specs);
8764 unsigned token_ops = aarch64_parse_one_option_token (specs,
8765 token_length,
8766 flags,
8767 option_name);
8768 if (!token_ops)
8769 found_flags = 0;
8771 found_flags |= token_ops;
8772 return found_flags;
8775 /* Support for overriding instruction fusion. */
8777 static void
8778 aarch64_parse_fuse_string (const char *fuse_string,
8779 struct tune_params *tune)
8781 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8782 aarch64_fusible_pairs,
8783 tune->fusible_ops,
8784 "fuse=");
8787 /* Support for overriding other tuning flags. */
8789 static void
8790 aarch64_parse_tune_string (const char *tune_string,
8791 struct tune_params *tune)
8793 tune->extra_tuning_flags
8794 = aarch64_parse_boolean_options (tune_string,
8795 aarch64_tuning_flags,
8796 tune->extra_tuning_flags,
8797 "tune=");
8800 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8801 we understand. If it is, extract the option string and handoff to
8802 the appropriate function. */
8804 void
8805 aarch64_parse_one_override_token (const char* token,
8806 size_t length,
8807 struct tune_params *tune)
8809 const struct aarch64_tuning_override_function *fn
8810 = aarch64_tuning_override_functions;
8812 const char *option_part = strchr (token, '=');
8813 if (!option_part)
8815 error ("tuning string missing in option (%s)", token);
8816 return;
8819 /* Get the length of the option name. */
8820 length = option_part - token;
8821 /* Skip the '=' to get to the option string. */
8822 option_part++;
8824 for (; fn->name != NULL; fn++)
8826 if (!strncmp (fn->name, token, length))
8828 fn->parse_override (option_part, tune);
8829 return;
8833 error ("unknown tuning option (%s)",token);
8834 return;
8837 /* A checking mechanism for the implementation of the tls size. */
8839 static void
8840 initialize_aarch64_tls_size (struct gcc_options *opts)
8842 if (aarch64_tls_size == 0)
8843 aarch64_tls_size = 24;
8845 switch (opts->x_aarch64_cmodel_var)
8847 case AARCH64_CMODEL_TINY:
8848 /* Both the default and maximum TLS size allowed under tiny is 1M which
8849 needs two instructions to address, so we clamp the size to 24. */
8850 if (aarch64_tls_size > 24)
8851 aarch64_tls_size = 24;
8852 break;
8853 case AARCH64_CMODEL_SMALL:
8854 /* The maximum TLS size allowed under small is 4G. */
8855 if (aarch64_tls_size > 32)
8856 aarch64_tls_size = 32;
8857 break;
8858 case AARCH64_CMODEL_LARGE:
8859 /* The maximum TLS size allowed under large is 16E.
8860 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8861 if (aarch64_tls_size > 48)
8862 aarch64_tls_size = 48;
8863 break;
8864 default:
8865 gcc_unreachable ();
8868 return;
8871 /* Parse STRING looking for options in the format:
8872 string :: option:string
8873 option :: name=substring
8874 name :: {a-z}
8875 substring :: defined by option. */
8877 static void
8878 aarch64_parse_override_string (const char* input_string,
8879 struct tune_params* tune)
8881 const char separator = ':';
8882 size_t string_length = strlen (input_string) + 1;
8883 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8884 char *string = string_root;
8885 strncpy (string, input_string, string_length);
8886 string[string_length - 1] = '\0';
8888 char* ntoken = string;
8890 while ((ntoken = strchr (string, separator)))
8892 size_t token_length = ntoken - string;
8893 /* Make this substring look like a string. */
8894 *ntoken = '\0';
8895 aarch64_parse_one_override_token (string, token_length, tune);
8896 string = ++ntoken;
8899 /* One last option to parse. */
8900 aarch64_parse_one_override_token (string, strlen (string), tune);
8901 free (string_root);
8905 static void
8906 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8908 /* The logic here is that if we are disabling all frame pointer generation
8909 then we do not need to disable leaf frame pointer generation as a
8910 separate operation. But if we are *only* disabling leaf frame pointer
8911 generation then we set flag_omit_frame_pointer to true, but in
8912 aarch64_frame_pointer_required we return false only for leaf functions.
8914 PR 70044: We have to be careful about being called multiple times for the
8915 same function. Once we have decided to set flag_omit_frame_pointer just
8916 so that we can omit leaf frame pointers, we must then not interpret a
8917 second call as meaning that all frame pointer generation should be
8918 omitted. We do this by setting flag_omit_frame_pointer to a special,
8919 non-zero value. */
8920 if (opts->x_flag_omit_frame_pointer == 2)
8921 opts->x_flag_omit_frame_pointer = 0;
8923 if (opts->x_flag_omit_frame_pointer)
8924 opts->x_flag_omit_leaf_frame_pointer = false;
8925 else if (opts->x_flag_omit_leaf_frame_pointer)
8926 opts->x_flag_omit_frame_pointer = 2;
8928 /* If not optimizing for size, set the default
8929 alignment to what the target wants. */
8930 if (!opts->x_optimize_size)
8932 if (opts->x_align_loops <= 0)
8933 opts->x_align_loops = aarch64_tune_params.loop_align;
8934 if (opts->x_align_jumps <= 0)
8935 opts->x_align_jumps = aarch64_tune_params.jump_align;
8936 if (opts->x_align_functions <= 0)
8937 opts->x_align_functions = aarch64_tune_params.function_align;
8940 /* We default to no pc-relative literal loads. */
8942 aarch64_pcrelative_literal_loads = false;
8944 /* If -mpc-relative-literal-loads is set on the command line, this
8945 implies that the user asked for PC relative literal loads. */
8946 if (opts->x_pcrelative_literal_loads == 1)
8947 aarch64_pcrelative_literal_loads = true;
8949 /* This is PR70113. When building the Linux kernel with
8950 CONFIG_ARM64_ERRATUM_843419, support for relocations
8951 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8952 removed from the kernel to avoid loading objects with possibly
8953 offending sequences. Without -mpc-relative-literal-loads we would
8954 generate such relocations, preventing the kernel build from
8955 succeeding. */
8956 if (opts->x_pcrelative_literal_loads == 2
8957 && TARGET_FIX_ERR_A53_843419)
8958 aarch64_pcrelative_literal_loads = true;
8960 /* In the tiny memory model it makes no sense to disallow PC relative
8961 literal pool loads. */
8962 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8963 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8964 aarch64_pcrelative_literal_loads = true;
8966 /* When enabling the lower precision Newton series for the square root, also
8967 enable it for the reciprocal square root, since the latter is an
8968 intermediary step for the former. */
8969 if (flag_mlow_precision_sqrt)
8970 flag_mrecip_low_precision_sqrt = true;
8973 /* 'Unpack' up the internal tuning structs and update the options
8974 in OPTS. The caller must have set up selected_tune and selected_arch
8975 as all the other target-specific codegen decisions are
8976 derived from them. */
8978 void
8979 aarch64_override_options_internal (struct gcc_options *opts)
8981 aarch64_tune_flags = selected_tune->flags;
8982 aarch64_tune = selected_tune->sched_core;
8983 /* Make a copy of the tuning parameters attached to the core, which
8984 we may later overwrite. */
8985 aarch64_tune_params = *(selected_tune->tune);
8986 aarch64_architecture_version = selected_arch->architecture_version;
8988 if (opts->x_aarch64_override_tune_string)
8989 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8990 &aarch64_tune_params);
8992 /* This target defaults to strict volatile bitfields. */
8993 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8994 opts->x_flag_strict_volatile_bitfields = 1;
8996 initialize_aarch64_code_model (opts);
8997 initialize_aarch64_tls_size (opts);
8999 int queue_depth = 0;
9000 switch (aarch64_tune_params.autoprefetcher_model)
9002 case tune_params::AUTOPREFETCHER_OFF:
9003 queue_depth = -1;
9004 break;
9005 case tune_params::AUTOPREFETCHER_WEAK:
9006 queue_depth = 0;
9007 break;
9008 case tune_params::AUTOPREFETCHER_STRONG:
9009 queue_depth = max_insn_queue_index + 1;
9010 break;
9011 default:
9012 gcc_unreachable ();
9015 /* We don't mind passing in global_options_set here as we don't use
9016 the *options_set structs anyway. */
9017 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9018 queue_depth,
9019 opts->x_param_values,
9020 global_options_set.x_param_values);
9022 /* Set up parameters to be used in prefetching algorithm. Do not
9023 override the defaults unless we are tuning for a core we have
9024 researched values for. */
9025 if (aarch64_tune_params.prefetch->num_slots > 0)
9026 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9027 aarch64_tune_params.prefetch->num_slots,
9028 opts->x_param_values,
9029 global_options_set.x_param_values);
9030 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9031 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9032 aarch64_tune_params.prefetch->l1_cache_size,
9033 opts->x_param_values,
9034 global_options_set.x_param_values);
9035 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9036 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9037 aarch64_tune_params.prefetch->l1_cache_line_size,
9038 opts->x_param_values,
9039 global_options_set.x_param_values);
9040 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9041 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9042 aarch64_tune_params.prefetch->l2_cache_size,
9043 opts->x_param_values,
9044 global_options_set.x_param_values);
9046 /* Enable sw prefetching at specified optimization level for
9047 CPUS that have prefetch. Lower optimization level threshold by 1
9048 when profiling is enabled. */
9049 if (opts->x_flag_prefetch_loop_arrays < 0
9050 && !opts->x_optimize_size
9051 && aarch64_tune_params.prefetch->default_opt_level >= 0
9052 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9053 opts->x_flag_prefetch_loop_arrays = 1;
9055 aarch64_override_options_after_change_1 (opts);
9058 /* Print a hint with a suggestion for a core or architecture name that
9059 most closely resembles what the user passed in STR. ARCH is true if
9060 the user is asking for an architecture name. ARCH is false if the user
9061 is asking for a core name. */
9063 static void
9064 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9066 auto_vec<const char *> candidates;
9067 const struct processor *entry = arch ? all_architectures : all_cores;
9068 for (; entry->name != NULL; entry++)
9069 candidates.safe_push (entry->name);
9070 char *s;
9071 const char *hint = candidates_list_and_hint (str, s, candidates);
9072 if (hint)
9073 inform (input_location, "valid arguments are: %s;"
9074 " did you mean %qs?", s, hint);
9075 XDELETEVEC (s);
9078 /* Print a hint with a suggestion for a core name that most closely resembles
9079 what the user passed in STR. */
9081 inline static void
9082 aarch64_print_hint_for_core (const char *str)
9084 aarch64_print_hint_for_core_or_arch (str, false);
9087 /* Print a hint with a suggestion for an architecture name that most closely
9088 resembles what the user passed in STR. */
9090 inline static void
9091 aarch64_print_hint_for_arch (const char *str)
9093 aarch64_print_hint_for_core_or_arch (str, true);
9096 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9097 specified in STR and throw errors if appropriate. Put the results if
9098 they are valid in RES and ISA_FLAGS. Return whether the option is
9099 valid. */
9101 static bool
9102 aarch64_validate_mcpu (const char *str, const struct processor **res,
9103 unsigned long *isa_flags)
9105 enum aarch64_parse_opt_result parse_res
9106 = aarch64_parse_cpu (str, res, isa_flags);
9108 if (parse_res == AARCH64_PARSE_OK)
9109 return true;
9111 switch (parse_res)
9113 case AARCH64_PARSE_MISSING_ARG:
9114 error ("missing cpu name in %<-mcpu=%s%>", str);
9115 break;
9116 case AARCH64_PARSE_INVALID_ARG:
9117 error ("unknown value %qs for -mcpu", str);
9118 aarch64_print_hint_for_core (str);
9119 break;
9120 case AARCH64_PARSE_INVALID_FEATURE:
9121 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9122 break;
9123 default:
9124 gcc_unreachable ();
9127 return false;
9130 /* Validate a command-line -march option. Parse the arch and extensions
9131 (if any) specified in STR and throw errors if appropriate. Put the
9132 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9133 option is valid. */
9135 static bool
9136 aarch64_validate_march (const char *str, const struct processor **res,
9137 unsigned long *isa_flags)
9139 enum aarch64_parse_opt_result parse_res
9140 = aarch64_parse_arch (str, res, isa_flags);
9142 if (parse_res == AARCH64_PARSE_OK)
9143 return true;
9145 switch (parse_res)
9147 case AARCH64_PARSE_MISSING_ARG:
9148 error ("missing arch name in %<-march=%s%>", str);
9149 break;
9150 case AARCH64_PARSE_INVALID_ARG:
9151 error ("unknown value %qs for -march", str);
9152 aarch64_print_hint_for_arch (str);
9153 break;
9154 case AARCH64_PARSE_INVALID_FEATURE:
9155 error ("invalid feature modifier in %<-march=%s%>", str);
9156 break;
9157 default:
9158 gcc_unreachable ();
9161 return false;
9164 /* Validate a command-line -mtune option. Parse the cpu
9165 specified in STR and throw errors if appropriate. Put the
9166 result, if it is valid, in RES. Return whether the option is
9167 valid. */
9169 static bool
9170 aarch64_validate_mtune (const char *str, const struct processor **res)
9172 enum aarch64_parse_opt_result parse_res
9173 = aarch64_parse_tune (str, res);
9175 if (parse_res == AARCH64_PARSE_OK)
9176 return true;
9178 switch (parse_res)
9180 case AARCH64_PARSE_MISSING_ARG:
9181 error ("missing cpu name in %<-mtune=%s%>", str);
9182 break;
9183 case AARCH64_PARSE_INVALID_ARG:
9184 error ("unknown value %qs for -mtune", str);
9185 aarch64_print_hint_for_core (str);
9186 break;
9187 default:
9188 gcc_unreachable ();
9190 return false;
9193 /* Return the CPU corresponding to the enum CPU.
9194 If it doesn't specify a cpu, return the default. */
9196 static const struct processor *
9197 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9199 if (cpu != aarch64_none)
9200 return &all_cores[cpu];
9202 /* The & 0x3f is to extract the bottom 6 bits that encode the
9203 default cpu as selected by the --with-cpu GCC configure option
9204 in config.gcc.
9205 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9206 flags mechanism should be reworked to make it more sane. */
9207 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9210 /* Return the architecture corresponding to the enum ARCH.
9211 If it doesn't specify a valid architecture, return the default. */
9213 static const struct processor *
9214 aarch64_get_arch (enum aarch64_arch arch)
9216 if (arch != aarch64_no_arch)
9217 return &all_architectures[arch];
9219 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9221 return &all_architectures[cpu->arch];
9224 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9225 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9226 tuning structs. In particular it must set selected_tune and
9227 aarch64_isa_flags that define the available ISA features and tuning
9228 decisions. It must also set selected_arch as this will be used to
9229 output the .arch asm tags for each function. */
9231 static void
9232 aarch64_override_options (void)
9234 unsigned long cpu_isa = 0;
9235 unsigned long arch_isa = 0;
9236 aarch64_isa_flags = 0;
9238 bool valid_cpu = true;
9239 bool valid_tune = true;
9240 bool valid_arch = true;
9242 selected_cpu = NULL;
9243 selected_arch = NULL;
9244 selected_tune = NULL;
9246 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9247 If either of -march or -mtune is given, they override their
9248 respective component of -mcpu. */
9249 if (aarch64_cpu_string)
9250 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9251 &cpu_isa);
9253 if (aarch64_arch_string)
9254 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9255 &arch_isa);
9257 if (aarch64_tune_string)
9258 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9260 /* If the user did not specify a processor, choose the default
9261 one for them. This will be the CPU set during configuration using
9262 --with-cpu, otherwise it is "generic". */
9263 if (!selected_cpu)
9265 if (selected_arch)
9267 selected_cpu = &all_cores[selected_arch->ident];
9268 aarch64_isa_flags = arch_isa;
9269 explicit_arch = selected_arch->arch;
9271 else
9273 /* Get default configure-time CPU. */
9274 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9275 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9278 if (selected_tune)
9279 explicit_tune_core = selected_tune->ident;
9281 /* If both -mcpu and -march are specified check that they are architecturally
9282 compatible, warn if they're not and prefer the -march ISA flags. */
9283 else if (selected_arch)
9285 if (selected_arch->arch != selected_cpu->arch)
9287 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9288 all_architectures[selected_cpu->arch].name,
9289 selected_arch->name);
9291 aarch64_isa_flags = arch_isa;
9292 explicit_arch = selected_arch->arch;
9293 explicit_tune_core = selected_tune ? selected_tune->ident
9294 : selected_cpu->ident;
9296 else
9298 /* -mcpu but no -march. */
9299 aarch64_isa_flags = cpu_isa;
9300 explicit_tune_core = selected_tune ? selected_tune->ident
9301 : selected_cpu->ident;
9302 gcc_assert (selected_cpu);
9303 selected_arch = &all_architectures[selected_cpu->arch];
9304 explicit_arch = selected_arch->arch;
9307 /* Set the arch as well as we will need it when outputing
9308 the .arch directive in assembly. */
9309 if (!selected_arch)
9311 gcc_assert (selected_cpu);
9312 selected_arch = &all_architectures[selected_cpu->arch];
9315 if (!selected_tune)
9316 selected_tune = selected_cpu;
9318 #ifndef HAVE_AS_MABI_OPTION
9319 /* The compiler may have been configured with 2.23.* binutils, which does
9320 not have support for ILP32. */
9321 if (TARGET_ILP32)
9322 error ("Assembler does not support -mabi=ilp32");
9323 #endif
9325 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9326 sorry ("Return address signing is only supported for -mabi=lp64");
9328 /* Make sure we properly set up the explicit options. */
9329 if ((aarch64_cpu_string && valid_cpu)
9330 || (aarch64_tune_string && valid_tune))
9331 gcc_assert (explicit_tune_core != aarch64_none);
9333 if ((aarch64_cpu_string && valid_cpu)
9334 || (aarch64_arch_string && valid_arch))
9335 gcc_assert (explicit_arch != aarch64_no_arch);
9337 aarch64_override_options_internal (&global_options);
9339 /* Save these options as the default ones in case we push and pop them later
9340 while processing functions with potential target attributes. */
9341 target_option_default_node = target_option_current_node
9342 = build_target_option_node (&global_options);
9345 /* Implement targetm.override_options_after_change. */
9347 static void
9348 aarch64_override_options_after_change (void)
9350 aarch64_override_options_after_change_1 (&global_options);
9353 static struct machine_function *
9354 aarch64_init_machine_status (void)
9356 struct machine_function *machine;
9357 machine = ggc_cleared_alloc<machine_function> ();
9358 return machine;
9361 void
9362 aarch64_init_expanders (void)
9364 init_machine_status = aarch64_init_machine_status;
9367 /* A checking mechanism for the implementation of the various code models. */
9368 static void
9369 initialize_aarch64_code_model (struct gcc_options *opts)
9371 if (opts->x_flag_pic)
9373 switch (opts->x_aarch64_cmodel_var)
9375 case AARCH64_CMODEL_TINY:
9376 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9377 break;
9378 case AARCH64_CMODEL_SMALL:
9379 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9380 aarch64_cmodel = (flag_pic == 2
9381 ? AARCH64_CMODEL_SMALL_PIC
9382 : AARCH64_CMODEL_SMALL_SPIC);
9383 #else
9384 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9385 #endif
9386 break;
9387 case AARCH64_CMODEL_LARGE:
9388 sorry ("code model %qs with -f%s", "large",
9389 opts->x_flag_pic > 1 ? "PIC" : "pic");
9390 break;
9391 default:
9392 gcc_unreachable ();
9395 else
9396 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9399 /* Implement TARGET_OPTION_SAVE. */
9401 static void
9402 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9404 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9407 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9408 using the information saved in PTR. */
9410 static void
9411 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9413 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9414 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9415 opts->x_explicit_arch = ptr->x_explicit_arch;
9416 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9417 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9419 aarch64_override_options_internal (opts);
9422 /* Implement TARGET_OPTION_PRINT. */
9424 static void
9425 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9427 const struct processor *cpu
9428 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9429 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9430 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9431 std::string extension
9432 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9434 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9435 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9436 arch->name, extension.c_str ());
9439 static GTY(()) tree aarch64_previous_fndecl;
9441 void
9442 aarch64_reset_previous_fndecl (void)
9444 aarch64_previous_fndecl = NULL;
9447 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9448 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9449 make sure optab availability predicates are recomputed when necessary. */
9451 void
9452 aarch64_save_restore_target_globals (tree new_tree)
9454 if (TREE_TARGET_GLOBALS (new_tree))
9455 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9456 else if (new_tree == target_option_default_node)
9457 restore_target_globals (&default_target_globals);
9458 else
9459 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9462 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9463 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9464 of the function, if such exists. This function may be called multiple
9465 times on a single function so use aarch64_previous_fndecl to avoid
9466 setting up identical state. */
9468 static void
9469 aarch64_set_current_function (tree fndecl)
9471 if (!fndecl || fndecl == aarch64_previous_fndecl)
9472 return;
9474 tree old_tree = (aarch64_previous_fndecl
9475 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9476 : NULL_TREE);
9478 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9480 /* If current function has no attributes but the previous one did,
9481 use the default node. */
9482 if (!new_tree && old_tree)
9483 new_tree = target_option_default_node;
9485 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9486 the default have been handled by aarch64_save_restore_target_globals from
9487 aarch64_pragma_target_parse. */
9488 if (old_tree == new_tree)
9489 return;
9491 aarch64_previous_fndecl = fndecl;
9493 /* First set the target options. */
9494 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9496 aarch64_save_restore_target_globals (new_tree);
9499 /* Enum describing the various ways we can handle attributes.
9500 In many cases we can reuse the generic option handling machinery. */
9502 enum aarch64_attr_opt_type
9504 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9505 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9506 aarch64_attr_enum, /* Attribute sets an enum variable. */
9507 aarch64_attr_custom /* Attribute requires a custom handling function. */
9510 /* All the information needed to handle a target attribute.
9511 NAME is the name of the attribute.
9512 ATTR_TYPE specifies the type of behavior of the attribute as described
9513 in the definition of enum aarch64_attr_opt_type.
9514 ALLOW_NEG is true if the attribute supports a "no-" form.
9515 HANDLER is the function that takes the attribute string and whether
9516 it is a pragma or attribute and handles the option. It is needed only
9517 when the ATTR_TYPE is aarch64_attr_custom.
9518 OPT_NUM is the enum specifying the option that the attribute modifies.
9519 This is needed for attributes that mirror the behavior of a command-line
9520 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9521 aarch64_attr_enum. */
9523 struct aarch64_attribute_info
9525 const char *name;
9526 enum aarch64_attr_opt_type attr_type;
9527 bool allow_neg;
9528 bool (*handler) (const char *, const char *);
9529 enum opt_code opt_num;
9532 /* Handle the ARCH_STR argument to the arch= target attribute.
9533 PRAGMA_OR_ATTR is used in potential error messages. */
9535 static bool
9536 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9538 const struct processor *tmp_arch = NULL;
9539 enum aarch64_parse_opt_result parse_res
9540 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9542 if (parse_res == AARCH64_PARSE_OK)
9544 gcc_assert (tmp_arch);
9545 selected_arch = tmp_arch;
9546 explicit_arch = selected_arch->arch;
9547 return true;
9550 switch (parse_res)
9552 case AARCH64_PARSE_MISSING_ARG:
9553 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9554 break;
9555 case AARCH64_PARSE_INVALID_ARG:
9556 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9557 aarch64_print_hint_for_arch (str);
9558 break;
9559 case AARCH64_PARSE_INVALID_FEATURE:
9560 error ("invalid feature modifier %qs for 'arch' target %s",
9561 str, pragma_or_attr);
9562 break;
9563 default:
9564 gcc_unreachable ();
9567 return false;
9570 /* Handle the argument CPU_STR to the cpu= target attribute.
9571 PRAGMA_OR_ATTR is used in potential error messages. */
9573 static bool
9574 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9576 const struct processor *tmp_cpu = NULL;
9577 enum aarch64_parse_opt_result parse_res
9578 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9580 if (parse_res == AARCH64_PARSE_OK)
9582 gcc_assert (tmp_cpu);
9583 selected_tune = tmp_cpu;
9584 explicit_tune_core = selected_tune->ident;
9586 selected_arch = &all_architectures[tmp_cpu->arch];
9587 explicit_arch = selected_arch->arch;
9588 return true;
9591 switch (parse_res)
9593 case AARCH64_PARSE_MISSING_ARG:
9594 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9595 break;
9596 case AARCH64_PARSE_INVALID_ARG:
9597 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9598 aarch64_print_hint_for_core (str);
9599 break;
9600 case AARCH64_PARSE_INVALID_FEATURE:
9601 error ("invalid feature modifier %qs for 'cpu' target %s",
9602 str, pragma_or_attr);
9603 break;
9604 default:
9605 gcc_unreachable ();
9608 return false;
9611 /* Handle the argument STR to the tune= target attribute.
9612 PRAGMA_OR_ATTR is used in potential error messages. */
9614 static bool
9615 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9617 const struct processor *tmp_tune = NULL;
9618 enum aarch64_parse_opt_result parse_res
9619 = aarch64_parse_tune (str, &tmp_tune);
9621 if (parse_res == AARCH64_PARSE_OK)
9623 gcc_assert (tmp_tune);
9624 selected_tune = tmp_tune;
9625 explicit_tune_core = selected_tune->ident;
9626 return true;
9629 switch (parse_res)
9631 case AARCH64_PARSE_INVALID_ARG:
9632 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9633 aarch64_print_hint_for_core (str);
9634 break;
9635 default:
9636 gcc_unreachable ();
9639 return false;
9642 /* Parse an architecture extensions target attribute string specified in STR.
9643 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9644 if successful. Update aarch64_isa_flags to reflect the ISA features
9645 modified.
9646 PRAGMA_OR_ATTR is used in potential error messages. */
9648 static bool
9649 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9651 enum aarch64_parse_opt_result parse_res;
9652 unsigned long isa_flags = aarch64_isa_flags;
9654 /* We allow "+nothing" in the beginning to clear out all architectural
9655 features if the user wants to handpick specific features. */
9656 if (strncmp ("+nothing", str, 8) == 0)
9658 isa_flags = 0;
9659 str += 8;
9662 parse_res = aarch64_parse_extension (str, &isa_flags);
9664 if (parse_res == AARCH64_PARSE_OK)
9666 aarch64_isa_flags = isa_flags;
9667 return true;
9670 switch (parse_res)
9672 case AARCH64_PARSE_MISSING_ARG:
9673 error ("missing feature modifier in target %s %qs",
9674 pragma_or_attr, str);
9675 break;
9677 case AARCH64_PARSE_INVALID_FEATURE:
9678 error ("invalid feature modifier in target %s %qs",
9679 pragma_or_attr, str);
9680 break;
9682 default:
9683 gcc_unreachable ();
9686 return false;
9689 /* The target attributes that we support. On top of these we also support just
9690 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9691 handled explicitly in aarch64_process_one_target_attr. */
9693 static const struct aarch64_attribute_info aarch64_attributes[] =
9695 { "general-regs-only", aarch64_attr_mask, false, NULL,
9696 OPT_mgeneral_regs_only },
9697 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9698 OPT_mfix_cortex_a53_835769 },
9699 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9700 OPT_mfix_cortex_a53_843419 },
9701 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9702 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9703 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9704 OPT_momit_leaf_frame_pointer },
9705 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9706 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9707 OPT_march_ },
9708 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9709 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9710 OPT_mtune_ },
9711 { "sign-return-address", aarch64_attr_enum, false, NULL,
9712 OPT_msign_return_address_ },
9713 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9716 /* Parse ARG_STR which contains the definition of one target attribute.
9717 Show appropriate errors if any or return true if the attribute is valid.
9718 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9719 we're processing a target attribute or pragma. */
9721 static bool
9722 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9724 bool invert = false;
9726 size_t len = strlen (arg_str);
9728 if (len == 0)
9730 error ("malformed target %s", pragma_or_attr);
9731 return false;
9734 char *str_to_check = (char *) alloca (len + 1);
9735 strcpy (str_to_check, arg_str);
9737 /* Skip leading whitespace. */
9738 while (*str_to_check == ' ' || *str_to_check == '\t')
9739 str_to_check++;
9741 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9742 It is easier to detect and handle it explicitly here rather than going
9743 through the machinery for the rest of the target attributes in this
9744 function. */
9745 if (*str_to_check == '+')
9746 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9748 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9750 invert = true;
9751 str_to_check += 3;
9753 char *arg = strchr (str_to_check, '=');
9755 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9756 and point ARG to "foo". */
9757 if (arg)
9759 *arg = '\0';
9760 arg++;
9762 const struct aarch64_attribute_info *p_attr;
9763 bool found = false;
9764 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9766 /* If the names don't match up, or the user has given an argument
9767 to an attribute that doesn't accept one, or didn't give an argument
9768 to an attribute that expects one, fail to match. */
9769 if (strcmp (str_to_check, p_attr->name) != 0)
9770 continue;
9772 found = true;
9773 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9774 || p_attr->attr_type == aarch64_attr_enum;
9776 if (attr_need_arg_p ^ (arg != NULL))
9778 error ("target %s %qs does not accept an argument",
9779 pragma_or_attr, str_to_check);
9780 return false;
9783 /* If the name matches but the attribute does not allow "no-" versions
9784 then we can't match. */
9785 if (invert && !p_attr->allow_neg)
9787 error ("target %s %qs does not allow a negated form",
9788 pragma_or_attr, str_to_check);
9789 return false;
9792 switch (p_attr->attr_type)
9794 /* Has a custom handler registered.
9795 For example, cpu=, arch=, tune=. */
9796 case aarch64_attr_custom:
9797 gcc_assert (p_attr->handler);
9798 if (!p_attr->handler (arg, pragma_or_attr))
9799 return false;
9800 break;
9802 /* Either set or unset a boolean option. */
9803 case aarch64_attr_bool:
9805 struct cl_decoded_option decoded;
9807 generate_option (p_attr->opt_num, NULL, !invert,
9808 CL_TARGET, &decoded);
9809 aarch64_handle_option (&global_options, &global_options_set,
9810 &decoded, input_location);
9811 break;
9813 /* Set or unset a bit in the target_flags. aarch64_handle_option
9814 should know what mask to apply given the option number. */
9815 case aarch64_attr_mask:
9817 struct cl_decoded_option decoded;
9818 /* We only need to specify the option number.
9819 aarch64_handle_option will know which mask to apply. */
9820 decoded.opt_index = p_attr->opt_num;
9821 decoded.value = !invert;
9822 aarch64_handle_option (&global_options, &global_options_set,
9823 &decoded, input_location);
9824 break;
9826 /* Use the option setting machinery to set an option to an enum. */
9827 case aarch64_attr_enum:
9829 gcc_assert (arg);
9830 bool valid;
9831 int value;
9832 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9833 &value, CL_TARGET);
9834 if (valid)
9836 set_option (&global_options, NULL, p_attr->opt_num, value,
9837 NULL, DK_UNSPECIFIED, input_location,
9838 global_dc);
9840 else
9842 error ("target %s %s=%s is not valid",
9843 pragma_or_attr, str_to_check, arg);
9845 break;
9847 default:
9848 gcc_unreachable ();
9852 /* If we reached here we either have found an attribute and validated
9853 it or didn't match any. If we matched an attribute but its arguments
9854 were malformed we will have returned false already. */
9855 return found;
9858 /* Count how many times the character C appears in
9859 NULL-terminated string STR. */
9861 static unsigned int
9862 num_occurences_in_str (char c, char *str)
9864 unsigned int res = 0;
9865 while (*str != '\0')
9867 if (*str == c)
9868 res++;
9870 str++;
9873 return res;
9876 /* Parse the tree in ARGS that contains the target attribute information
9877 and update the global target options space. PRAGMA_OR_ATTR is a string
9878 to be used in error messages, specifying whether this is processing
9879 a target attribute or a target pragma. */
9881 bool
9882 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9884 if (TREE_CODE (args) == TREE_LIST)
9888 tree head = TREE_VALUE (args);
9889 if (head)
9891 if (!aarch64_process_target_attr (head, pragma_or_attr))
9892 return false;
9894 args = TREE_CHAIN (args);
9895 } while (args);
9897 return true;
9900 if (TREE_CODE (args) != STRING_CST)
9902 error ("attribute %<target%> argument not a string");
9903 return false;
9906 size_t len = strlen (TREE_STRING_POINTER (args));
9907 char *str_to_check = (char *) alloca (len + 1);
9908 strcpy (str_to_check, TREE_STRING_POINTER (args));
9910 if (len == 0)
9912 error ("malformed target %s value", pragma_or_attr);
9913 return false;
9916 /* Used to catch empty spaces between commas i.e.
9917 attribute ((target ("attr1,,attr2"))). */
9918 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9920 /* Handle multiple target attributes separated by ','. */
9921 char *token = strtok (str_to_check, ",");
9923 unsigned int num_attrs = 0;
9924 while (token)
9926 num_attrs++;
9927 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9929 error ("target %s %qs is invalid", pragma_or_attr, token);
9930 return false;
9933 token = strtok (NULL, ",");
9936 if (num_attrs != num_commas + 1)
9938 error ("malformed target %s list %qs",
9939 pragma_or_attr, TREE_STRING_POINTER (args));
9940 return false;
9943 return true;
9946 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9947 process attribute ((target ("..."))). */
9949 static bool
9950 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9952 struct cl_target_option cur_target;
9953 bool ret;
9954 tree old_optimize;
9955 tree new_target, new_optimize;
9956 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9958 /* If what we're processing is the current pragma string then the
9959 target option node is already stored in target_option_current_node
9960 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9961 having to re-parse the string. This is especially useful to keep
9962 arm_neon.h compile times down since that header contains a lot
9963 of intrinsics enclosed in pragmas. */
9964 if (!existing_target && args == current_target_pragma)
9966 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9967 return true;
9969 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9971 old_optimize = build_optimization_node (&global_options);
9972 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9974 /* If the function changed the optimization levels as well as setting
9975 target options, start with the optimizations specified. */
9976 if (func_optimize && func_optimize != old_optimize)
9977 cl_optimization_restore (&global_options,
9978 TREE_OPTIMIZATION (func_optimize));
9980 /* Save the current target options to restore at the end. */
9981 cl_target_option_save (&cur_target, &global_options);
9983 /* If fndecl already has some target attributes applied to it, unpack
9984 them so that we add this attribute on top of them, rather than
9985 overwriting them. */
9986 if (existing_target)
9988 struct cl_target_option *existing_options
9989 = TREE_TARGET_OPTION (existing_target);
9991 if (existing_options)
9992 cl_target_option_restore (&global_options, existing_options);
9994 else
9995 cl_target_option_restore (&global_options,
9996 TREE_TARGET_OPTION (target_option_current_node));
9999 ret = aarch64_process_target_attr (args, "attribute");
10001 /* Set up any additional state. */
10002 if (ret)
10004 aarch64_override_options_internal (&global_options);
10005 /* Initialize SIMD builtins if we haven't already.
10006 Set current_target_pragma to NULL for the duration so that
10007 the builtin initialization code doesn't try to tag the functions
10008 being built with the attributes specified by any current pragma, thus
10009 going into an infinite recursion. */
10010 if (TARGET_SIMD)
10012 tree saved_current_target_pragma = current_target_pragma;
10013 current_target_pragma = NULL;
10014 aarch64_init_simd_builtins ();
10015 current_target_pragma = saved_current_target_pragma;
10017 new_target = build_target_option_node (&global_options);
10019 else
10020 new_target = NULL;
10022 new_optimize = build_optimization_node (&global_options);
10024 if (fndecl && ret)
10026 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10028 if (old_optimize != new_optimize)
10029 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10032 cl_target_option_restore (&global_options, &cur_target);
10034 if (old_optimize != new_optimize)
10035 cl_optimization_restore (&global_options,
10036 TREE_OPTIMIZATION (old_optimize));
10037 return ret;
10040 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10041 tri-bool options (yes, no, don't care) and the default value is
10042 DEF, determine whether to reject inlining. */
10044 static bool
10045 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10046 int dont_care, int def)
10048 /* If the callee doesn't care, always allow inlining. */
10049 if (callee == dont_care)
10050 return true;
10052 /* If the caller doesn't care, always allow inlining. */
10053 if (caller == dont_care)
10054 return true;
10056 /* Otherwise, allow inlining if either the callee and caller values
10057 agree, or if the callee is using the default value. */
10058 return (callee == caller || callee == def);
10061 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10062 to inline CALLEE into CALLER based on target-specific info.
10063 Make sure that the caller and callee have compatible architectural
10064 features. Then go through the other possible target attributes
10065 and see if they can block inlining. Try not to reject always_inline
10066 callees unless they are incompatible architecturally. */
10068 static bool
10069 aarch64_can_inline_p (tree caller, tree callee)
10071 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10072 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10074 /* If callee has no option attributes, then it is ok to inline. */
10075 if (!callee_tree)
10076 return true;
10078 struct cl_target_option *caller_opts
10079 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10080 : target_option_default_node);
10082 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10085 /* Callee's ISA flags should be a subset of the caller's. */
10086 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10087 != callee_opts->x_aarch64_isa_flags)
10088 return false;
10090 /* Allow non-strict aligned functions inlining into strict
10091 aligned ones. */
10092 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10093 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10094 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10095 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10096 return false;
10098 bool always_inline = lookup_attribute ("always_inline",
10099 DECL_ATTRIBUTES (callee));
10101 /* If the architectural features match up and the callee is always_inline
10102 then the other attributes don't matter. */
10103 if (always_inline)
10104 return true;
10106 if (caller_opts->x_aarch64_cmodel_var
10107 != callee_opts->x_aarch64_cmodel_var)
10108 return false;
10110 if (caller_opts->x_aarch64_tls_dialect
10111 != callee_opts->x_aarch64_tls_dialect)
10112 return false;
10114 /* Honour explicit requests to workaround errata. */
10115 if (!aarch64_tribools_ok_for_inlining_p (
10116 caller_opts->x_aarch64_fix_a53_err835769,
10117 callee_opts->x_aarch64_fix_a53_err835769,
10118 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10119 return false;
10121 if (!aarch64_tribools_ok_for_inlining_p (
10122 caller_opts->x_aarch64_fix_a53_err843419,
10123 callee_opts->x_aarch64_fix_a53_err843419,
10124 2, TARGET_FIX_ERR_A53_843419))
10125 return false;
10127 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10128 caller and calle and they don't match up, reject inlining. */
10129 if (!aarch64_tribools_ok_for_inlining_p (
10130 caller_opts->x_flag_omit_leaf_frame_pointer,
10131 callee_opts->x_flag_omit_leaf_frame_pointer,
10132 2, 1))
10133 return false;
10135 /* If the callee has specific tuning overrides, respect them. */
10136 if (callee_opts->x_aarch64_override_tune_string != NULL
10137 && caller_opts->x_aarch64_override_tune_string == NULL)
10138 return false;
10140 /* If the user specified tuning override strings for the
10141 caller and callee and they don't match up, reject inlining.
10142 We just do a string compare here, we don't analyze the meaning
10143 of the string, as it would be too costly for little gain. */
10144 if (callee_opts->x_aarch64_override_tune_string
10145 && caller_opts->x_aarch64_override_tune_string
10146 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10147 caller_opts->x_aarch64_override_tune_string) != 0))
10148 return false;
10150 return true;
10153 /* Return true if SYMBOL_REF X binds locally. */
10155 static bool
10156 aarch64_symbol_binds_local_p (const_rtx x)
10158 return (SYMBOL_REF_DECL (x)
10159 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10160 : SYMBOL_REF_LOCAL_P (x));
10163 /* Return true if SYMBOL_REF X is thread local */
10164 static bool
10165 aarch64_tls_symbol_p (rtx x)
10167 if (! TARGET_HAVE_TLS)
10168 return false;
10170 if (GET_CODE (x) != SYMBOL_REF)
10171 return false;
10173 return SYMBOL_REF_TLS_MODEL (x) != 0;
10176 /* Classify a TLS symbol into one of the TLS kinds. */
10177 enum aarch64_symbol_type
10178 aarch64_classify_tls_symbol (rtx x)
10180 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10182 switch (tls_kind)
10184 case TLS_MODEL_GLOBAL_DYNAMIC:
10185 case TLS_MODEL_LOCAL_DYNAMIC:
10186 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10188 case TLS_MODEL_INITIAL_EXEC:
10189 switch (aarch64_cmodel)
10191 case AARCH64_CMODEL_TINY:
10192 case AARCH64_CMODEL_TINY_PIC:
10193 return SYMBOL_TINY_TLSIE;
10194 default:
10195 return SYMBOL_SMALL_TLSIE;
10198 case TLS_MODEL_LOCAL_EXEC:
10199 if (aarch64_tls_size == 12)
10200 return SYMBOL_TLSLE12;
10201 else if (aarch64_tls_size == 24)
10202 return SYMBOL_TLSLE24;
10203 else if (aarch64_tls_size == 32)
10204 return SYMBOL_TLSLE32;
10205 else if (aarch64_tls_size == 48)
10206 return SYMBOL_TLSLE48;
10207 else
10208 gcc_unreachable ();
10210 case TLS_MODEL_EMULATED:
10211 case TLS_MODEL_NONE:
10212 return SYMBOL_FORCE_TO_MEM;
10214 default:
10215 gcc_unreachable ();
10219 /* Return the method that should be used to access SYMBOL_REF or
10220 LABEL_REF X. */
10222 enum aarch64_symbol_type
10223 aarch64_classify_symbol (rtx x, rtx offset)
10225 if (GET_CODE (x) == LABEL_REF)
10227 switch (aarch64_cmodel)
10229 case AARCH64_CMODEL_LARGE:
10230 return SYMBOL_FORCE_TO_MEM;
10232 case AARCH64_CMODEL_TINY_PIC:
10233 case AARCH64_CMODEL_TINY:
10234 return SYMBOL_TINY_ABSOLUTE;
10236 case AARCH64_CMODEL_SMALL_SPIC:
10237 case AARCH64_CMODEL_SMALL_PIC:
10238 case AARCH64_CMODEL_SMALL:
10239 return SYMBOL_SMALL_ABSOLUTE;
10241 default:
10242 gcc_unreachable ();
10246 if (GET_CODE (x) == SYMBOL_REF)
10248 if (aarch64_tls_symbol_p (x))
10249 return aarch64_classify_tls_symbol (x);
10251 switch (aarch64_cmodel)
10253 case AARCH64_CMODEL_TINY:
10254 /* When we retrieve symbol + offset address, we have to make sure
10255 the offset does not cause overflow of the final address. But
10256 we have no way of knowing the address of symbol at compile time
10257 so we can't accurately say if the distance between the PC and
10258 symbol + offset is outside the addressible range of +/-1M in the
10259 TINY code model. So we rely on images not being greater than
10260 1M and cap the offset at 1M and anything beyond 1M will have to
10261 be loaded using an alternative mechanism. Furthermore if the
10262 symbol is a weak reference to something that isn't known to
10263 resolve to a symbol in this module, then force to memory. */
10264 if ((SYMBOL_REF_WEAK (x)
10265 && !aarch64_symbol_binds_local_p (x))
10266 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10267 return SYMBOL_FORCE_TO_MEM;
10268 return SYMBOL_TINY_ABSOLUTE;
10270 case AARCH64_CMODEL_SMALL:
10271 /* Same reasoning as the tiny code model, but the offset cap here is
10272 4G. */
10273 if ((SYMBOL_REF_WEAK (x)
10274 && !aarch64_symbol_binds_local_p (x))
10275 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10276 HOST_WIDE_INT_C (4294967264)))
10277 return SYMBOL_FORCE_TO_MEM;
10278 return SYMBOL_SMALL_ABSOLUTE;
10280 case AARCH64_CMODEL_TINY_PIC:
10281 if (!aarch64_symbol_binds_local_p (x))
10282 return SYMBOL_TINY_GOT;
10283 return SYMBOL_TINY_ABSOLUTE;
10285 case AARCH64_CMODEL_SMALL_SPIC:
10286 case AARCH64_CMODEL_SMALL_PIC:
10287 if (!aarch64_symbol_binds_local_p (x))
10288 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10289 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10290 return SYMBOL_SMALL_ABSOLUTE;
10292 case AARCH64_CMODEL_LARGE:
10293 /* This is alright even in PIC code as the constant
10294 pool reference is always PC relative and within
10295 the same translation unit. */
10296 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10297 return SYMBOL_SMALL_ABSOLUTE;
10298 else
10299 return SYMBOL_FORCE_TO_MEM;
10301 default:
10302 gcc_unreachable ();
10306 /* By default push everything into the constant pool. */
10307 return SYMBOL_FORCE_TO_MEM;
10310 bool
10311 aarch64_constant_address_p (rtx x)
10313 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10316 bool
10317 aarch64_legitimate_pic_operand_p (rtx x)
10319 if (GET_CODE (x) == SYMBOL_REF
10320 || (GET_CODE (x) == CONST
10321 && GET_CODE (XEXP (x, 0)) == PLUS
10322 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10323 return false;
10325 return true;
10328 /* Return true if X holds either a quarter-precision or
10329 floating-point +0.0 constant. */
10330 static bool
10331 aarch64_valid_floating_const (rtx x)
10333 if (!CONST_DOUBLE_P (x))
10334 return false;
10336 /* This call determines which constants can be used in mov<mode>
10337 as integer moves instead of constant loads. */
10338 if (aarch64_float_const_rtx_p (x))
10339 return true;
10341 return aarch64_float_const_representable_p (x);
10344 static bool
10345 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10347 /* Do not allow vector struct mode constants. We could support
10348 0 and -1 easily, but they need support in aarch64-simd.md. */
10349 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10350 return false;
10352 /* For these cases we never want to use a literal load.
10353 As such we have to prevent the compiler from forcing these
10354 to memory. */
10355 if ((GET_CODE (x) == CONST_VECTOR
10356 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10357 || CONST_INT_P (x)
10358 || aarch64_valid_floating_const (x)
10359 || aarch64_can_const_movi_rtx_p (x, mode)
10360 || aarch64_float_const_rtx_p (x))
10361 return !targetm.cannot_force_const_mem (mode, x);
10363 if (GET_CODE (x) == HIGH
10364 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10365 return true;
10367 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10368 so spilling them is better than rematerialization. */
10369 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10370 return true;
10372 return aarch64_constant_address_p (x);
10376 aarch64_load_tp (rtx target)
10378 if (!target
10379 || GET_MODE (target) != Pmode
10380 || !register_operand (target, Pmode))
10381 target = gen_reg_rtx (Pmode);
10383 /* Can return in any reg. */
10384 emit_insn (gen_aarch64_load_tp_hard (target));
10385 return target;
10388 /* On AAPCS systems, this is the "struct __va_list". */
10389 static GTY(()) tree va_list_type;
10391 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10392 Return the type to use as __builtin_va_list.
10394 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10396 struct __va_list
10398 void *__stack;
10399 void *__gr_top;
10400 void *__vr_top;
10401 int __gr_offs;
10402 int __vr_offs;
10403 }; */
10405 static tree
10406 aarch64_build_builtin_va_list (void)
10408 tree va_list_name;
10409 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10411 /* Create the type. */
10412 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10413 /* Give it the required name. */
10414 va_list_name = build_decl (BUILTINS_LOCATION,
10415 TYPE_DECL,
10416 get_identifier ("__va_list"),
10417 va_list_type);
10418 DECL_ARTIFICIAL (va_list_name) = 1;
10419 TYPE_NAME (va_list_type) = va_list_name;
10420 TYPE_STUB_DECL (va_list_type) = va_list_name;
10422 /* Create the fields. */
10423 f_stack = build_decl (BUILTINS_LOCATION,
10424 FIELD_DECL, get_identifier ("__stack"),
10425 ptr_type_node);
10426 f_grtop = build_decl (BUILTINS_LOCATION,
10427 FIELD_DECL, get_identifier ("__gr_top"),
10428 ptr_type_node);
10429 f_vrtop = build_decl (BUILTINS_LOCATION,
10430 FIELD_DECL, get_identifier ("__vr_top"),
10431 ptr_type_node);
10432 f_groff = build_decl (BUILTINS_LOCATION,
10433 FIELD_DECL, get_identifier ("__gr_offs"),
10434 integer_type_node);
10435 f_vroff = build_decl (BUILTINS_LOCATION,
10436 FIELD_DECL, get_identifier ("__vr_offs"),
10437 integer_type_node);
10439 /* Tell tree-stdarg pass about our internal offset fields.
10440 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10441 purpose to identify whether the code is updating va_list internal
10442 offset fields through irregular way. */
10443 va_list_gpr_counter_field = f_groff;
10444 va_list_fpr_counter_field = f_vroff;
10446 DECL_ARTIFICIAL (f_stack) = 1;
10447 DECL_ARTIFICIAL (f_grtop) = 1;
10448 DECL_ARTIFICIAL (f_vrtop) = 1;
10449 DECL_ARTIFICIAL (f_groff) = 1;
10450 DECL_ARTIFICIAL (f_vroff) = 1;
10452 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10453 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10454 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10455 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10456 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10458 TYPE_FIELDS (va_list_type) = f_stack;
10459 DECL_CHAIN (f_stack) = f_grtop;
10460 DECL_CHAIN (f_grtop) = f_vrtop;
10461 DECL_CHAIN (f_vrtop) = f_groff;
10462 DECL_CHAIN (f_groff) = f_vroff;
10464 /* Compute its layout. */
10465 layout_type (va_list_type);
10467 return va_list_type;
10470 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10471 static void
10472 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10474 const CUMULATIVE_ARGS *cum;
10475 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10476 tree stack, grtop, vrtop, groff, vroff;
10477 tree t;
10478 int gr_save_area_size = cfun->va_list_gpr_size;
10479 int vr_save_area_size = cfun->va_list_fpr_size;
10480 int vr_offset;
10482 cum = &crtl->args.info;
10483 if (cfun->va_list_gpr_size)
10484 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10485 cfun->va_list_gpr_size);
10486 if (cfun->va_list_fpr_size)
10487 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10488 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10490 if (!TARGET_FLOAT)
10492 gcc_assert (cum->aapcs_nvrn == 0);
10493 vr_save_area_size = 0;
10496 f_stack = TYPE_FIELDS (va_list_type_node);
10497 f_grtop = DECL_CHAIN (f_stack);
10498 f_vrtop = DECL_CHAIN (f_grtop);
10499 f_groff = DECL_CHAIN (f_vrtop);
10500 f_vroff = DECL_CHAIN (f_groff);
10502 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10503 NULL_TREE);
10504 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10505 NULL_TREE);
10506 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10507 NULL_TREE);
10508 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10509 NULL_TREE);
10510 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10511 NULL_TREE);
10513 /* Emit code to initialize STACK, which points to the next varargs stack
10514 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10515 by named arguments. STACK is 8-byte aligned. */
10516 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10517 if (cum->aapcs_stack_size > 0)
10518 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10519 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10520 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10522 /* Emit code to initialize GRTOP, the top of the GR save area.
10523 virtual_incoming_args_rtx should have been 16 byte aligned. */
10524 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10525 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10526 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10528 /* Emit code to initialize VRTOP, the top of the VR save area.
10529 This address is gr_save_area_bytes below GRTOP, rounded
10530 down to the next 16-byte boundary. */
10531 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10532 vr_offset = ROUND_UP (gr_save_area_size,
10533 STACK_BOUNDARY / BITS_PER_UNIT);
10535 if (vr_offset)
10536 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10537 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10538 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10540 /* Emit code to initialize GROFF, the offset from GRTOP of the
10541 next GPR argument. */
10542 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10543 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10544 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10546 /* Likewise emit code to initialize VROFF, the offset from FTOP
10547 of the next VR argument. */
10548 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10549 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10550 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10553 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10555 static tree
10556 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10557 gimple_seq *post_p ATTRIBUTE_UNUSED)
10559 tree addr;
10560 bool indirect_p;
10561 bool is_ha; /* is HFA or HVA. */
10562 bool dw_align; /* double-word align. */
10563 machine_mode ag_mode = VOIDmode;
10564 int nregs;
10565 machine_mode mode;
10567 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10568 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10569 HOST_WIDE_INT size, rsize, adjust, align;
10570 tree t, u, cond1, cond2;
10572 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10573 if (indirect_p)
10574 type = build_pointer_type (type);
10576 mode = TYPE_MODE (type);
10578 f_stack = TYPE_FIELDS (va_list_type_node);
10579 f_grtop = DECL_CHAIN (f_stack);
10580 f_vrtop = DECL_CHAIN (f_grtop);
10581 f_groff = DECL_CHAIN (f_vrtop);
10582 f_vroff = DECL_CHAIN (f_groff);
10584 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10585 f_stack, NULL_TREE);
10586 size = int_size_in_bytes (type);
10587 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10589 dw_align = false;
10590 adjust = 0;
10591 if (aarch64_vfp_is_call_or_return_candidate (mode,
10592 type,
10593 &ag_mode,
10594 &nregs,
10595 &is_ha))
10597 /* TYPE passed in fp/simd registers. */
10598 if (!TARGET_FLOAT)
10599 aarch64_err_no_fpadvsimd (mode, "varargs");
10601 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10602 unshare_expr (valist), f_vrtop, NULL_TREE);
10603 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10604 unshare_expr (valist), f_vroff, NULL_TREE);
10606 rsize = nregs * UNITS_PER_VREG;
10608 if (is_ha)
10610 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10611 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10613 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10614 && size < UNITS_PER_VREG)
10616 adjust = UNITS_PER_VREG - size;
10619 else
10621 /* TYPE passed in general registers. */
10622 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10623 unshare_expr (valist), f_grtop, NULL_TREE);
10624 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10625 unshare_expr (valist), f_groff, NULL_TREE);
10626 rsize = ROUND_UP (size, UNITS_PER_WORD);
10627 nregs = rsize / UNITS_PER_WORD;
10629 if (align > 8)
10630 dw_align = true;
10632 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10633 && size < UNITS_PER_WORD)
10635 adjust = UNITS_PER_WORD - size;
10639 /* Get a local temporary for the field value. */
10640 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10642 /* Emit code to branch if off >= 0. */
10643 t = build2 (GE_EXPR, boolean_type_node, off,
10644 build_int_cst (TREE_TYPE (off), 0));
10645 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10647 if (dw_align)
10649 /* Emit: offs = (offs + 15) & -16. */
10650 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10651 build_int_cst (TREE_TYPE (off), 15));
10652 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10653 build_int_cst (TREE_TYPE (off), -16));
10654 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10656 else
10657 roundup = NULL;
10659 /* Update ap.__[g|v]r_offs */
10660 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10661 build_int_cst (TREE_TYPE (off), rsize));
10662 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10664 /* String up. */
10665 if (roundup)
10666 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10668 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10669 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10670 build_int_cst (TREE_TYPE (f_off), 0));
10671 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10673 /* String up: make sure the assignment happens before the use. */
10674 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10675 COND_EXPR_ELSE (cond1) = t;
10677 /* Prepare the trees handling the argument that is passed on the stack;
10678 the top level node will store in ON_STACK. */
10679 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10680 if (align > 8)
10682 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10683 t = fold_convert (intDI_type_node, arg);
10684 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10685 build_int_cst (TREE_TYPE (t), 15));
10686 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10687 build_int_cst (TREE_TYPE (t), -16));
10688 t = fold_convert (TREE_TYPE (arg), t);
10689 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10691 else
10692 roundup = NULL;
10693 /* Advance ap.__stack */
10694 t = fold_convert (intDI_type_node, arg);
10695 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10696 build_int_cst (TREE_TYPE (t), size + 7));
10697 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10698 build_int_cst (TREE_TYPE (t), -8));
10699 t = fold_convert (TREE_TYPE (arg), t);
10700 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10701 /* String up roundup and advance. */
10702 if (roundup)
10703 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10704 /* String up with arg */
10705 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10706 /* Big-endianness related address adjustment. */
10707 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10708 && size < UNITS_PER_WORD)
10710 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10711 size_int (UNITS_PER_WORD - size));
10712 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10715 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10716 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10718 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10719 t = off;
10720 if (adjust)
10721 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10722 build_int_cst (TREE_TYPE (off), adjust));
10724 t = fold_convert (sizetype, t);
10725 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10727 if (is_ha)
10729 /* type ha; // treat as "struct {ftype field[n];}"
10730 ... [computing offs]
10731 for (i = 0; i <nregs; ++i, offs += 16)
10732 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10733 return ha; */
10734 int i;
10735 tree tmp_ha, field_t, field_ptr_t;
10737 /* Declare a local variable. */
10738 tmp_ha = create_tmp_var_raw (type, "ha");
10739 gimple_add_tmp_var (tmp_ha);
10741 /* Establish the base type. */
10742 switch (ag_mode)
10744 case SFmode:
10745 field_t = float_type_node;
10746 field_ptr_t = float_ptr_type_node;
10747 break;
10748 case DFmode:
10749 field_t = double_type_node;
10750 field_ptr_t = double_ptr_type_node;
10751 break;
10752 case TFmode:
10753 field_t = long_double_type_node;
10754 field_ptr_t = long_double_ptr_type_node;
10755 break;
10756 case HFmode:
10757 field_t = aarch64_fp16_type_node;
10758 field_ptr_t = aarch64_fp16_ptr_type_node;
10759 break;
10760 case V2SImode:
10761 case V4SImode:
10763 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10764 field_t = build_vector_type_for_mode (innertype, ag_mode);
10765 field_ptr_t = build_pointer_type (field_t);
10767 break;
10768 default:
10769 gcc_assert (0);
10772 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10773 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10774 addr = t;
10775 t = fold_convert (field_ptr_t, addr);
10776 t = build2 (MODIFY_EXPR, field_t,
10777 build1 (INDIRECT_REF, field_t, tmp_ha),
10778 build1 (INDIRECT_REF, field_t, t));
10780 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10781 for (i = 1; i < nregs; ++i)
10783 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10784 u = fold_convert (field_ptr_t, addr);
10785 u = build2 (MODIFY_EXPR, field_t,
10786 build2 (MEM_REF, field_t, tmp_ha,
10787 build_int_cst (field_ptr_t,
10788 (i *
10789 int_size_in_bytes (field_t)))),
10790 build1 (INDIRECT_REF, field_t, u));
10791 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10794 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10795 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10798 COND_EXPR_ELSE (cond2) = t;
10799 addr = fold_convert (build_pointer_type (type), cond1);
10800 addr = build_va_arg_indirect_ref (addr);
10802 if (indirect_p)
10803 addr = build_va_arg_indirect_ref (addr);
10805 return addr;
10808 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10810 static void
10811 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10812 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10813 int no_rtl)
10815 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10816 CUMULATIVE_ARGS local_cum;
10817 int gr_saved = cfun->va_list_gpr_size;
10818 int vr_saved = cfun->va_list_fpr_size;
10820 /* The caller has advanced CUM up to, but not beyond, the last named
10821 argument. Advance a local copy of CUM past the last "real" named
10822 argument, to find out how many registers are left over. */
10823 local_cum = *cum;
10824 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10826 /* Found out how many registers we need to save.
10827 Honor tree-stdvar analysis results. */
10828 if (cfun->va_list_gpr_size)
10829 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10830 cfun->va_list_gpr_size / UNITS_PER_WORD);
10831 if (cfun->va_list_fpr_size)
10832 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10833 cfun->va_list_fpr_size / UNITS_PER_VREG);
10835 if (!TARGET_FLOAT)
10837 gcc_assert (local_cum.aapcs_nvrn == 0);
10838 vr_saved = 0;
10841 if (!no_rtl)
10843 if (gr_saved > 0)
10845 rtx ptr, mem;
10847 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10848 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10849 - gr_saved * UNITS_PER_WORD);
10850 mem = gen_frame_mem (BLKmode, ptr);
10851 set_mem_alias_set (mem, get_varargs_alias_set ());
10853 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10854 mem, gr_saved);
10856 if (vr_saved > 0)
10858 /* We can't use move_block_from_reg, because it will use
10859 the wrong mode, storing D regs only. */
10860 machine_mode mode = TImode;
10861 int off, i, vr_start;
10863 /* Set OFF to the offset from virtual_incoming_args_rtx of
10864 the first vector register. The VR save area lies below
10865 the GR one, and is aligned to 16 bytes. */
10866 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10867 STACK_BOUNDARY / BITS_PER_UNIT);
10868 off -= vr_saved * UNITS_PER_VREG;
10870 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10871 for (i = 0; i < vr_saved; ++i)
10873 rtx ptr, mem;
10875 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10876 mem = gen_frame_mem (mode, ptr);
10877 set_mem_alias_set (mem, get_varargs_alias_set ());
10878 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10879 off += UNITS_PER_VREG;
10884 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10885 any complication of having crtl->args.pretend_args_size changed. */
10886 cfun->machine->frame.saved_varargs_size
10887 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10888 STACK_BOUNDARY / BITS_PER_UNIT)
10889 + vr_saved * UNITS_PER_VREG);
10892 static void
10893 aarch64_conditional_register_usage (void)
10895 int i;
10896 if (!TARGET_FLOAT)
10898 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10900 fixed_regs[i] = 1;
10901 call_used_regs[i] = 1;
10906 /* Walk down the type tree of TYPE counting consecutive base elements.
10907 If *MODEP is VOIDmode, then set it to the first valid floating point
10908 type. If a non-floating point type is found, or if a floating point
10909 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10910 otherwise return the count in the sub-tree. */
10911 static int
10912 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10914 machine_mode mode;
10915 HOST_WIDE_INT size;
10917 switch (TREE_CODE (type))
10919 case REAL_TYPE:
10920 mode = TYPE_MODE (type);
10921 if (mode != DFmode && mode != SFmode
10922 && mode != TFmode && mode != HFmode)
10923 return -1;
10925 if (*modep == VOIDmode)
10926 *modep = mode;
10928 if (*modep == mode)
10929 return 1;
10931 break;
10933 case COMPLEX_TYPE:
10934 mode = TYPE_MODE (TREE_TYPE (type));
10935 if (mode != DFmode && mode != SFmode
10936 && mode != TFmode && mode != HFmode)
10937 return -1;
10939 if (*modep == VOIDmode)
10940 *modep = mode;
10942 if (*modep == mode)
10943 return 2;
10945 break;
10947 case VECTOR_TYPE:
10948 /* Use V2SImode and V4SImode as representatives of all 64-bit
10949 and 128-bit vector types. */
10950 size = int_size_in_bytes (type);
10951 switch (size)
10953 case 8:
10954 mode = V2SImode;
10955 break;
10956 case 16:
10957 mode = V4SImode;
10958 break;
10959 default:
10960 return -1;
10963 if (*modep == VOIDmode)
10964 *modep = mode;
10966 /* Vector modes are considered to be opaque: two vectors are
10967 equivalent for the purposes of being homogeneous aggregates
10968 if they are the same size. */
10969 if (*modep == mode)
10970 return 1;
10972 break;
10974 case ARRAY_TYPE:
10976 int count;
10977 tree index = TYPE_DOMAIN (type);
10979 /* Can't handle incomplete types nor sizes that are not
10980 fixed. */
10981 if (!COMPLETE_TYPE_P (type)
10982 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10983 return -1;
10985 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10986 if (count == -1
10987 || !index
10988 || !TYPE_MAX_VALUE (index)
10989 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10990 || !TYPE_MIN_VALUE (index)
10991 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10992 || count < 0)
10993 return -1;
10995 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10996 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10998 /* There must be no padding. */
10999 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11000 return -1;
11002 return count;
11005 case RECORD_TYPE:
11007 int count = 0;
11008 int sub_count;
11009 tree field;
11011 /* Can't handle incomplete types nor sizes that are not
11012 fixed. */
11013 if (!COMPLETE_TYPE_P (type)
11014 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11015 return -1;
11017 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11019 if (TREE_CODE (field) != FIELD_DECL)
11020 continue;
11022 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11023 if (sub_count < 0)
11024 return -1;
11025 count += sub_count;
11028 /* There must be no padding. */
11029 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11030 return -1;
11032 return count;
11035 case UNION_TYPE:
11036 case QUAL_UNION_TYPE:
11038 /* These aren't very interesting except in a degenerate case. */
11039 int count = 0;
11040 int sub_count;
11041 tree field;
11043 /* Can't handle incomplete types nor sizes that are not
11044 fixed. */
11045 if (!COMPLETE_TYPE_P (type)
11046 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11047 return -1;
11049 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11051 if (TREE_CODE (field) != FIELD_DECL)
11052 continue;
11054 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11055 if (sub_count < 0)
11056 return -1;
11057 count = count > sub_count ? count : sub_count;
11060 /* There must be no padding. */
11061 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11062 return -1;
11064 return count;
11067 default:
11068 break;
11071 return -1;
11074 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11075 type as described in AAPCS64 \S 4.1.2.
11077 See the comment above aarch64_composite_type_p for the notes on MODE. */
11079 static bool
11080 aarch64_short_vector_p (const_tree type,
11081 machine_mode mode)
11083 HOST_WIDE_INT size = -1;
11085 if (type && TREE_CODE (type) == VECTOR_TYPE)
11086 size = int_size_in_bytes (type);
11087 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11088 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11089 size = GET_MODE_SIZE (mode);
11091 return (size == 8 || size == 16);
11094 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11095 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11096 array types. The C99 floating-point complex types are also considered
11097 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11098 types, which are GCC extensions and out of the scope of AAPCS64, are
11099 treated as composite types here as well.
11101 Note that MODE itself is not sufficient in determining whether a type
11102 is such a composite type or not. This is because
11103 stor-layout.c:compute_record_mode may have already changed the MODE
11104 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11105 structure with only one field may have its MODE set to the mode of the
11106 field. Also an integer mode whose size matches the size of the
11107 RECORD_TYPE type may be used to substitute the original mode
11108 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11109 solely relied on. */
11111 static bool
11112 aarch64_composite_type_p (const_tree type,
11113 machine_mode mode)
11115 if (aarch64_short_vector_p (type, mode))
11116 return false;
11118 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11119 return true;
11121 if (mode == BLKmode
11122 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11123 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11124 return true;
11126 return false;
11129 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11130 shall be passed or returned in simd/fp register(s) (providing these
11131 parameter passing registers are available).
11133 Upon successful return, *COUNT returns the number of needed registers,
11134 *BASE_MODE returns the mode of the individual register and when IS_HAF
11135 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11136 floating-point aggregate or a homogeneous short-vector aggregate. */
11138 static bool
11139 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11140 const_tree type,
11141 machine_mode *base_mode,
11142 int *count,
11143 bool *is_ha)
11145 machine_mode new_mode = VOIDmode;
11146 bool composite_p = aarch64_composite_type_p (type, mode);
11148 if (is_ha != NULL) *is_ha = false;
11150 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11151 || aarch64_short_vector_p (type, mode))
11153 *count = 1;
11154 new_mode = mode;
11156 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11158 if (is_ha != NULL) *is_ha = true;
11159 *count = 2;
11160 new_mode = GET_MODE_INNER (mode);
11162 else if (type && composite_p)
11164 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11166 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11168 if (is_ha != NULL) *is_ha = true;
11169 *count = ag_count;
11171 else
11172 return false;
11174 else
11175 return false;
11177 *base_mode = new_mode;
11178 return true;
11181 /* Implement TARGET_STRUCT_VALUE_RTX. */
11183 static rtx
11184 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11185 int incoming ATTRIBUTE_UNUSED)
11187 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11190 /* Implements target hook vector_mode_supported_p. */
11191 static bool
11192 aarch64_vector_mode_supported_p (machine_mode mode)
11194 if (TARGET_SIMD
11195 && (mode == V4SImode || mode == V8HImode
11196 || mode == V16QImode || mode == V2DImode
11197 || mode == V2SImode || mode == V4HImode
11198 || mode == V8QImode || mode == V2SFmode
11199 || mode == V4SFmode || mode == V2DFmode
11200 || mode == V4HFmode || mode == V8HFmode
11201 || mode == V1DFmode))
11202 return true;
11204 return false;
11207 /* Return appropriate SIMD container
11208 for MODE within a vector of WIDTH bits. */
11209 static machine_mode
11210 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11212 gcc_assert (width == 64 || width == 128);
11213 if (TARGET_SIMD)
11215 if (width == 128)
11216 switch (mode)
11218 case DFmode:
11219 return V2DFmode;
11220 case SFmode:
11221 return V4SFmode;
11222 case HFmode:
11223 return V8HFmode;
11224 case SImode:
11225 return V4SImode;
11226 case HImode:
11227 return V8HImode;
11228 case QImode:
11229 return V16QImode;
11230 case DImode:
11231 return V2DImode;
11232 default:
11233 break;
11235 else
11236 switch (mode)
11238 case SFmode:
11239 return V2SFmode;
11240 case HFmode:
11241 return V4HFmode;
11242 case SImode:
11243 return V2SImode;
11244 case HImode:
11245 return V4HImode;
11246 case QImode:
11247 return V8QImode;
11248 default:
11249 break;
11252 return word_mode;
11255 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11256 static machine_mode
11257 aarch64_preferred_simd_mode (machine_mode mode)
11259 return aarch64_simd_container_mode (mode, 128);
11262 /* Return the bitmask of possible vector sizes for the vectorizer
11263 to iterate over. */
11264 static unsigned int
11265 aarch64_autovectorize_vector_sizes (void)
11267 return (16 | 8);
11270 /* Implement TARGET_MANGLE_TYPE. */
11272 static const char *
11273 aarch64_mangle_type (const_tree type)
11275 /* The AArch64 ABI documents say that "__va_list" has to be
11276 managled as if it is in the "std" namespace. */
11277 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11278 return "St9__va_list";
11280 /* Half-precision float. */
11281 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11282 return "Dh";
11284 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11285 builtin types. */
11286 if (TYPE_NAME (type) != NULL)
11287 return aarch64_mangle_builtin_type (type);
11289 /* Use the default mangling. */
11290 return NULL;
11293 /* Find the first rtx_insn before insn that will generate an assembly
11294 instruction. */
11296 static rtx_insn *
11297 aarch64_prev_real_insn (rtx_insn *insn)
11299 if (!insn)
11300 return NULL;
11304 insn = prev_real_insn (insn);
11306 while (insn && recog_memoized (insn) < 0);
11308 return insn;
11311 static bool
11312 is_madd_op (enum attr_type t1)
11314 unsigned int i;
11315 /* A number of these may be AArch32 only. */
11316 enum attr_type mlatypes[] = {
11317 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11318 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11319 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11322 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11324 if (t1 == mlatypes[i])
11325 return true;
11328 return false;
11331 /* Check if there is a register dependency between a load and the insn
11332 for which we hold recog_data. */
11334 static bool
11335 dep_between_memop_and_curr (rtx memop)
11337 rtx load_reg;
11338 int opno;
11340 gcc_assert (GET_CODE (memop) == SET);
11342 if (!REG_P (SET_DEST (memop)))
11343 return false;
11345 load_reg = SET_DEST (memop);
11346 for (opno = 1; opno < recog_data.n_operands; opno++)
11348 rtx operand = recog_data.operand[opno];
11349 if (REG_P (operand)
11350 && reg_overlap_mentioned_p (load_reg, operand))
11351 return true;
11354 return false;
11358 /* When working around the Cortex-A53 erratum 835769,
11359 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11360 instruction and has a preceding memory instruction such that a NOP
11361 should be inserted between them. */
11363 bool
11364 aarch64_madd_needs_nop (rtx_insn* insn)
11366 enum attr_type attr_type;
11367 rtx_insn *prev;
11368 rtx body;
11370 if (!TARGET_FIX_ERR_A53_835769)
11371 return false;
11373 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11374 return false;
11376 attr_type = get_attr_type (insn);
11377 if (!is_madd_op (attr_type))
11378 return false;
11380 prev = aarch64_prev_real_insn (insn);
11381 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11382 Restore recog state to INSN to avoid state corruption. */
11383 extract_constrain_insn_cached (insn);
11385 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11386 return false;
11388 body = single_set (prev);
11390 /* If the previous insn is a memory op and there is no dependency between
11391 it and the DImode madd, emit a NOP between them. If body is NULL then we
11392 have a complex memory operation, probably a load/store pair.
11393 Be conservative for now and emit a NOP. */
11394 if (GET_MODE (recog_data.operand[0]) == DImode
11395 && (!body || !dep_between_memop_and_curr (body)))
11396 return true;
11398 return false;
11403 /* Implement FINAL_PRESCAN_INSN. */
11405 void
11406 aarch64_final_prescan_insn (rtx_insn *insn)
11408 if (aarch64_madd_needs_nop (insn))
11409 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11413 /* Return the equivalent letter for size. */
11414 static char
11415 sizetochar (int size)
11417 switch (size)
11419 case 64: return 'd';
11420 case 32: return 's';
11421 case 16: return 'h';
11422 case 8 : return 'b';
11423 default: gcc_unreachable ();
11427 /* Return true iff x is a uniform vector of floating-point
11428 constants, and the constant can be represented in
11429 quarter-precision form. Note, as aarch64_float_const_representable
11430 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11431 static bool
11432 aarch64_vect_float_const_representable_p (rtx x)
11434 rtx elt;
11435 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11436 && const_vec_duplicate_p (x, &elt)
11437 && aarch64_float_const_representable_p (elt));
11440 /* Return true for valid and false for invalid. */
11441 bool
11442 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11443 struct simd_immediate_info *info)
11445 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11446 matches = 1; \
11447 for (i = 0; i < idx; i += (STRIDE)) \
11448 if (!(TEST)) \
11449 matches = 0; \
11450 if (matches) \
11452 immtype = (CLASS); \
11453 elsize = (ELSIZE); \
11454 eshift = (SHIFT); \
11455 emvn = (NEG); \
11456 break; \
11459 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11460 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11461 unsigned char bytes[16];
11462 int immtype = -1, matches;
11463 unsigned int invmask = inverse ? 0xff : 0;
11464 int eshift, emvn;
11466 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11468 if (! (aarch64_simd_imm_zero_p (op, mode)
11469 || aarch64_vect_float_const_representable_p (op)))
11470 return false;
11472 if (info)
11474 info->value = CONST_VECTOR_ELT (op, 0);
11475 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11476 info->mvn = false;
11477 info->shift = 0;
11480 return true;
11483 /* Splat vector constant out into a byte vector. */
11484 for (i = 0; i < n_elts; i++)
11486 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11487 it must be laid out in the vector register in reverse order. */
11488 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11489 unsigned HOST_WIDE_INT elpart;
11491 gcc_assert (CONST_INT_P (el));
11492 elpart = INTVAL (el);
11494 for (unsigned int byte = 0; byte < innersize; byte++)
11496 bytes[idx++] = (elpart & 0xff) ^ invmask;
11497 elpart >>= BITS_PER_UNIT;
11502 /* Sanity check. */
11503 gcc_assert (idx == GET_MODE_SIZE (mode));
11507 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11508 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11510 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11511 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11513 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11514 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11516 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11517 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11519 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11521 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11523 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11524 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11526 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11527 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11529 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11530 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11532 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11533 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11535 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11537 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11539 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11540 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11542 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11543 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11545 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11546 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11548 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11549 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11551 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11553 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11554 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11556 while (0);
11558 if (immtype == -1)
11559 return false;
11561 if (info)
11563 info->element_width = elsize;
11564 info->mvn = emvn != 0;
11565 info->shift = eshift;
11567 unsigned HOST_WIDE_INT imm = 0;
11569 if (immtype >= 12 && immtype <= 15)
11570 info->msl = true;
11572 /* Un-invert bytes of recognized vector, if necessary. */
11573 if (invmask != 0)
11574 for (i = 0; i < idx; i++)
11575 bytes[i] ^= invmask;
11577 if (immtype == 17)
11579 /* FIXME: Broken on 32-bit H_W_I hosts. */
11580 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11582 for (i = 0; i < 8; i++)
11583 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11584 << (i * BITS_PER_UNIT);
11587 info->value = GEN_INT (imm);
11589 else
11591 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11592 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11594 /* Construct 'abcdefgh' because the assembler cannot handle
11595 generic constants. */
11596 if (info->mvn)
11597 imm = ~imm;
11598 imm = (imm >> info->shift) & 0xff;
11599 info->value = GEN_INT (imm);
11603 return true;
11604 #undef CHECK
11607 /* Check of immediate shift constants are within range. */
11608 bool
11609 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11611 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11612 if (left)
11613 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11614 else
11615 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11618 /* Return true if X is a uniform vector where all elements
11619 are either the floating-point constant 0.0 or the
11620 integer constant 0. */
11621 bool
11622 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11624 return x == CONST0_RTX (mode);
11628 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11629 operation of width WIDTH at bit position POS. */
11632 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11634 gcc_assert (CONST_INT_P (width));
11635 gcc_assert (CONST_INT_P (pos));
11637 unsigned HOST_WIDE_INT mask
11638 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11639 return GEN_INT (mask << UINTVAL (pos));
11642 bool
11643 aarch64_mov_operand_p (rtx x, machine_mode mode)
11645 if (GET_CODE (x) == HIGH
11646 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11647 return true;
11649 if (CONST_INT_P (x))
11650 return true;
11652 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11653 return true;
11655 return aarch64_classify_symbolic_expression (x)
11656 == SYMBOL_TINY_ABSOLUTE;
11659 /* Return a const_int vector of VAL. */
11661 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11663 int nunits = GET_MODE_NUNITS (mode);
11664 rtvec v = rtvec_alloc (nunits);
11665 int i;
11667 rtx cache = GEN_INT (val);
11669 for (i=0; i < nunits; i++)
11670 RTVEC_ELT (v, i) = cache;
11672 return gen_rtx_CONST_VECTOR (mode, v);
11675 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11677 bool
11678 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11680 machine_mode vmode;
11682 gcc_assert (!VECTOR_MODE_P (mode));
11683 vmode = aarch64_preferred_simd_mode (mode);
11684 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11685 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11688 /* Construct and return a PARALLEL RTX vector with elements numbering the
11689 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11690 the vector - from the perspective of the architecture. This does not
11691 line up with GCC's perspective on lane numbers, so we end up with
11692 different masks depending on our target endian-ness. The diagram
11693 below may help. We must draw the distinction when building masks
11694 which select one half of the vector. An instruction selecting
11695 architectural low-lanes for a big-endian target, must be described using
11696 a mask selecting GCC high-lanes.
11698 Big-Endian Little-Endian
11700 GCC 0 1 2 3 3 2 1 0
11701 | x | x | x | x | | x | x | x | x |
11702 Architecture 3 2 1 0 3 2 1 0
11704 Low Mask: { 2, 3 } { 0, 1 }
11705 High Mask: { 0, 1 } { 2, 3 }
11709 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11711 int nunits = GET_MODE_NUNITS (mode);
11712 rtvec v = rtvec_alloc (nunits / 2);
11713 int high_base = nunits / 2;
11714 int low_base = 0;
11715 int base;
11716 rtx t1;
11717 int i;
11719 if (BYTES_BIG_ENDIAN)
11720 base = high ? low_base : high_base;
11721 else
11722 base = high ? high_base : low_base;
11724 for (i = 0; i < nunits / 2; i++)
11725 RTVEC_ELT (v, i) = GEN_INT (base + i);
11727 t1 = gen_rtx_PARALLEL (mode, v);
11728 return t1;
11731 /* Check OP for validity as a PARALLEL RTX vector with elements
11732 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11733 from the perspective of the architecture. See the diagram above
11734 aarch64_simd_vect_par_cnst_half for more details. */
11736 bool
11737 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11738 bool high)
11740 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11741 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11742 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11743 int i = 0;
11745 if (!VECTOR_MODE_P (mode))
11746 return false;
11748 if (count_op != count_ideal)
11749 return false;
11751 for (i = 0; i < count_ideal; i++)
11753 rtx elt_op = XVECEXP (op, 0, i);
11754 rtx elt_ideal = XVECEXP (ideal, 0, i);
11756 if (!CONST_INT_P (elt_op)
11757 || INTVAL (elt_ideal) != INTVAL (elt_op))
11758 return false;
11760 return true;
11763 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11764 HIGH (exclusive). */
11765 void
11766 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11767 const_tree exp)
11769 HOST_WIDE_INT lane;
11770 gcc_assert (CONST_INT_P (operand));
11771 lane = INTVAL (operand);
11773 if (lane < low || lane >= high)
11775 if (exp)
11776 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11777 else
11778 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11782 /* Return TRUE if OP is a valid vector addressing mode. */
11783 bool
11784 aarch64_simd_mem_operand_p (rtx op)
11786 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11787 || REG_P (XEXP (op, 0)));
11790 /* Emit a register copy from operand to operand, taking care not to
11791 early-clobber source registers in the process.
11793 COUNT is the number of components into which the copy needs to be
11794 decomposed. */
11795 void
11796 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11797 unsigned int count)
11799 unsigned int i;
11800 int rdest = REGNO (operands[0]);
11801 int rsrc = REGNO (operands[1]);
11803 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11804 || rdest < rsrc)
11805 for (i = 0; i < count; i++)
11806 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11807 gen_rtx_REG (mode, rsrc + i));
11808 else
11809 for (i = 0; i < count; i++)
11810 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11811 gen_rtx_REG (mode, rsrc + count - i - 1));
11814 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11815 one of VSTRUCT modes: OI, CI, or XI. */
11817 aarch64_simd_attr_length_rglist (machine_mode mode)
11819 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11822 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11823 alignment of a vector to 128 bits. */
11824 static HOST_WIDE_INT
11825 aarch64_simd_vector_alignment (const_tree type)
11827 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11828 return MIN (align, 128);
11831 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11832 static bool
11833 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11835 if (is_packed)
11836 return false;
11838 /* We guarantee alignment for vectors up to 128-bits. */
11839 if (tree_int_cst_compare (TYPE_SIZE (type),
11840 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11841 return false;
11843 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11844 return true;
11847 /* Return true if the vector misalignment factor is supported by the
11848 target. */
11849 static bool
11850 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11851 const_tree type, int misalignment,
11852 bool is_packed)
11854 if (TARGET_SIMD && STRICT_ALIGNMENT)
11856 /* Return if movmisalign pattern is not supported for this mode. */
11857 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11858 return false;
11860 if (misalignment == -1)
11862 /* Misalignment factor is unknown at compile time but we know
11863 it's word aligned. */
11864 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11866 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11868 if (element_size != 64)
11869 return true;
11871 return false;
11874 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11875 is_packed);
11878 /* If VALS is a vector constant that can be loaded into a register
11879 using DUP, generate instructions to do so and return an RTX to
11880 assign to the register. Otherwise return NULL_RTX. */
11881 static rtx
11882 aarch64_simd_dup_constant (rtx vals)
11884 machine_mode mode = GET_MODE (vals);
11885 machine_mode inner_mode = GET_MODE_INNER (mode);
11886 rtx x;
11888 if (!const_vec_duplicate_p (vals, &x))
11889 return NULL_RTX;
11891 /* We can load this constant by using DUP and a constant in a
11892 single ARM register. This will be cheaper than a vector
11893 load. */
11894 x = copy_to_mode_reg (inner_mode, x);
11895 return gen_rtx_VEC_DUPLICATE (mode, x);
11899 /* Generate code to load VALS, which is a PARALLEL containing only
11900 constants (for vec_init) or CONST_VECTOR, efficiently into a
11901 register. Returns an RTX to copy into the register, or NULL_RTX
11902 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11903 static rtx
11904 aarch64_simd_make_constant (rtx vals)
11906 machine_mode mode = GET_MODE (vals);
11907 rtx const_dup;
11908 rtx const_vec = NULL_RTX;
11909 int n_elts = GET_MODE_NUNITS (mode);
11910 int n_const = 0;
11911 int i;
11913 if (GET_CODE (vals) == CONST_VECTOR)
11914 const_vec = vals;
11915 else if (GET_CODE (vals) == PARALLEL)
11917 /* A CONST_VECTOR must contain only CONST_INTs and
11918 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11919 Only store valid constants in a CONST_VECTOR. */
11920 for (i = 0; i < n_elts; ++i)
11922 rtx x = XVECEXP (vals, 0, i);
11923 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11924 n_const++;
11926 if (n_const == n_elts)
11927 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11929 else
11930 gcc_unreachable ();
11932 if (const_vec != NULL_RTX
11933 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11934 /* Load using MOVI/MVNI. */
11935 return const_vec;
11936 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11937 /* Loaded using DUP. */
11938 return const_dup;
11939 else if (const_vec != NULL_RTX)
11940 /* Load from constant pool. We can not take advantage of single-cycle
11941 LD1 because we need a PC-relative addressing mode. */
11942 return const_vec;
11943 else
11944 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11945 We can not construct an initializer. */
11946 return NULL_RTX;
11949 /* Expand a vector initialisation sequence, such that TARGET is
11950 initialised to contain VALS. */
11952 void
11953 aarch64_expand_vector_init (rtx target, rtx vals)
11955 machine_mode mode = GET_MODE (target);
11956 machine_mode inner_mode = GET_MODE_INNER (mode);
11957 /* The number of vector elements. */
11958 int n_elts = GET_MODE_NUNITS (mode);
11959 /* The number of vector elements which are not constant. */
11960 int n_var = 0;
11961 rtx any_const = NULL_RTX;
11962 /* The first element of vals. */
11963 rtx v0 = XVECEXP (vals, 0, 0);
11964 bool all_same = true;
11966 /* Count the number of variable elements to initialise. */
11967 for (int i = 0; i < n_elts; ++i)
11969 rtx x = XVECEXP (vals, 0, i);
11970 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11971 ++n_var;
11972 else
11973 any_const = x;
11975 all_same &= rtx_equal_p (x, v0);
11978 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11979 how best to handle this. */
11980 if (n_var == 0)
11982 rtx constant = aarch64_simd_make_constant (vals);
11983 if (constant != NULL_RTX)
11985 emit_move_insn (target, constant);
11986 return;
11990 /* Splat a single non-constant element if we can. */
11991 if (all_same)
11993 rtx x = copy_to_mode_reg (inner_mode, v0);
11994 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11995 return;
11998 enum insn_code icode = optab_handler (vec_set_optab, mode);
11999 gcc_assert (icode != CODE_FOR_nothing);
12001 /* If there are only variable elements, try to optimize
12002 the insertion using dup for the most common element
12003 followed by insertions. */
12005 /* The algorithm will fill matches[*][0] with the earliest matching element,
12006 and matches[X][1] with the count of duplicate elements (if X is the
12007 earliest element which has duplicates). */
12009 if (n_var == n_elts && n_elts <= 16)
12011 int matches[16][2] = {0};
12012 for (int i = 0; i < n_elts; i++)
12014 for (int j = 0; j <= i; j++)
12016 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12018 matches[i][0] = j;
12019 matches[j][1]++;
12020 break;
12024 int maxelement = 0;
12025 int maxv = 0;
12026 for (int i = 0; i < n_elts; i++)
12027 if (matches[i][1] > maxv)
12029 maxelement = i;
12030 maxv = matches[i][1];
12033 /* Create a duplicate of the most common element. */
12034 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12035 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12037 /* Insert the rest. */
12038 for (int i = 0; i < n_elts; i++)
12040 rtx x = XVECEXP (vals, 0, i);
12041 if (matches[i][0] == maxelement)
12042 continue;
12043 x = copy_to_mode_reg (inner_mode, x);
12044 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12046 return;
12049 /* Initialise a vector which is part-variable. We want to first try
12050 to build those lanes which are constant in the most efficient way we
12051 can. */
12052 if (n_var != n_elts)
12054 rtx copy = copy_rtx (vals);
12056 /* Load constant part of vector. We really don't care what goes into the
12057 parts we will overwrite, but we're more likely to be able to load the
12058 constant efficiently if it has fewer, larger, repeating parts
12059 (see aarch64_simd_valid_immediate). */
12060 for (int i = 0; i < n_elts; i++)
12062 rtx x = XVECEXP (vals, 0, i);
12063 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12064 continue;
12065 rtx subst = any_const;
12066 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12068 /* Look in the copied vector, as more elements are const. */
12069 rtx test = XVECEXP (copy, 0, i ^ bit);
12070 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12072 subst = test;
12073 break;
12076 XVECEXP (copy, 0, i) = subst;
12078 aarch64_expand_vector_init (target, copy);
12081 /* Insert the variable lanes directly. */
12082 for (int i = 0; i < n_elts; i++)
12084 rtx x = XVECEXP (vals, 0, i);
12085 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12086 continue;
12087 x = copy_to_mode_reg (inner_mode, x);
12088 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12092 static unsigned HOST_WIDE_INT
12093 aarch64_shift_truncation_mask (machine_mode mode)
12095 return
12096 (!SHIFT_COUNT_TRUNCATED
12097 || aarch64_vector_mode_supported_p (mode)
12098 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12101 /* Select a format to encode pointers in exception handling data. */
12103 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12105 int type;
12106 switch (aarch64_cmodel)
12108 case AARCH64_CMODEL_TINY:
12109 case AARCH64_CMODEL_TINY_PIC:
12110 case AARCH64_CMODEL_SMALL:
12111 case AARCH64_CMODEL_SMALL_PIC:
12112 case AARCH64_CMODEL_SMALL_SPIC:
12113 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12114 for everything. */
12115 type = DW_EH_PE_sdata4;
12116 break;
12117 default:
12118 /* No assumptions here. 8-byte relocs required. */
12119 type = DW_EH_PE_sdata8;
12120 break;
12122 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12125 /* The last .arch and .tune assembly strings that we printed. */
12126 static std::string aarch64_last_printed_arch_string;
12127 static std::string aarch64_last_printed_tune_string;
12129 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12130 by the function fndecl. */
12132 void
12133 aarch64_declare_function_name (FILE *stream, const char* name,
12134 tree fndecl)
12136 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12138 struct cl_target_option *targ_options;
12139 if (target_parts)
12140 targ_options = TREE_TARGET_OPTION (target_parts);
12141 else
12142 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12143 gcc_assert (targ_options);
12145 const struct processor *this_arch
12146 = aarch64_get_arch (targ_options->x_explicit_arch);
12148 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12149 std::string extension
12150 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12151 this_arch->flags);
12152 /* Only update the assembler .arch string if it is distinct from the last
12153 such string we printed. */
12154 std::string to_print = this_arch->name + extension;
12155 if (to_print != aarch64_last_printed_arch_string)
12157 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12158 aarch64_last_printed_arch_string = to_print;
12161 /* Print the cpu name we're tuning for in the comments, might be
12162 useful to readers of the generated asm. Do it only when it changes
12163 from function to function and verbose assembly is requested. */
12164 const struct processor *this_tune
12165 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12167 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12169 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12170 this_tune->name);
12171 aarch64_last_printed_tune_string = this_tune->name;
12174 /* Don't forget the type directive for ELF. */
12175 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12176 ASM_OUTPUT_LABEL (stream, name);
12179 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12181 static void
12182 aarch64_start_file (void)
12184 struct cl_target_option *default_options
12185 = TREE_TARGET_OPTION (target_option_default_node);
12187 const struct processor *default_arch
12188 = aarch64_get_arch (default_options->x_explicit_arch);
12189 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12190 std::string extension
12191 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12192 default_arch->flags);
12194 aarch64_last_printed_arch_string = default_arch->name + extension;
12195 aarch64_last_printed_tune_string = "";
12196 asm_fprintf (asm_out_file, "\t.arch %s\n",
12197 aarch64_last_printed_arch_string.c_str ());
12199 default_file_start ();
12202 /* Emit load exclusive. */
12204 static void
12205 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12206 rtx mem, rtx model_rtx)
12208 rtx (*gen) (rtx, rtx, rtx);
12210 switch (mode)
12212 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
12213 case HImode: gen = gen_aarch64_load_exclusivehi; break;
12214 case SImode: gen = gen_aarch64_load_exclusivesi; break;
12215 case DImode: gen = gen_aarch64_load_exclusivedi; break;
12216 default:
12217 gcc_unreachable ();
12220 emit_insn (gen (rval, mem, model_rtx));
12223 /* Emit store exclusive. */
12225 static void
12226 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12227 rtx rval, rtx mem, rtx model_rtx)
12229 rtx (*gen) (rtx, rtx, rtx, rtx);
12231 switch (mode)
12233 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
12234 case HImode: gen = gen_aarch64_store_exclusivehi; break;
12235 case SImode: gen = gen_aarch64_store_exclusivesi; break;
12236 case DImode: gen = gen_aarch64_store_exclusivedi; break;
12237 default:
12238 gcc_unreachable ();
12241 emit_insn (gen (bval, rval, mem, model_rtx));
12244 /* Mark the previous jump instruction as unlikely. */
12246 static void
12247 aarch64_emit_unlikely_jump (rtx insn)
12249 rtx_insn *jump = emit_jump_insn (insn);
12250 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12253 /* Expand a compare and swap pattern. */
12255 void
12256 aarch64_expand_compare_and_swap (rtx operands[])
12258 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12259 machine_mode mode, cmp_mode;
12260 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12261 int idx;
12262 gen_cas_fn gen;
12263 const gen_cas_fn split_cas[] =
12265 gen_aarch64_compare_and_swapqi,
12266 gen_aarch64_compare_and_swaphi,
12267 gen_aarch64_compare_and_swapsi,
12268 gen_aarch64_compare_and_swapdi
12270 const gen_cas_fn atomic_cas[] =
12272 gen_aarch64_compare_and_swapqi_lse,
12273 gen_aarch64_compare_and_swaphi_lse,
12274 gen_aarch64_compare_and_swapsi_lse,
12275 gen_aarch64_compare_and_swapdi_lse
12278 bval = operands[0];
12279 rval = operands[1];
12280 mem = operands[2];
12281 oldval = operands[3];
12282 newval = operands[4];
12283 is_weak = operands[5];
12284 mod_s = operands[6];
12285 mod_f = operands[7];
12286 mode = GET_MODE (mem);
12287 cmp_mode = mode;
12289 /* Normally the succ memory model must be stronger than fail, but in the
12290 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12291 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12293 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12294 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12295 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12297 switch (mode)
12299 case QImode:
12300 case HImode:
12301 /* For short modes, we're going to perform the comparison in SImode,
12302 so do the zero-extension now. */
12303 cmp_mode = SImode;
12304 rval = gen_reg_rtx (SImode);
12305 oldval = convert_modes (SImode, mode, oldval, true);
12306 /* Fall through. */
12308 case SImode:
12309 case DImode:
12310 /* Force the value into a register if needed. */
12311 if (!aarch64_plus_operand (oldval, mode))
12312 oldval = force_reg (cmp_mode, oldval);
12313 break;
12315 default:
12316 gcc_unreachable ();
12319 switch (mode)
12321 case QImode: idx = 0; break;
12322 case HImode: idx = 1; break;
12323 case SImode: idx = 2; break;
12324 case DImode: idx = 3; break;
12325 default:
12326 gcc_unreachable ();
12328 if (TARGET_LSE)
12329 gen = atomic_cas[idx];
12330 else
12331 gen = split_cas[idx];
12333 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12335 if (mode == QImode || mode == HImode)
12336 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12338 x = gen_rtx_REG (CCmode, CC_REGNUM);
12339 x = gen_rtx_EQ (SImode, x, const0_rtx);
12340 emit_insn (gen_rtx_SET (bval, x));
12343 /* Test whether the target supports using a atomic load-operate instruction.
12344 CODE is the operation and AFTER is TRUE if the data in memory after the
12345 operation should be returned and FALSE if the data before the operation
12346 should be returned. Returns FALSE if the operation isn't supported by the
12347 architecture. */
12349 bool
12350 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12352 if (!TARGET_LSE)
12353 return false;
12355 switch (code)
12357 case SET:
12358 case AND:
12359 case IOR:
12360 case XOR:
12361 case MINUS:
12362 case PLUS:
12363 return true;
12364 default:
12365 return false;
12369 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12370 sequence implementing an atomic operation. */
12372 static void
12373 aarch64_emit_post_barrier (enum memmodel model)
12375 const enum memmodel base_model = memmodel_base (model);
12377 if (is_mm_sync (model)
12378 && (base_model == MEMMODEL_ACQUIRE
12379 || base_model == MEMMODEL_ACQ_REL
12380 || base_model == MEMMODEL_SEQ_CST))
12382 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12386 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12387 for the data in memory. EXPECTED is the value expected to be in memory.
12388 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12389 is the memory ordering to use. */
12391 void
12392 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12393 rtx expected, rtx desired,
12394 rtx model)
12396 rtx (*gen) (rtx, rtx, rtx, rtx);
12397 machine_mode mode;
12399 mode = GET_MODE (mem);
12401 switch (mode)
12403 case QImode: gen = gen_aarch64_atomic_casqi; break;
12404 case HImode: gen = gen_aarch64_atomic_cashi; break;
12405 case SImode: gen = gen_aarch64_atomic_cassi; break;
12406 case DImode: gen = gen_aarch64_atomic_casdi; break;
12407 default:
12408 gcc_unreachable ();
12411 /* Move the expected value into the CAS destination register. */
12412 emit_insn (gen_rtx_SET (rval, expected));
12414 /* Emit the CAS. */
12415 emit_insn (gen (rval, mem, desired, model));
12417 /* Compare the expected value with the value loaded by the CAS, to establish
12418 whether the swap was made. */
12419 aarch64_gen_compare_reg (EQ, rval, expected);
12422 /* Split a compare and swap pattern. */
12424 void
12425 aarch64_split_compare_and_swap (rtx operands[])
12427 rtx rval, mem, oldval, newval, scratch;
12428 machine_mode mode;
12429 bool is_weak;
12430 rtx_code_label *label1, *label2;
12431 rtx x, cond;
12432 enum memmodel model;
12433 rtx model_rtx;
12435 rval = operands[0];
12436 mem = operands[1];
12437 oldval = operands[2];
12438 newval = operands[3];
12439 is_weak = (operands[4] != const0_rtx);
12440 model_rtx = operands[5];
12441 scratch = operands[7];
12442 mode = GET_MODE (mem);
12443 model = memmodel_from_int (INTVAL (model_rtx));
12445 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12446 loop:
12447 .label1:
12448 LD[A]XR rval, [mem]
12449 CBNZ rval, .label2
12450 ST[L]XR scratch, newval, [mem]
12451 CBNZ scratch, .label1
12452 .label2:
12453 CMP rval, 0. */
12454 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12456 label1 = NULL;
12457 if (!is_weak)
12459 label1 = gen_label_rtx ();
12460 emit_label (label1);
12462 label2 = gen_label_rtx ();
12464 /* The initial load can be relaxed for a __sync operation since a final
12465 barrier will be emitted to stop code hoisting. */
12466 if (is_mm_sync (model))
12467 aarch64_emit_load_exclusive (mode, rval, mem,
12468 GEN_INT (MEMMODEL_RELAXED));
12469 else
12470 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12472 if (strong_zero_p)
12474 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12475 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12476 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12477 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12479 else
12481 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12482 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12483 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12484 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12485 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12488 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12490 if (!is_weak)
12492 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12493 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12494 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12495 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12497 else
12499 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12500 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12501 emit_insn (gen_rtx_SET (cond, x));
12504 emit_label (label2);
12505 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12506 to set the condition flags. If this is not used it will be removed by
12507 later passes. */
12508 if (strong_zero_p)
12510 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12511 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12512 emit_insn (gen_rtx_SET (cond, x));
12514 /* Emit any final barrier needed for a __sync operation. */
12515 if (is_mm_sync (model))
12516 aarch64_emit_post_barrier (model);
12519 /* Emit a BIC instruction. */
12521 static void
12522 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12524 rtx shift_rtx = GEN_INT (shift);
12525 rtx (*gen) (rtx, rtx, rtx, rtx);
12527 switch (mode)
12529 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12530 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12531 default:
12532 gcc_unreachable ();
12535 emit_insn (gen (dst, s2, shift_rtx, s1));
12538 /* Emit an atomic swap. */
12540 static void
12541 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12542 rtx mem, rtx model)
12544 rtx (*gen) (rtx, rtx, rtx, rtx);
12546 switch (mode)
12548 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12549 case HImode: gen = gen_aarch64_atomic_swphi; break;
12550 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12551 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12552 default:
12553 gcc_unreachable ();
12556 emit_insn (gen (dst, mem, value, model));
12559 /* Operations supported by aarch64_emit_atomic_load_op. */
12561 enum aarch64_atomic_load_op_code
12563 AARCH64_LDOP_PLUS, /* A + B */
12564 AARCH64_LDOP_XOR, /* A ^ B */
12565 AARCH64_LDOP_OR, /* A | B */
12566 AARCH64_LDOP_BIC /* A & ~B */
12569 /* Emit an atomic load-operate. */
12571 static void
12572 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12573 machine_mode mode, rtx dst, rtx src,
12574 rtx mem, rtx model)
12576 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12577 const aarch64_atomic_load_op_fn plus[] =
12579 gen_aarch64_atomic_loadaddqi,
12580 gen_aarch64_atomic_loadaddhi,
12581 gen_aarch64_atomic_loadaddsi,
12582 gen_aarch64_atomic_loadadddi
12584 const aarch64_atomic_load_op_fn eor[] =
12586 gen_aarch64_atomic_loadeorqi,
12587 gen_aarch64_atomic_loadeorhi,
12588 gen_aarch64_atomic_loadeorsi,
12589 gen_aarch64_atomic_loadeordi
12591 const aarch64_atomic_load_op_fn ior[] =
12593 gen_aarch64_atomic_loadsetqi,
12594 gen_aarch64_atomic_loadsethi,
12595 gen_aarch64_atomic_loadsetsi,
12596 gen_aarch64_atomic_loadsetdi
12598 const aarch64_atomic_load_op_fn bic[] =
12600 gen_aarch64_atomic_loadclrqi,
12601 gen_aarch64_atomic_loadclrhi,
12602 gen_aarch64_atomic_loadclrsi,
12603 gen_aarch64_atomic_loadclrdi
12605 aarch64_atomic_load_op_fn gen;
12606 int idx = 0;
12608 switch (mode)
12610 case QImode: idx = 0; break;
12611 case HImode: idx = 1; break;
12612 case SImode: idx = 2; break;
12613 case DImode: idx = 3; break;
12614 default:
12615 gcc_unreachable ();
12618 switch (code)
12620 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12621 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12622 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12623 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12624 default:
12625 gcc_unreachable ();
12628 emit_insn (gen (dst, mem, src, model));
12631 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12632 location to store the data read from memory. OUT_RESULT is the location to
12633 store the result of the operation. MEM is the memory location to read and
12634 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12635 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12636 be NULL. */
12638 void
12639 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12640 rtx mem, rtx value, rtx model_rtx)
12642 machine_mode mode = GET_MODE (mem);
12643 machine_mode wmode = (mode == DImode ? DImode : SImode);
12644 const bool short_mode = (mode < SImode);
12645 aarch64_atomic_load_op_code ldop_code;
12646 rtx src;
12647 rtx x;
12649 if (out_data)
12650 out_data = gen_lowpart (mode, out_data);
12652 if (out_result)
12653 out_result = gen_lowpart (mode, out_result);
12655 /* Make sure the value is in a register, putting it into a destination
12656 register if it needs to be manipulated. */
12657 if (!register_operand (value, mode)
12658 || code == AND || code == MINUS)
12660 src = out_result ? out_result : out_data;
12661 emit_move_insn (src, gen_lowpart (mode, value));
12663 else
12664 src = value;
12665 gcc_assert (register_operand (src, mode));
12667 /* Preprocess the data for the operation as necessary. If the operation is
12668 a SET then emit a swap instruction and finish. */
12669 switch (code)
12671 case SET:
12672 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12673 return;
12675 case MINUS:
12676 /* Negate the value and treat it as a PLUS. */
12678 rtx neg_src;
12680 /* Resize the value if necessary. */
12681 if (short_mode)
12682 src = gen_lowpart (wmode, src);
12684 neg_src = gen_rtx_NEG (wmode, src);
12685 emit_insn (gen_rtx_SET (src, neg_src));
12687 if (short_mode)
12688 src = gen_lowpart (mode, src);
12690 /* Fall-through. */
12691 case PLUS:
12692 ldop_code = AARCH64_LDOP_PLUS;
12693 break;
12695 case IOR:
12696 ldop_code = AARCH64_LDOP_OR;
12697 break;
12699 case XOR:
12700 ldop_code = AARCH64_LDOP_XOR;
12701 break;
12703 case AND:
12705 rtx not_src;
12707 /* Resize the value if necessary. */
12708 if (short_mode)
12709 src = gen_lowpart (wmode, src);
12711 not_src = gen_rtx_NOT (wmode, src);
12712 emit_insn (gen_rtx_SET (src, not_src));
12714 if (short_mode)
12715 src = gen_lowpart (mode, src);
12717 ldop_code = AARCH64_LDOP_BIC;
12718 break;
12720 default:
12721 /* The operation can't be done with atomic instructions. */
12722 gcc_unreachable ();
12725 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12727 /* If necessary, calculate the data in memory after the update by redoing the
12728 operation from values in registers. */
12729 if (!out_result)
12730 return;
12732 if (short_mode)
12734 src = gen_lowpart (wmode, src);
12735 out_data = gen_lowpart (wmode, out_data);
12736 out_result = gen_lowpart (wmode, out_result);
12739 x = NULL_RTX;
12741 switch (code)
12743 case MINUS:
12744 case PLUS:
12745 x = gen_rtx_PLUS (wmode, out_data, src);
12746 break;
12747 case IOR:
12748 x = gen_rtx_IOR (wmode, out_data, src);
12749 break;
12750 case XOR:
12751 x = gen_rtx_XOR (wmode, out_data, src);
12752 break;
12753 case AND:
12754 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12755 return;
12756 default:
12757 gcc_unreachable ();
12760 emit_set_insn (out_result, x);
12762 return;
12765 /* Split an atomic operation. */
12767 void
12768 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12769 rtx value, rtx model_rtx, rtx cond)
12771 machine_mode mode = GET_MODE (mem);
12772 machine_mode wmode = (mode == DImode ? DImode : SImode);
12773 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12774 const bool is_sync = is_mm_sync (model);
12775 rtx_code_label *label;
12776 rtx x;
12778 /* Split the atomic operation into a sequence. */
12779 label = gen_label_rtx ();
12780 emit_label (label);
12782 if (new_out)
12783 new_out = gen_lowpart (wmode, new_out);
12784 if (old_out)
12785 old_out = gen_lowpart (wmode, old_out);
12786 else
12787 old_out = new_out;
12788 value = simplify_gen_subreg (wmode, value, mode, 0);
12790 /* The initial load can be relaxed for a __sync operation since a final
12791 barrier will be emitted to stop code hoisting. */
12792 if (is_sync)
12793 aarch64_emit_load_exclusive (mode, old_out, mem,
12794 GEN_INT (MEMMODEL_RELAXED));
12795 else
12796 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12798 switch (code)
12800 case SET:
12801 new_out = value;
12802 break;
12804 case NOT:
12805 x = gen_rtx_AND (wmode, old_out, value);
12806 emit_insn (gen_rtx_SET (new_out, x));
12807 x = gen_rtx_NOT (wmode, new_out);
12808 emit_insn (gen_rtx_SET (new_out, x));
12809 break;
12811 case MINUS:
12812 if (CONST_INT_P (value))
12814 value = GEN_INT (-INTVAL (value));
12815 code = PLUS;
12817 /* Fall through. */
12819 default:
12820 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12821 emit_insn (gen_rtx_SET (new_out, x));
12822 break;
12825 aarch64_emit_store_exclusive (mode, cond, mem,
12826 gen_lowpart (mode, new_out), model_rtx);
12828 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12829 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12830 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12831 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12833 /* Emit any final barrier needed for a __sync operation. */
12834 if (is_sync)
12835 aarch64_emit_post_barrier (model);
12838 static void
12839 aarch64_init_libfuncs (void)
12841 /* Half-precision float operations. The compiler handles all operations
12842 with NULL libfuncs by converting to SFmode. */
12844 /* Conversions. */
12845 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12846 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12848 /* Arithmetic. */
12849 set_optab_libfunc (add_optab, HFmode, NULL);
12850 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12851 set_optab_libfunc (smul_optab, HFmode, NULL);
12852 set_optab_libfunc (neg_optab, HFmode, NULL);
12853 set_optab_libfunc (sub_optab, HFmode, NULL);
12855 /* Comparisons. */
12856 set_optab_libfunc (eq_optab, HFmode, NULL);
12857 set_optab_libfunc (ne_optab, HFmode, NULL);
12858 set_optab_libfunc (lt_optab, HFmode, NULL);
12859 set_optab_libfunc (le_optab, HFmode, NULL);
12860 set_optab_libfunc (ge_optab, HFmode, NULL);
12861 set_optab_libfunc (gt_optab, HFmode, NULL);
12862 set_optab_libfunc (unord_optab, HFmode, NULL);
12865 /* Target hook for c_mode_for_suffix. */
12866 static machine_mode
12867 aarch64_c_mode_for_suffix (char suffix)
12869 if (suffix == 'q')
12870 return TFmode;
12872 return VOIDmode;
12875 /* We can only represent floating point constants which will fit in
12876 "quarter-precision" values. These values are characterised by
12877 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12880 (-1)^s * (n/16) * 2^r
12882 Where:
12883 's' is the sign bit.
12884 'n' is an integer in the range 16 <= n <= 31.
12885 'r' is an integer in the range -3 <= r <= 4. */
12887 /* Return true iff X can be represented by a quarter-precision
12888 floating point immediate operand X. Note, we cannot represent 0.0. */
12889 bool
12890 aarch64_float_const_representable_p (rtx x)
12892 /* This represents our current view of how many bits
12893 make up the mantissa. */
12894 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12895 int exponent;
12896 unsigned HOST_WIDE_INT mantissa, mask;
12897 REAL_VALUE_TYPE r, m;
12898 bool fail;
12900 if (!CONST_DOUBLE_P (x))
12901 return false;
12903 /* We don't support HFmode constants yet. */
12904 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12905 return false;
12907 r = *CONST_DOUBLE_REAL_VALUE (x);
12909 /* We cannot represent infinities, NaNs or +/-zero. We won't
12910 know if we have +zero until we analyse the mantissa, but we
12911 can reject the other invalid values. */
12912 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12913 || REAL_VALUE_MINUS_ZERO (r))
12914 return false;
12916 /* Extract exponent. */
12917 r = real_value_abs (&r);
12918 exponent = REAL_EXP (&r);
12920 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12921 highest (sign) bit, with a fixed binary point at bit point_pos.
12922 m1 holds the low part of the mantissa, m2 the high part.
12923 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12924 bits for the mantissa, this can fail (low bits will be lost). */
12925 real_ldexp (&m, &r, point_pos - exponent);
12926 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12928 /* If the low part of the mantissa has bits set we cannot represent
12929 the value. */
12930 if (w.ulow () != 0)
12931 return false;
12932 /* We have rejected the lower HOST_WIDE_INT, so update our
12933 understanding of how many bits lie in the mantissa and
12934 look only at the high HOST_WIDE_INT. */
12935 mantissa = w.elt (1);
12936 point_pos -= HOST_BITS_PER_WIDE_INT;
12938 /* We can only represent values with a mantissa of the form 1.xxxx. */
12939 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12940 if ((mantissa & mask) != 0)
12941 return false;
12943 /* Having filtered unrepresentable values, we may now remove all
12944 but the highest 5 bits. */
12945 mantissa >>= point_pos - 5;
12947 /* We cannot represent the value 0.0, so reject it. This is handled
12948 elsewhere. */
12949 if (mantissa == 0)
12950 return false;
12952 /* Then, as bit 4 is always set, we can mask it off, leaving
12953 the mantissa in the range [0, 15]. */
12954 mantissa &= ~(1 << 4);
12955 gcc_assert (mantissa <= 15);
12957 /* GCC internally does not use IEEE754-like encoding (where normalized
12958 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12959 Our mantissa values are shifted 4 places to the left relative to
12960 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12961 by 5 places to correct for GCC's representation. */
12962 exponent = 5 - exponent;
12964 return (exponent >= 0 && exponent <= 7);
12967 char*
12968 aarch64_output_simd_mov_immediate (rtx const_vector,
12969 machine_mode mode,
12970 unsigned width)
12972 bool is_valid;
12973 static char templ[40];
12974 const char *mnemonic;
12975 const char *shift_op;
12976 unsigned int lane_count = 0;
12977 char element_char;
12979 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12981 /* This will return true to show const_vector is legal for use as either
12982 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12983 also update INFO to show how the immediate should be generated. */
12984 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12985 gcc_assert (is_valid);
12987 element_char = sizetochar (info.element_width);
12988 lane_count = width / info.element_width;
12990 mode = GET_MODE_INNER (mode);
12991 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12993 gcc_assert (info.shift == 0 && ! info.mvn);
12994 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12995 move immediate path. */
12996 if (aarch64_float_const_zero_rtx_p (info.value))
12997 info.value = GEN_INT (0);
12998 else
13000 const unsigned int buf_size = 20;
13001 char float_buf[buf_size] = {'\0'};
13002 real_to_decimal_for_mode (float_buf,
13003 CONST_DOUBLE_REAL_VALUE (info.value),
13004 buf_size, buf_size, 1, mode);
13006 if (lane_count == 1)
13007 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13008 else
13009 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13010 lane_count, element_char, float_buf);
13011 return templ;
13015 mnemonic = info.mvn ? "mvni" : "movi";
13016 shift_op = info.msl ? "msl" : "lsl";
13018 gcc_assert (CONST_INT_P (info.value));
13019 if (lane_count == 1)
13020 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13021 mnemonic, UINTVAL (info.value));
13022 else if (info.shift)
13023 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13024 ", %s %d", mnemonic, lane_count, element_char,
13025 UINTVAL (info.value), shift_op, info.shift);
13026 else
13027 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13028 mnemonic, lane_count, element_char, UINTVAL (info.value));
13029 return templ;
13032 char*
13033 aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
13036 /* If a floating point number was passed and we desire to use it in an
13037 integer mode do the conversion to integer. */
13038 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13040 unsigned HOST_WIDE_INT ival;
13041 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13042 gcc_unreachable ();
13043 immediate = gen_int_mode (ival, mode);
13046 machine_mode vmode;
13047 /* use a 64 bit mode for everything except for DI/DF mode, where we use
13048 a 128 bit vector mode. */
13049 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13051 gcc_assert (!VECTOR_MODE_P (mode));
13052 vmode = aarch64_simd_container_mode (mode, width);
13053 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13054 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13057 /* Split operands into moves from op[1] + op[2] into op[0]. */
13059 void
13060 aarch64_split_combinev16qi (rtx operands[3])
13062 unsigned int dest = REGNO (operands[0]);
13063 unsigned int src1 = REGNO (operands[1]);
13064 unsigned int src2 = REGNO (operands[2]);
13065 machine_mode halfmode = GET_MODE (operands[1]);
13066 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13067 rtx destlo, desthi;
13069 gcc_assert (halfmode == V16QImode);
13071 if (src1 == dest && src2 == dest + halfregs)
13073 /* No-op move. Can't split to nothing; emit something. */
13074 emit_note (NOTE_INSN_DELETED);
13075 return;
13078 /* Preserve register attributes for variable tracking. */
13079 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13080 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13081 GET_MODE_SIZE (halfmode));
13083 /* Special case of reversed high/low parts. */
13084 if (reg_overlap_mentioned_p (operands[2], destlo)
13085 && reg_overlap_mentioned_p (operands[1], desthi))
13087 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13088 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13089 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13091 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13093 /* Try to avoid unnecessary moves if part of the result
13094 is in the right place already. */
13095 if (src1 != dest)
13096 emit_move_insn (destlo, operands[1]);
13097 if (src2 != dest + halfregs)
13098 emit_move_insn (desthi, operands[2]);
13100 else
13102 if (src2 != dest + halfregs)
13103 emit_move_insn (desthi, operands[2]);
13104 if (src1 != dest)
13105 emit_move_insn (destlo, operands[1]);
13109 /* vec_perm support. */
13111 #define MAX_VECT_LEN 16
13113 struct expand_vec_perm_d
13115 rtx target, op0, op1;
13116 unsigned char perm[MAX_VECT_LEN];
13117 machine_mode vmode;
13118 unsigned char nelt;
13119 bool one_vector_p;
13120 bool testing_p;
13123 /* Generate a variable permutation. */
13125 static void
13126 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13128 machine_mode vmode = GET_MODE (target);
13129 bool one_vector_p = rtx_equal_p (op0, op1);
13131 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13132 gcc_checking_assert (GET_MODE (op0) == vmode);
13133 gcc_checking_assert (GET_MODE (op1) == vmode);
13134 gcc_checking_assert (GET_MODE (sel) == vmode);
13135 gcc_checking_assert (TARGET_SIMD);
13137 if (one_vector_p)
13139 if (vmode == V8QImode)
13141 /* Expand the argument to a V16QI mode by duplicating it. */
13142 rtx pair = gen_reg_rtx (V16QImode);
13143 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13144 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13146 else
13148 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13151 else
13153 rtx pair;
13155 if (vmode == V8QImode)
13157 pair = gen_reg_rtx (V16QImode);
13158 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13159 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13161 else
13163 pair = gen_reg_rtx (OImode);
13164 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13165 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13170 void
13171 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13173 machine_mode vmode = GET_MODE (target);
13174 unsigned int nelt = GET_MODE_NUNITS (vmode);
13175 bool one_vector_p = rtx_equal_p (op0, op1);
13176 rtx mask;
13178 /* The TBL instruction does not use a modulo index, so we must take care
13179 of that ourselves. */
13180 mask = aarch64_simd_gen_const_vector_dup (vmode,
13181 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13182 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13184 /* For big-endian, we also need to reverse the index within the vector
13185 (but not which vector). */
13186 if (BYTES_BIG_ENDIAN)
13188 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13189 if (!one_vector_p)
13190 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13191 sel = expand_simple_binop (vmode, XOR, sel, mask,
13192 NULL, 0, OPTAB_LIB_WIDEN);
13194 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13197 /* Recognize patterns suitable for the TRN instructions. */
13198 static bool
13199 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13201 unsigned int i, odd, mask, nelt = d->nelt;
13202 rtx out, in0, in1, x;
13203 rtx (*gen) (rtx, rtx, rtx);
13204 machine_mode vmode = d->vmode;
13206 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13207 return false;
13209 /* Note that these are little-endian tests.
13210 We correct for big-endian later. */
13211 if (d->perm[0] == 0)
13212 odd = 0;
13213 else if (d->perm[0] == 1)
13214 odd = 1;
13215 else
13216 return false;
13217 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13219 for (i = 0; i < nelt; i += 2)
13221 if (d->perm[i] != i + odd)
13222 return false;
13223 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13224 return false;
13227 /* Success! */
13228 if (d->testing_p)
13229 return true;
13231 in0 = d->op0;
13232 in1 = d->op1;
13233 if (BYTES_BIG_ENDIAN)
13235 x = in0, in0 = in1, in1 = x;
13236 odd = !odd;
13238 out = d->target;
13240 if (odd)
13242 switch (vmode)
13244 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
13245 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
13246 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
13247 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
13248 case V4SImode: gen = gen_aarch64_trn2v4si; break;
13249 case V2SImode: gen = gen_aarch64_trn2v2si; break;
13250 case V2DImode: gen = gen_aarch64_trn2v2di; break;
13251 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13252 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13253 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13254 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13255 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
13256 default:
13257 return false;
13260 else
13262 switch (vmode)
13264 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
13265 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
13266 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
13267 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
13268 case V4SImode: gen = gen_aarch64_trn1v4si; break;
13269 case V2SImode: gen = gen_aarch64_trn1v2si; break;
13270 case V2DImode: gen = gen_aarch64_trn1v2di; break;
13271 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13272 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13273 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13274 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13275 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
13276 default:
13277 return false;
13281 emit_insn (gen (out, in0, in1));
13282 return true;
13285 /* Recognize patterns suitable for the UZP instructions. */
13286 static bool
13287 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13289 unsigned int i, odd, mask, nelt = d->nelt;
13290 rtx out, in0, in1, x;
13291 rtx (*gen) (rtx, rtx, rtx);
13292 machine_mode vmode = d->vmode;
13294 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13295 return false;
13297 /* Note that these are little-endian tests.
13298 We correct for big-endian later. */
13299 if (d->perm[0] == 0)
13300 odd = 0;
13301 else if (d->perm[0] == 1)
13302 odd = 1;
13303 else
13304 return false;
13305 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13307 for (i = 0; i < nelt; i++)
13309 unsigned elt = (i * 2 + odd) & mask;
13310 if (d->perm[i] != elt)
13311 return false;
13314 /* Success! */
13315 if (d->testing_p)
13316 return true;
13318 in0 = d->op0;
13319 in1 = d->op1;
13320 if (BYTES_BIG_ENDIAN)
13322 x = in0, in0 = in1, in1 = x;
13323 odd = !odd;
13325 out = d->target;
13327 if (odd)
13329 switch (vmode)
13331 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13332 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13333 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13334 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13335 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
13336 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
13337 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
13338 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13339 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13340 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13341 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13342 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13343 default:
13344 return false;
13347 else
13349 switch (vmode)
13351 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13352 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13353 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13354 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13355 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
13356 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
13357 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
13358 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13359 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13360 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13361 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13362 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13363 default:
13364 return false;
13368 emit_insn (gen (out, in0, in1));
13369 return true;
13372 /* Recognize patterns suitable for the ZIP instructions. */
13373 static bool
13374 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13376 unsigned int i, high, mask, nelt = d->nelt;
13377 rtx out, in0, in1, x;
13378 rtx (*gen) (rtx, rtx, rtx);
13379 machine_mode vmode = d->vmode;
13381 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13382 return false;
13384 /* Note that these are little-endian tests.
13385 We correct for big-endian later. */
13386 high = nelt / 2;
13387 if (d->perm[0] == high)
13388 /* Do Nothing. */
13390 else if (d->perm[0] == 0)
13391 high = 0;
13392 else
13393 return false;
13394 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13396 for (i = 0; i < nelt / 2; i++)
13398 unsigned elt = (i + high) & mask;
13399 if (d->perm[i * 2] != elt)
13400 return false;
13401 elt = (elt + nelt) & mask;
13402 if (d->perm[i * 2 + 1] != elt)
13403 return false;
13406 /* Success! */
13407 if (d->testing_p)
13408 return true;
13410 in0 = d->op0;
13411 in1 = d->op1;
13412 if (BYTES_BIG_ENDIAN)
13414 x = in0, in0 = in1, in1 = x;
13415 high = !high;
13417 out = d->target;
13419 if (high)
13421 switch (vmode)
13423 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
13424 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
13425 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
13426 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
13427 case V4SImode: gen = gen_aarch64_zip2v4si; break;
13428 case V2SImode: gen = gen_aarch64_zip2v2si; break;
13429 case V2DImode: gen = gen_aarch64_zip2v2di; break;
13430 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13431 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13432 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13433 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13434 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
13435 default:
13436 return false;
13439 else
13441 switch (vmode)
13443 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13444 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13445 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13446 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13447 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13448 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13449 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13450 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13451 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13452 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13453 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13454 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13455 default:
13456 return false;
13460 emit_insn (gen (out, in0, in1));
13461 return true;
13464 /* Recognize patterns for the EXT insn. */
13466 static bool
13467 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13469 unsigned int i, nelt = d->nelt;
13470 rtx (*gen) (rtx, rtx, rtx, rtx);
13471 rtx offset;
13473 unsigned int location = d->perm[0]; /* Always < nelt. */
13475 /* Check if the extracted indices are increasing by one. */
13476 for (i = 1; i < nelt; i++)
13478 unsigned int required = location + i;
13479 if (d->one_vector_p)
13481 /* We'll pass the same vector in twice, so allow indices to wrap. */
13482 required &= (nelt - 1);
13484 if (d->perm[i] != required)
13485 return false;
13488 switch (d->vmode)
13490 case V16QImode: gen = gen_aarch64_extv16qi; break;
13491 case V8QImode: gen = gen_aarch64_extv8qi; break;
13492 case V4HImode: gen = gen_aarch64_extv4hi; break;
13493 case V8HImode: gen = gen_aarch64_extv8hi; break;
13494 case V2SImode: gen = gen_aarch64_extv2si; break;
13495 case V4SImode: gen = gen_aarch64_extv4si; break;
13496 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13497 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13498 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13499 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13500 case V2DImode: gen = gen_aarch64_extv2di; break;
13501 case V2DFmode: gen = gen_aarch64_extv2df; break;
13502 default:
13503 return false;
13506 /* Success! */
13507 if (d->testing_p)
13508 return true;
13510 /* The case where (location == 0) is a no-op for both big- and little-endian,
13511 and is removed by the mid-end at optimization levels -O1 and higher. */
13513 if (BYTES_BIG_ENDIAN && (location != 0))
13515 /* After setup, we want the high elements of the first vector (stored
13516 at the LSB end of the register), and the low elements of the second
13517 vector (stored at the MSB end of the register). So swap. */
13518 std::swap (d->op0, d->op1);
13519 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13520 location = nelt - location;
13523 offset = GEN_INT (location);
13524 emit_insn (gen (d->target, d->op0, d->op1, offset));
13525 return true;
13528 /* Recognize patterns for the REV insns. */
13530 static bool
13531 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13533 unsigned int i, j, diff, nelt = d->nelt;
13534 rtx (*gen) (rtx, rtx);
13536 if (!d->one_vector_p)
13537 return false;
13539 diff = d->perm[0];
13540 switch (diff)
13542 case 7:
13543 switch (d->vmode)
13545 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13546 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13547 default:
13548 return false;
13550 break;
13551 case 3:
13552 switch (d->vmode)
13554 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13555 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13556 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13557 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13558 default:
13559 return false;
13561 break;
13562 case 1:
13563 switch (d->vmode)
13565 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13566 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13567 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13568 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13569 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13570 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13571 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13572 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13573 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13574 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13575 default:
13576 return false;
13578 break;
13579 default:
13580 return false;
13583 for (i = 0; i < nelt ; i += diff + 1)
13584 for (j = 0; j <= diff; j += 1)
13586 /* This is guaranteed to be true as the value of diff
13587 is 7, 3, 1 and we should have enough elements in the
13588 queue to generate this. Getting a vector mask with a
13589 value of diff other than these values implies that
13590 something is wrong by the time we get here. */
13591 gcc_assert (i + j < nelt);
13592 if (d->perm[i + j] != i + diff - j)
13593 return false;
13596 /* Success! */
13597 if (d->testing_p)
13598 return true;
13600 emit_insn (gen (d->target, d->op0));
13601 return true;
13604 static bool
13605 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13607 rtx (*gen) (rtx, rtx, rtx);
13608 rtx out = d->target;
13609 rtx in0;
13610 machine_mode vmode = d->vmode;
13611 unsigned int i, elt, nelt = d->nelt;
13612 rtx lane;
13614 elt = d->perm[0];
13615 for (i = 1; i < nelt; i++)
13617 if (elt != d->perm[i])
13618 return false;
13621 /* The generic preparation in aarch64_expand_vec_perm_const_1
13622 swaps the operand order and the permute indices if it finds
13623 d->perm[0] to be in the second operand. Thus, we can always
13624 use d->op0 and need not do any extra arithmetic to get the
13625 correct lane number. */
13626 in0 = d->op0;
13627 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13629 switch (vmode)
13631 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13632 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13633 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13634 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13635 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13636 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13637 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13638 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13639 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13640 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13641 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13642 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13643 default:
13644 return false;
13647 emit_insn (gen (out, in0, lane));
13648 return true;
13651 static bool
13652 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13654 rtx rperm[MAX_VECT_LEN], sel;
13655 machine_mode vmode = d->vmode;
13656 unsigned int i, nelt = d->nelt;
13658 if (d->testing_p)
13659 return true;
13661 /* Generic code will try constant permutation twice. Once with the
13662 original mode and again with the elements lowered to QImode.
13663 So wait and don't do the selector expansion ourselves. */
13664 if (vmode != V8QImode && vmode != V16QImode)
13665 return false;
13667 for (i = 0; i < nelt; ++i)
13669 int nunits = GET_MODE_NUNITS (vmode);
13671 /* If big-endian and two vectors we end up with a weird mixed-endian
13672 mode on NEON. Reverse the index within each word but not the word
13673 itself. */
13674 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13675 : d->perm[i]);
13677 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13678 sel = force_reg (vmode, sel);
13680 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13681 return true;
13684 static bool
13685 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13687 /* The pattern matching functions above are written to look for a small
13688 number to begin the sequence (0, 1, N/2). If we begin with an index
13689 from the second operand, we can swap the operands. */
13690 if (d->perm[0] >= d->nelt)
13692 unsigned i, nelt = d->nelt;
13694 gcc_assert (nelt == (nelt & -nelt));
13695 for (i = 0; i < nelt; ++i)
13696 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13698 std::swap (d->op0, d->op1);
13701 if (TARGET_SIMD)
13703 if (aarch64_evpc_rev (d))
13704 return true;
13705 else if (aarch64_evpc_ext (d))
13706 return true;
13707 else if (aarch64_evpc_dup (d))
13708 return true;
13709 else if (aarch64_evpc_zip (d))
13710 return true;
13711 else if (aarch64_evpc_uzp (d))
13712 return true;
13713 else if (aarch64_evpc_trn (d))
13714 return true;
13715 return aarch64_evpc_tbl (d);
13717 return false;
13720 /* Expand a vec_perm_const pattern. */
13722 bool
13723 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13725 struct expand_vec_perm_d d;
13726 int i, nelt, which;
13728 d.target = target;
13729 d.op0 = op0;
13730 d.op1 = op1;
13732 d.vmode = GET_MODE (target);
13733 gcc_assert (VECTOR_MODE_P (d.vmode));
13734 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13735 d.testing_p = false;
13737 for (i = which = 0; i < nelt; ++i)
13739 rtx e = XVECEXP (sel, 0, i);
13740 int ei = INTVAL (e) & (2 * nelt - 1);
13741 which |= (ei < nelt ? 1 : 2);
13742 d.perm[i] = ei;
13745 switch (which)
13747 default:
13748 gcc_unreachable ();
13750 case 3:
13751 d.one_vector_p = false;
13752 if (!rtx_equal_p (op0, op1))
13753 break;
13755 /* The elements of PERM do not suggest that only the first operand
13756 is used, but both operands are identical. Allow easier matching
13757 of the permutation by folding the permutation into the single
13758 input vector. */
13759 /* Fall Through. */
13760 case 2:
13761 for (i = 0; i < nelt; ++i)
13762 d.perm[i] &= nelt - 1;
13763 d.op0 = op1;
13764 d.one_vector_p = true;
13765 break;
13767 case 1:
13768 d.op1 = op0;
13769 d.one_vector_p = true;
13770 break;
13773 return aarch64_expand_vec_perm_const_1 (&d);
13776 static bool
13777 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13778 const unsigned char *sel)
13780 struct expand_vec_perm_d d;
13781 unsigned int i, nelt, which;
13782 bool ret;
13784 d.vmode = vmode;
13785 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13786 d.testing_p = true;
13787 memcpy (d.perm, sel, nelt);
13789 /* Calculate whether all elements are in one vector. */
13790 for (i = which = 0; i < nelt; ++i)
13792 unsigned char e = d.perm[i];
13793 gcc_assert (e < 2 * nelt);
13794 which |= (e < nelt ? 1 : 2);
13797 /* If all elements are from the second vector, reindex as if from the
13798 first vector. */
13799 if (which == 2)
13800 for (i = 0; i < nelt; ++i)
13801 d.perm[i] -= nelt;
13803 /* Check whether the mask can be applied to a single vector. */
13804 d.one_vector_p = (which != 3);
13806 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13807 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13808 if (!d.one_vector_p)
13809 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13811 start_sequence ();
13812 ret = aarch64_expand_vec_perm_const_1 (&d);
13813 end_sequence ();
13815 return ret;
13819 aarch64_reverse_mask (machine_mode mode)
13821 /* We have to reverse each vector because we dont have
13822 a permuted load that can reverse-load according to ABI rules. */
13823 rtx mask;
13824 rtvec v = rtvec_alloc (16);
13825 int i, j;
13826 int nunits = GET_MODE_NUNITS (mode);
13827 int usize = GET_MODE_UNIT_SIZE (mode);
13829 gcc_assert (BYTES_BIG_ENDIAN);
13830 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13832 for (i = 0; i < nunits; i++)
13833 for (j = 0; j < usize; j++)
13834 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13835 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13836 return force_reg (V16QImode, mask);
13839 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13840 However due to issues with register allocation it is preferable to avoid
13841 tieing integer scalar and FP scalar modes. Executing integer operations
13842 in general registers is better than treating them as scalar vector
13843 operations. This reduces latency and avoids redundant int<->FP moves.
13844 So tie modes if they are either the same class, or vector modes with
13845 other vector modes, vector structs or any scalar mode.
13848 bool
13849 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13851 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13852 return true;
13854 /* We specifically want to allow elements of "structure" modes to
13855 be tieable to the structure. This more general condition allows
13856 other rarer situations too. */
13857 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13858 return true;
13860 /* Also allow any scalar modes with vectors. */
13861 if (aarch64_vector_mode_supported_p (mode1)
13862 || aarch64_vector_mode_supported_p (mode2))
13863 return true;
13865 return false;
13868 /* Return a new RTX holding the result of moving POINTER forward by
13869 AMOUNT bytes. */
13871 static rtx
13872 aarch64_move_pointer (rtx pointer, int amount)
13874 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13876 return adjust_automodify_address (pointer, GET_MODE (pointer),
13877 next, amount);
13880 /* Return a new RTX holding the result of moving POINTER forward by the
13881 size of the mode it points to. */
13883 static rtx
13884 aarch64_progress_pointer (rtx pointer)
13886 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13888 return aarch64_move_pointer (pointer, amount);
13891 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13892 MODE bytes. */
13894 static void
13895 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13896 machine_mode mode)
13898 rtx reg = gen_reg_rtx (mode);
13900 /* "Cast" the pointers to the correct mode. */
13901 *src = adjust_address (*src, mode, 0);
13902 *dst = adjust_address (*dst, mode, 0);
13903 /* Emit the memcpy. */
13904 emit_move_insn (reg, *src);
13905 emit_move_insn (*dst, reg);
13906 /* Move the pointers forward. */
13907 *src = aarch64_progress_pointer (*src);
13908 *dst = aarch64_progress_pointer (*dst);
13911 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13912 we succeed, otherwise return false. */
13914 bool
13915 aarch64_expand_movmem (rtx *operands)
13917 unsigned int n;
13918 rtx dst = operands[0];
13919 rtx src = operands[1];
13920 rtx base;
13921 bool speed_p = !optimize_function_for_size_p (cfun);
13923 /* When optimizing for size, give a better estimate of the length of a
13924 memcpy call, but use the default otherwise. */
13925 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13927 /* We can't do anything smart if the amount to copy is not constant. */
13928 if (!CONST_INT_P (operands[2]))
13929 return false;
13931 n = UINTVAL (operands[2]);
13933 /* Try to keep the number of instructions low. For cases below 16 bytes we
13934 need to make at most two moves. For cases above 16 bytes it will be one
13935 move for each 16 byte chunk, then at most two additional moves. */
13936 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13937 return false;
13939 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13940 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13942 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13943 src = adjust_automodify_address (src, VOIDmode, base, 0);
13945 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13946 1-byte chunk. */
13947 if (n < 4)
13949 if (n >= 2)
13951 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13952 n -= 2;
13955 if (n == 1)
13956 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13958 return true;
13961 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13962 4-byte chunk, partially overlapping with the previously copied chunk. */
13963 if (n < 8)
13965 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13966 n -= 4;
13967 if (n > 0)
13969 int move = n - 4;
13971 src = aarch64_move_pointer (src, move);
13972 dst = aarch64_move_pointer (dst, move);
13973 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13975 return true;
13978 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13979 them, then (if applicable) an 8-byte chunk. */
13980 while (n >= 8)
13982 if (n / 16)
13984 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13985 n -= 16;
13987 else
13989 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13990 n -= 8;
13994 /* Finish the final bytes of the copy. We can always do this in one
13995 instruction. We either copy the exact amount we need, or partially
13996 overlap with the previous chunk we copied and copy 8-bytes. */
13997 if (n == 0)
13998 return true;
13999 else if (n == 1)
14000 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14001 else if (n == 2)
14002 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14003 else if (n == 4)
14004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14005 else
14007 if (n == 3)
14009 src = aarch64_move_pointer (src, -1);
14010 dst = aarch64_move_pointer (dst, -1);
14011 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14013 else
14015 int move = n - 8;
14017 src = aarch64_move_pointer (src, move);
14018 dst = aarch64_move_pointer (dst, move);
14019 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14023 return true;
14026 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14027 SImode stores. Handle the case when the constant has identical
14028 bottom and top halves. This is beneficial when the two stores can be
14029 merged into an STP and we avoid synthesising potentially expensive
14030 immediates twice. Return true if such a split is possible. */
14032 bool
14033 aarch64_split_dimode_const_store (rtx dst, rtx src)
14035 rtx lo = gen_lowpart (SImode, src);
14036 rtx hi = gen_highpart_mode (SImode, DImode, src);
14038 bool size_p = optimize_function_for_size_p (cfun);
14040 if (!rtx_equal_p (lo, hi))
14041 return false;
14043 unsigned int orig_cost
14044 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14045 unsigned int lo_cost
14046 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14048 /* We want to transform:
14049 MOV x1, 49370
14050 MOVK x1, 0x140, lsl 16
14051 MOVK x1, 0xc0da, lsl 32
14052 MOVK x1, 0x140, lsl 48
14053 STR x1, [x0]
14054 into:
14055 MOV w1, 49370
14056 MOVK w1, 0x140, lsl 16
14057 STP w1, w1, [x0]
14058 So we want to perform this only when we save two instructions
14059 or more. When optimizing for size, however, accept any code size
14060 savings we can. */
14061 if (size_p && orig_cost <= lo_cost)
14062 return false;
14064 if (!size_p
14065 && (orig_cost <= lo_cost + 1))
14066 return false;
14068 rtx mem_lo = adjust_address (dst, SImode, 0);
14069 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14070 return false;
14072 rtx tmp_reg = gen_reg_rtx (SImode);
14073 aarch64_expand_mov_immediate (tmp_reg, lo);
14074 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14075 /* Don't emit an explicit store pair as this may not be always profitable.
14076 Let the sched-fusion logic decide whether to merge them. */
14077 emit_move_insn (mem_lo, tmp_reg);
14078 emit_move_insn (mem_hi, tmp_reg);
14080 return true;
14083 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14085 static unsigned HOST_WIDE_INT
14086 aarch64_asan_shadow_offset (void)
14088 return (HOST_WIDE_INT_1 << 36);
14091 static bool
14092 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14093 unsigned int align,
14094 enum by_pieces_operation op,
14095 bool speed_p)
14097 /* STORE_BY_PIECES can be used when copying a constant string, but
14098 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14099 For now we always fail this and let the move_by_pieces code copy
14100 the string from read-only memory. */
14101 if (op == STORE_BY_PIECES)
14102 return false;
14104 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14107 static rtx
14108 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14109 int code, tree treeop0, tree treeop1)
14111 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14112 rtx op0, op1;
14113 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14114 insn_code icode;
14115 struct expand_operand ops[4];
14117 start_sequence ();
14118 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14120 op_mode = GET_MODE (op0);
14121 if (op_mode == VOIDmode)
14122 op_mode = GET_MODE (op1);
14124 switch (op_mode)
14126 case QImode:
14127 case HImode:
14128 case SImode:
14129 cmp_mode = SImode;
14130 icode = CODE_FOR_cmpsi;
14131 break;
14133 case DImode:
14134 cmp_mode = DImode;
14135 icode = CODE_FOR_cmpdi;
14136 break;
14138 case SFmode:
14139 cmp_mode = SFmode;
14140 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14141 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14142 break;
14144 case DFmode:
14145 cmp_mode = DFmode;
14146 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14147 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14148 break;
14150 default:
14151 end_sequence ();
14152 return NULL_RTX;
14155 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14156 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14157 if (!op0 || !op1)
14159 end_sequence ();
14160 return NULL_RTX;
14162 *prep_seq = get_insns ();
14163 end_sequence ();
14165 create_fixed_operand (&ops[0], op0);
14166 create_fixed_operand (&ops[1], op1);
14168 start_sequence ();
14169 if (!maybe_expand_insn (icode, 2, ops))
14171 end_sequence ();
14172 return NULL_RTX;
14174 *gen_seq = get_insns ();
14175 end_sequence ();
14177 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14178 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14181 static rtx
14182 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14183 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14185 rtx op0, op1, target;
14186 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14187 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14188 insn_code icode;
14189 struct expand_operand ops[6];
14190 int aarch64_cond;
14192 push_to_sequence (*prep_seq);
14193 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14195 op_mode = GET_MODE (op0);
14196 if (op_mode == VOIDmode)
14197 op_mode = GET_MODE (op1);
14199 switch (op_mode)
14201 case QImode:
14202 case HImode:
14203 case SImode:
14204 cmp_mode = SImode;
14205 icode = CODE_FOR_ccmpsi;
14206 break;
14208 case DImode:
14209 cmp_mode = DImode;
14210 icode = CODE_FOR_ccmpdi;
14211 break;
14213 case SFmode:
14214 cmp_mode = SFmode;
14215 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14216 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14217 break;
14219 case DFmode:
14220 cmp_mode = DFmode;
14221 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14222 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14223 break;
14225 default:
14226 end_sequence ();
14227 return NULL_RTX;
14230 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14231 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14232 if (!op0 || !op1)
14234 end_sequence ();
14235 return NULL_RTX;
14237 *prep_seq = get_insns ();
14238 end_sequence ();
14240 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14241 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14243 if (bit_code != AND)
14245 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14246 GET_MODE (XEXP (prev, 0))),
14247 VOIDmode, XEXP (prev, 0), const0_rtx);
14248 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14251 create_fixed_operand (&ops[0], XEXP (prev, 0));
14252 create_fixed_operand (&ops[1], target);
14253 create_fixed_operand (&ops[2], op0);
14254 create_fixed_operand (&ops[3], op1);
14255 create_fixed_operand (&ops[4], prev);
14256 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14258 push_to_sequence (*gen_seq);
14259 if (!maybe_expand_insn (icode, 6, ops))
14261 end_sequence ();
14262 return NULL_RTX;
14265 *gen_seq = get_insns ();
14266 end_sequence ();
14268 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14271 #undef TARGET_GEN_CCMP_FIRST
14272 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14274 #undef TARGET_GEN_CCMP_NEXT
14275 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14277 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14278 instruction fusion of some sort. */
14280 static bool
14281 aarch64_macro_fusion_p (void)
14283 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14287 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14288 should be kept together during scheduling. */
14290 static bool
14291 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14293 rtx set_dest;
14294 rtx prev_set = single_set (prev);
14295 rtx curr_set = single_set (curr);
14296 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14297 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14299 if (!aarch64_macro_fusion_p ())
14300 return false;
14302 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14304 /* We are trying to match:
14305 prev (mov) == (set (reg r0) (const_int imm16))
14306 curr (movk) == (set (zero_extract (reg r0)
14307 (const_int 16)
14308 (const_int 16))
14309 (const_int imm16_1)) */
14311 set_dest = SET_DEST (curr_set);
14313 if (GET_CODE (set_dest) == ZERO_EXTRACT
14314 && CONST_INT_P (SET_SRC (curr_set))
14315 && CONST_INT_P (SET_SRC (prev_set))
14316 && CONST_INT_P (XEXP (set_dest, 2))
14317 && INTVAL (XEXP (set_dest, 2)) == 16
14318 && REG_P (XEXP (set_dest, 0))
14319 && REG_P (SET_DEST (prev_set))
14320 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14322 return true;
14326 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14329 /* We're trying to match:
14330 prev (adrp) == (set (reg r1)
14331 (high (symbol_ref ("SYM"))))
14332 curr (add) == (set (reg r0)
14333 (lo_sum (reg r1)
14334 (symbol_ref ("SYM"))))
14335 Note that r0 need not necessarily be the same as r1, especially
14336 during pre-regalloc scheduling. */
14338 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14339 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14341 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14342 && REG_P (XEXP (SET_SRC (curr_set), 0))
14343 && REGNO (XEXP (SET_SRC (curr_set), 0))
14344 == REGNO (SET_DEST (prev_set))
14345 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14346 XEXP (SET_SRC (curr_set), 1)))
14347 return true;
14351 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14354 /* We're trying to match:
14355 prev (movk) == (set (zero_extract (reg r0)
14356 (const_int 16)
14357 (const_int 32))
14358 (const_int imm16_1))
14359 curr (movk) == (set (zero_extract (reg r0)
14360 (const_int 16)
14361 (const_int 48))
14362 (const_int imm16_2)) */
14364 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14365 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14366 && REG_P (XEXP (SET_DEST (prev_set), 0))
14367 && REG_P (XEXP (SET_DEST (curr_set), 0))
14368 && REGNO (XEXP (SET_DEST (prev_set), 0))
14369 == REGNO (XEXP (SET_DEST (curr_set), 0))
14370 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14371 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14372 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14373 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14374 && CONST_INT_P (SET_SRC (prev_set))
14375 && CONST_INT_P (SET_SRC (curr_set)))
14376 return true;
14379 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14381 /* We're trying to match:
14382 prev (adrp) == (set (reg r0)
14383 (high (symbol_ref ("SYM"))))
14384 curr (ldr) == (set (reg r1)
14385 (mem (lo_sum (reg r0)
14386 (symbol_ref ("SYM")))))
14388 curr (ldr) == (set (reg r1)
14389 (zero_extend (mem
14390 (lo_sum (reg r0)
14391 (symbol_ref ("SYM")))))) */
14392 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14393 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14395 rtx curr_src = SET_SRC (curr_set);
14397 if (GET_CODE (curr_src) == ZERO_EXTEND)
14398 curr_src = XEXP (curr_src, 0);
14400 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14401 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14402 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14403 == REGNO (SET_DEST (prev_set))
14404 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14405 XEXP (SET_SRC (prev_set), 0)))
14406 return true;
14410 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14411 && aarch_crypto_can_dual_issue (prev, curr))
14412 return true;
14414 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14415 && any_condjump_p (curr))
14417 enum attr_type prev_type = get_attr_type (prev);
14419 unsigned int condreg1, condreg2;
14420 rtx cc_reg_1;
14421 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14422 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14424 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14425 && prev
14426 && modified_in_p (cc_reg_1, prev))
14428 /* FIXME: this misses some which is considered simple arthematic
14429 instructions for ThunderX. Simple shifts are missed here. */
14430 if (prev_type == TYPE_ALUS_SREG
14431 || prev_type == TYPE_ALUS_IMM
14432 || prev_type == TYPE_LOGICS_REG
14433 || prev_type == TYPE_LOGICS_IMM)
14434 return true;
14438 if (prev_set
14439 && curr_set
14440 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14441 && any_condjump_p (curr))
14443 /* We're trying to match:
14444 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14445 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14446 (const_int 0))
14447 (label_ref ("SYM"))
14448 (pc)) */
14449 if (SET_DEST (curr_set) == (pc_rtx)
14450 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14451 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14452 && REG_P (SET_DEST (prev_set))
14453 && REGNO (SET_DEST (prev_set))
14454 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14456 /* Fuse ALU operations followed by conditional branch instruction. */
14457 switch (get_attr_type (prev))
14459 case TYPE_ALU_IMM:
14460 case TYPE_ALU_SREG:
14461 case TYPE_ADC_REG:
14462 case TYPE_ADC_IMM:
14463 case TYPE_ADCS_REG:
14464 case TYPE_ADCS_IMM:
14465 case TYPE_LOGIC_REG:
14466 case TYPE_LOGIC_IMM:
14467 case TYPE_CSEL:
14468 case TYPE_ADR:
14469 case TYPE_MOV_IMM:
14470 case TYPE_SHIFT_REG:
14471 case TYPE_SHIFT_IMM:
14472 case TYPE_BFM:
14473 case TYPE_RBIT:
14474 case TYPE_REV:
14475 case TYPE_EXTEND:
14476 return true;
14478 default:;
14483 return false;
14486 /* Return true iff the instruction fusion described by OP is enabled. */
14488 bool
14489 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14491 return (aarch64_tune_params.fusible_ops & op) != 0;
14494 /* If MEM is in the form of [base+offset], extract the two parts
14495 of address and set to BASE and OFFSET, otherwise return false
14496 after clearing BASE and OFFSET. */
14498 bool
14499 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14501 rtx addr;
14503 gcc_assert (MEM_P (mem));
14505 addr = XEXP (mem, 0);
14507 if (REG_P (addr))
14509 *base = addr;
14510 *offset = const0_rtx;
14511 return true;
14514 if (GET_CODE (addr) == PLUS
14515 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14517 *base = XEXP (addr, 0);
14518 *offset = XEXP (addr, 1);
14519 return true;
14522 *base = NULL_RTX;
14523 *offset = NULL_RTX;
14525 return false;
14528 /* Types for scheduling fusion. */
14529 enum sched_fusion_type
14531 SCHED_FUSION_NONE = 0,
14532 SCHED_FUSION_LD_SIGN_EXTEND,
14533 SCHED_FUSION_LD_ZERO_EXTEND,
14534 SCHED_FUSION_LD,
14535 SCHED_FUSION_ST,
14536 SCHED_FUSION_NUM
14539 /* If INSN is a load or store of address in the form of [base+offset],
14540 extract the two parts and set to BASE and OFFSET. Return scheduling
14541 fusion type this INSN is. */
14543 static enum sched_fusion_type
14544 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14546 rtx x, dest, src;
14547 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14549 gcc_assert (INSN_P (insn));
14550 x = PATTERN (insn);
14551 if (GET_CODE (x) != SET)
14552 return SCHED_FUSION_NONE;
14554 src = SET_SRC (x);
14555 dest = SET_DEST (x);
14557 machine_mode dest_mode = GET_MODE (dest);
14559 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14560 return SCHED_FUSION_NONE;
14562 if (GET_CODE (src) == SIGN_EXTEND)
14564 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14565 src = XEXP (src, 0);
14566 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14567 return SCHED_FUSION_NONE;
14569 else if (GET_CODE (src) == ZERO_EXTEND)
14571 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14572 src = XEXP (src, 0);
14573 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14574 return SCHED_FUSION_NONE;
14577 if (GET_CODE (src) == MEM && REG_P (dest))
14578 extract_base_offset_in_addr (src, base, offset);
14579 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14581 fusion = SCHED_FUSION_ST;
14582 extract_base_offset_in_addr (dest, base, offset);
14584 else
14585 return SCHED_FUSION_NONE;
14587 if (*base == NULL_RTX || *offset == NULL_RTX)
14588 fusion = SCHED_FUSION_NONE;
14590 return fusion;
14593 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14595 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14596 and PRI are only calculated for these instructions. For other instruction,
14597 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14598 type instruction fusion can be added by returning different priorities.
14600 It's important that irrelevant instructions get the largest FUSION_PRI. */
14602 static void
14603 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14604 int *fusion_pri, int *pri)
14606 int tmp, off_val;
14607 rtx base, offset;
14608 enum sched_fusion_type fusion;
14610 gcc_assert (INSN_P (insn));
14612 tmp = max_pri - 1;
14613 fusion = fusion_load_store (insn, &base, &offset);
14614 if (fusion == SCHED_FUSION_NONE)
14616 *pri = tmp;
14617 *fusion_pri = tmp;
14618 return;
14621 /* Set FUSION_PRI according to fusion type and base register. */
14622 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14624 /* Calculate PRI. */
14625 tmp /= 2;
14627 /* INSN with smaller offset goes first. */
14628 off_val = (int)(INTVAL (offset));
14629 if (off_val >= 0)
14630 tmp -= (off_val & 0xfffff);
14631 else
14632 tmp += ((- off_val) & 0xfffff);
14634 *pri = tmp;
14635 return;
14638 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14639 Adjust priority of sha1h instructions so they are scheduled before
14640 other SHA1 instructions. */
14642 static int
14643 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14645 rtx x = PATTERN (insn);
14647 if (GET_CODE (x) == SET)
14649 x = SET_SRC (x);
14651 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14652 return priority + 10;
14655 return priority;
14658 /* Given OPERANDS of consecutive load/store, check if we can merge
14659 them into ldp/stp. LOAD is true if they are load instructions.
14660 MODE is the mode of memory operands. */
14662 bool
14663 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14664 machine_mode mode)
14666 HOST_WIDE_INT offval_1, offval_2, msize;
14667 enum reg_class rclass_1, rclass_2;
14668 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14670 if (load)
14672 mem_1 = operands[1];
14673 mem_2 = operands[3];
14674 reg_1 = operands[0];
14675 reg_2 = operands[2];
14676 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14677 if (REGNO (reg_1) == REGNO (reg_2))
14678 return false;
14680 else
14682 mem_1 = operands[0];
14683 mem_2 = operands[2];
14684 reg_1 = operands[1];
14685 reg_2 = operands[3];
14688 /* The mems cannot be volatile. */
14689 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14690 return false;
14692 /* If we have SImode and slow unaligned ldp,
14693 check the alignment to be at least 8 byte. */
14694 if (mode == SImode
14695 && (aarch64_tune_params.extra_tuning_flags
14696 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14697 && !optimize_size
14698 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14699 return false;
14701 /* Check if the addresses are in the form of [base+offset]. */
14702 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14703 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14704 return false;
14705 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14706 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14707 return false;
14709 /* Check if the bases are same. */
14710 if (!rtx_equal_p (base_1, base_2))
14711 return false;
14713 offval_1 = INTVAL (offset_1);
14714 offval_2 = INTVAL (offset_2);
14715 msize = GET_MODE_SIZE (mode);
14716 /* Check if the offsets are consecutive. */
14717 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14718 return false;
14720 /* Check if the addresses are clobbered by load. */
14721 if (load)
14723 if (reg_mentioned_p (reg_1, mem_1))
14724 return false;
14726 /* In increasing order, the last load can clobber the address. */
14727 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14728 return false;
14731 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14732 rclass_1 = FP_REGS;
14733 else
14734 rclass_1 = GENERAL_REGS;
14736 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14737 rclass_2 = FP_REGS;
14738 else
14739 rclass_2 = GENERAL_REGS;
14741 /* Check if the registers are of same class. */
14742 if (rclass_1 != rclass_2)
14743 return false;
14745 return true;
14748 /* Given OPERANDS of consecutive load/store, check if we can merge
14749 them into ldp/stp by adjusting the offset. LOAD is true if they
14750 are load instructions. MODE is the mode of memory operands.
14752 Given below consecutive stores:
14754 str w1, [xb, 0x100]
14755 str w1, [xb, 0x104]
14756 str w1, [xb, 0x108]
14757 str w1, [xb, 0x10c]
14759 Though the offsets are out of the range supported by stp, we can
14760 still pair them after adjusting the offset, like:
14762 add scratch, xb, 0x100
14763 stp w1, w1, [scratch]
14764 stp w1, w1, [scratch, 0x8]
14766 The peephole patterns detecting this opportunity should guarantee
14767 the scratch register is avaliable. */
14769 bool
14770 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14771 machine_mode mode)
14773 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14774 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14775 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14776 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14778 if (load)
14780 reg_1 = operands[0];
14781 mem_1 = operands[1];
14782 reg_2 = operands[2];
14783 mem_2 = operands[3];
14784 reg_3 = operands[4];
14785 mem_3 = operands[5];
14786 reg_4 = operands[6];
14787 mem_4 = operands[7];
14788 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14789 && REG_P (reg_3) && REG_P (reg_4));
14790 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14791 return false;
14793 else
14795 mem_1 = operands[0];
14796 reg_1 = operands[1];
14797 mem_2 = operands[2];
14798 reg_2 = operands[3];
14799 mem_3 = operands[4];
14800 reg_3 = operands[5];
14801 mem_4 = operands[6];
14802 reg_4 = operands[7];
14804 /* Skip if memory operand is by itslef valid for ldp/stp. */
14805 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14806 return false;
14808 /* The mems cannot be volatile. */
14809 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14810 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14811 return false;
14813 /* Check if the addresses are in the form of [base+offset]. */
14814 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14815 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14816 return false;
14817 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14818 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14819 return false;
14820 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14821 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14822 return false;
14823 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14824 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14825 return false;
14827 /* Check if the bases are same. */
14828 if (!rtx_equal_p (base_1, base_2)
14829 || !rtx_equal_p (base_2, base_3)
14830 || !rtx_equal_p (base_3, base_4))
14831 return false;
14833 offval_1 = INTVAL (offset_1);
14834 offval_2 = INTVAL (offset_2);
14835 offval_3 = INTVAL (offset_3);
14836 offval_4 = INTVAL (offset_4);
14837 msize = GET_MODE_SIZE (mode);
14838 /* Check if the offsets are consecutive. */
14839 if ((offval_1 != (offval_2 + msize)
14840 || offval_1 != (offval_3 + msize * 2)
14841 || offval_1 != (offval_4 + msize * 3))
14842 && (offval_4 != (offval_3 + msize)
14843 || offval_4 != (offval_2 + msize * 2)
14844 || offval_4 != (offval_1 + msize * 3)))
14845 return false;
14847 /* Check if the addresses are clobbered by load. */
14848 if (load)
14850 if (reg_mentioned_p (reg_1, mem_1)
14851 || reg_mentioned_p (reg_2, mem_2)
14852 || reg_mentioned_p (reg_3, mem_3))
14853 return false;
14855 /* In increasing order, the last load can clobber the address. */
14856 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14857 return false;
14860 /* If we have SImode and slow unaligned ldp,
14861 check the alignment to be at least 8 byte. */
14862 if (mode == SImode
14863 && (aarch64_tune_params.extra_tuning_flags
14864 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14865 && !optimize_size
14866 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14867 return false;
14869 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14870 rclass_1 = FP_REGS;
14871 else
14872 rclass_1 = GENERAL_REGS;
14874 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14875 rclass_2 = FP_REGS;
14876 else
14877 rclass_2 = GENERAL_REGS;
14879 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14880 rclass_3 = FP_REGS;
14881 else
14882 rclass_3 = GENERAL_REGS;
14884 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14885 rclass_4 = FP_REGS;
14886 else
14887 rclass_4 = GENERAL_REGS;
14889 /* Check if the registers are of same class. */
14890 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14891 return false;
14893 return true;
14896 /* Given OPERANDS of consecutive load/store, this function pairs them
14897 into ldp/stp after adjusting the offset. It depends on the fact
14898 that addresses of load/store instructions are in increasing order.
14899 MODE is the mode of memory operands. CODE is the rtl operator
14900 which should be applied to all memory operands, it's SIGN_EXTEND,
14901 ZERO_EXTEND or UNKNOWN. */
14903 bool
14904 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14905 machine_mode mode, RTX_CODE code)
14907 rtx base, offset, t1, t2;
14908 rtx mem_1, mem_2, mem_3, mem_4;
14909 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14911 if (load)
14913 mem_1 = operands[1];
14914 mem_2 = operands[3];
14915 mem_3 = operands[5];
14916 mem_4 = operands[7];
14918 else
14920 mem_1 = operands[0];
14921 mem_2 = operands[2];
14922 mem_3 = operands[4];
14923 mem_4 = operands[6];
14924 gcc_assert (code == UNKNOWN);
14927 extract_base_offset_in_addr (mem_1, &base, &offset);
14928 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14930 /* Adjust offset thus it can fit in ldp/stp instruction. */
14931 msize = GET_MODE_SIZE (mode);
14932 stp_off_limit = msize * 0x40;
14933 off_val = INTVAL (offset);
14934 abs_off = (off_val < 0) ? -off_val : off_val;
14935 new_off = abs_off % stp_off_limit;
14936 adj_off = abs_off - new_off;
14938 /* Further adjust to make sure all offsets are OK. */
14939 if ((new_off + msize * 2) >= stp_off_limit)
14941 adj_off += stp_off_limit;
14942 new_off -= stp_off_limit;
14945 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14946 if (adj_off >= 0x1000)
14947 return false;
14949 if (off_val < 0)
14951 adj_off = -adj_off;
14952 new_off = -new_off;
14955 /* Create new memory references. */
14956 mem_1 = change_address (mem_1, VOIDmode,
14957 plus_constant (DImode, operands[8], new_off));
14959 /* Check if the adjusted address is OK for ldp/stp. */
14960 if (!aarch64_mem_pair_operand (mem_1, mode))
14961 return false;
14963 msize = GET_MODE_SIZE (mode);
14964 mem_2 = change_address (mem_2, VOIDmode,
14965 plus_constant (DImode,
14966 operands[8],
14967 new_off + msize));
14968 mem_3 = change_address (mem_3, VOIDmode,
14969 plus_constant (DImode,
14970 operands[8],
14971 new_off + msize * 2));
14972 mem_4 = change_address (mem_4, VOIDmode,
14973 plus_constant (DImode,
14974 operands[8],
14975 new_off + msize * 3));
14977 if (code == ZERO_EXTEND)
14979 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14980 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14981 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14982 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14984 else if (code == SIGN_EXTEND)
14986 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14987 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14988 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14989 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14992 if (load)
14994 operands[1] = mem_1;
14995 operands[3] = mem_2;
14996 operands[5] = mem_3;
14997 operands[7] = mem_4;
14999 else
15001 operands[0] = mem_1;
15002 operands[2] = mem_2;
15003 operands[4] = mem_3;
15004 operands[6] = mem_4;
15007 /* Emit adjusting instruction. */
15008 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15009 /* Emit ldp/stp instructions. */
15010 t1 = gen_rtx_SET (operands[0], operands[1]);
15011 t2 = gen_rtx_SET (operands[2], operands[3]);
15012 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15013 t1 = gen_rtx_SET (operands[4], operands[5]);
15014 t2 = gen_rtx_SET (operands[6], operands[7]);
15015 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15016 return true;
15019 /* Return 1 if pseudo register should be created and used to hold
15020 GOT address for PIC code. */
15022 bool
15023 aarch64_use_pseudo_pic_reg (void)
15025 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15028 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15030 static int
15031 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15033 switch (XINT (x, 1))
15035 case UNSPEC_GOTSMALLPIC:
15036 case UNSPEC_GOTSMALLPIC28K:
15037 case UNSPEC_GOTTINYPIC:
15038 return 0;
15039 default:
15040 break;
15043 return default_unspec_may_trap_p (x, flags);
15047 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15048 return the log2 of that value. Otherwise return -1. */
15051 aarch64_fpconst_pow_of_2 (rtx x)
15053 const REAL_VALUE_TYPE *r;
15055 if (!CONST_DOUBLE_P (x))
15056 return -1;
15058 r = CONST_DOUBLE_REAL_VALUE (x);
15060 if (REAL_VALUE_NEGATIVE (*r)
15061 || REAL_VALUE_ISNAN (*r)
15062 || REAL_VALUE_ISINF (*r)
15063 || !real_isinteger (r, DFmode))
15064 return -1;
15066 return exact_log2 (real_to_integer (r));
15069 /* If X is a vector of equal CONST_DOUBLE values and that value is
15070 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15073 aarch64_vec_fpconst_pow_of_2 (rtx x)
15075 if (GET_CODE (x) != CONST_VECTOR)
15076 return -1;
15078 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15079 return -1;
15081 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15082 if (firstval <= 0)
15083 return -1;
15085 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15086 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15087 return -1;
15089 return firstval;
15092 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15093 to float.
15095 __fp16 always promotes through this hook.
15096 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15097 through the generic excess precision logic rather than here. */
15099 static tree
15100 aarch64_promoted_type (const_tree t)
15102 if (SCALAR_FLOAT_TYPE_P (t)
15103 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15104 return float_type_node;
15106 return NULL_TREE;
15109 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15111 static bool
15112 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15113 optimization_type opt_type)
15115 switch (op)
15117 case rsqrt_optab:
15118 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15120 default:
15121 return true;
15125 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15126 if MODE is HFmode, and punt to the generic implementation otherwise. */
15128 static bool
15129 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
15131 return (mode == HFmode
15132 ? true
15133 : default_libgcc_floating_mode_supported_p (mode));
15136 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15137 if MODE is HFmode, and punt to the generic implementation otherwise. */
15139 static bool
15140 aarch64_scalar_mode_supported_p (machine_mode mode)
15142 return (mode == HFmode
15143 ? true
15144 : default_scalar_mode_supported_p (mode));
15147 /* Set the value of FLT_EVAL_METHOD.
15148 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15150 0: evaluate all operations and constants, whose semantic type has at
15151 most the range and precision of type float, to the range and
15152 precision of float; evaluate all other operations and constants to
15153 the range and precision of the semantic type;
15155 N, where _FloatN is a supported interchange floating type
15156 evaluate all operations and constants, whose semantic type has at
15157 most the range and precision of _FloatN type, to the range and
15158 precision of the _FloatN type; evaluate all other operations and
15159 constants to the range and precision of the semantic type;
15161 If we have the ARMv8.2-A extensions then we support _Float16 in native
15162 precision, so we should set this to 16. Otherwise, we support the type,
15163 but want to evaluate expressions in float precision, so set this to
15164 0. */
15166 static enum flt_eval_method
15167 aarch64_excess_precision (enum excess_precision_type type)
15169 switch (type)
15171 case EXCESS_PRECISION_TYPE_FAST:
15172 case EXCESS_PRECISION_TYPE_STANDARD:
15173 /* We can calculate either in 16-bit range and precision or
15174 32-bit range and precision. Make that decision based on whether
15175 we have native support for the ARMv8.2-A 16-bit floating-point
15176 instructions or not. */
15177 return (TARGET_FP_F16INST
15178 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15179 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15180 case EXCESS_PRECISION_TYPE_IMPLICIT:
15181 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15182 default:
15183 gcc_unreachable ();
15185 return FLT_EVAL_METHOD_UNPREDICTABLE;
15188 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15189 scheduled for speculative execution. Reject the long-running division
15190 and square-root instructions. */
15192 static bool
15193 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15195 switch (get_attr_type (insn))
15197 case TYPE_SDIV:
15198 case TYPE_UDIV:
15199 case TYPE_FDIVS:
15200 case TYPE_FDIVD:
15201 case TYPE_FSQRTS:
15202 case TYPE_FSQRTD:
15203 case TYPE_NEON_FP_SQRT_S:
15204 case TYPE_NEON_FP_SQRT_D:
15205 case TYPE_NEON_FP_SQRT_S_Q:
15206 case TYPE_NEON_FP_SQRT_D_Q:
15207 case TYPE_NEON_FP_DIV_S:
15208 case TYPE_NEON_FP_DIV_D:
15209 case TYPE_NEON_FP_DIV_S_Q:
15210 case TYPE_NEON_FP_DIV_D_Q:
15211 return false;
15212 default:
15213 return true;
15217 /* Target-specific selftests. */
15219 #if CHECKING_P
15221 namespace selftest {
15223 /* Selftest for the RTL loader.
15224 Verify that the RTL loader copes with a dump from
15225 print_rtx_function. This is essentially just a test that class
15226 function_reader can handle a real dump, but it also verifies
15227 that lookup_reg_by_dump_name correctly handles hard regs.
15228 The presence of hard reg names in the dump means that the test is
15229 target-specific, hence it is in this file. */
15231 static void
15232 aarch64_test_loading_full_dump ()
15234 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15236 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15238 rtx_insn *insn_1 = get_insn_by_uid (1);
15239 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15241 rtx_insn *insn_15 = get_insn_by_uid (15);
15242 ASSERT_EQ (INSN, GET_CODE (insn_15));
15243 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15245 /* Verify crtl->return_rtx. */
15246 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15247 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15248 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15251 /* Run all target-specific selftests. */
15253 static void
15254 aarch64_run_selftests (void)
15256 aarch64_test_loading_full_dump ();
15259 } // namespace selftest
15261 #endif /* #if CHECKING_P */
15263 #undef TARGET_ADDRESS_COST
15264 #define TARGET_ADDRESS_COST aarch64_address_cost
15266 /* This hook will determines whether unnamed bitfields affect the alignment
15267 of the containing structure. The hook returns true if the structure
15268 should inherit the alignment requirements of an unnamed bitfield's
15269 type. */
15270 #undef TARGET_ALIGN_ANON_BITFIELD
15271 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15273 #undef TARGET_ASM_ALIGNED_DI_OP
15274 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15276 #undef TARGET_ASM_ALIGNED_HI_OP
15277 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15279 #undef TARGET_ASM_ALIGNED_SI_OP
15280 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15282 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15283 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15284 hook_bool_const_tree_hwi_hwi_const_tree_true
15286 #undef TARGET_ASM_FILE_START
15287 #define TARGET_ASM_FILE_START aarch64_start_file
15289 #undef TARGET_ASM_OUTPUT_MI_THUNK
15290 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15292 #undef TARGET_ASM_SELECT_RTX_SECTION
15293 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15295 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15296 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15298 #undef TARGET_BUILD_BUILTIN_VA_LIST
15299 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15301 #undef TARGET_CALLEE_COPIES
15302 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15304 #undef TARGET_CAN_ELIMINATE
15305 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15307 #undef TARGET_CAN_INLINE_P
15308 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15310 #undef TARGET_CANNOT_FORCE_CONST_MEM
15311 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15313 #undef TARGET_CASE_VALUES_THRESHOLD
15314 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15316 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15317 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15319 /* Only the least significant bit is used for initialization guard
15320 variables. */
15321 #undef TARGET_CXX_GUARD_MASK_BIT
15322 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15324 #undef TARGET_C_MODE_FOR_SUFFIX
15325 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15327 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15328 #undef TARGET_DEFAULT_TARGET_FLAGS
15329 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15330 #endif
15332 #undef TARGET_CLASS_MAX_NREGS
15333 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15335 #undef TARGET_BUILTIN_DECL
15336 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15338 #undef TARGET_BUILTIN_RECIPROCAL
15339 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15341 #undef TARGET_C_EXCESS_PRECISION
15342 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15344 #undef TARGET_EXPAND_BUILTIN
15345 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15347 #undef TARGET_EXPAND_BUILTIN_VA_START
15348 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15350 #undef TARGET_FOLD_BUILTIN
15351 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15353 #undef TARGET_FUNCTION_ARG
15354 #define TARGET_FUNCTION_ARG aarch64_function_arg
15356 #undef TARGET_FUNCTION_ARG_ADVANCE
15357 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15359 #undef TARGET_FUNCTION_ARG_BOUNDARY
15360 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15362 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15363 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15365 #undef TARGET_FUNCTION_VALUE
15366 #define TARGET_FUNCTION_VALUE aarch64_function_value
15368 #undef TARGET_FUNCTION_VALUE_REGNO_P
15369 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15371 #undef TARGET_FRAME_POINTER_REQUIRED
15372 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15374 #undef TARGET_GIMPLE_FOLD_BUILTIN
15375 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15377 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15378 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15380 #undef TARGET_INIT_BUILTINS
15381 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15383 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15384 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15385 aarch64_ira_change_pseudo_allocno_class
15387 #undef TARGET_LEGITIMATE_ADDRESS_P
15388 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15390 #undef TARGET_LEGITIMATE_CONSTANT_P
15391 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15393 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15394 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15395 aarch64_legitimize_address_displacement
15397 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15398 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15400 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15401 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15402 aarch64_libgcc_floating_mode_supported_p
15404 #undef TARGET_MANGLE_TYPE
15405 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15407 #undef TARGET_MEMORY_MOVE_COST
15408 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15410 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15411 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15413 #undef TARGET_MUST_PASS_IN_STACK
15414 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15416 /* This target hook should return true if accesses to volatile bitfields
15417 should use the narrowest mode possible. It should return false if these
15418 accesses should use the bitfield container type. */
15419 #undef TARGET_NARROW_VOLATILE_BITFIELD
15420 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15422 #undef TARGET_OPTION_OVERRIDE
15423 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15425 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15426 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15427 aarch64_override_options_after_change
15429 #undef TARGET_OPTION_SAVE
15430 #define TARGET_OPTION_SAVE aarch64_option_save
15432 #undef TARGET_OPTION_RESTORE
15433 #define TARGET_OPTION_RESTORE aarch64_option_restore
15435 #undef TARGET_OPTION_PRINT
15436 #define TARGET_OPTION_PRINT aarch64_option_print
15438 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15439 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15441 #undef TARGET_SET_CURRENT_FUNCTION
15442 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15444 #undef TARGET_PASS_BY_REFERENCE
15445 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15447 #undef TARGET_PREFERRED_RELOAD_CLASS
15448 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15450 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15451 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15453 #undef TARGET_PROMOTED_TYPE
15454 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15456 #undef TARGET_SECONDARY_RELOAD
15457 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15459 #undef TARGET_SHIFT_TRUNCATION_MASK
15460 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15462 #undef TARGET_SETUP_INCOMING_VARARGS
15463 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15465 #undef TARGET_STRUCT_VALUE_RTX
15466 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15468 #undef TARGET_REGISTER_MOVE_COST
15469 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15471 #undef TARGET_RETURN_IN_MEMORY
15472 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15474 #undef TARGET_RETURN_IN_MSB
15475 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15477 #undef TARGET_RTX_COSTS
15478 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15480 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15481 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15483 #undef TARGET_SCHED_ISSUE_RATE
15484 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15486 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15487 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15488 aarch64_sched_first_cycle_multipass_dfa_lookahead
15490 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15491 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15492 aarch64_first_cycle_multipass_dfa_lookahead_guard
15494 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15495 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15496 aarch64_get_separate_components
15498 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15499 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15500 aarch64_components_for_bb
15502 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15503 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15504 aarch64_disqualify_components
15506 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15507 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15508 aarch64_emit_prologue_components
15510 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15511 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15512 aarch64_emit_epilogue_components
15514 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15515 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15516 aarch64_set_handled_components
15518 #undef TARGET_TRAMPOLINE_INIT
15519 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15521 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15522 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15524 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15525 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15527 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15528 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15529 aarch64_builtin_support_vector_misalignment
15531 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15532 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15534 #undef TARGET_VECTORIZE_ADD_STMT_COST
15535 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15537 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15538 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15539 aarch64_builtin_vectorization_cost
15541 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15542 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15544 #undef TARGET_VECTORIZE_BUILTINS
15545 #define TARGET_VECTORIZE_BUILTINS
15547 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15548 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15549 aarch64_builtin_vectorized_function
15551 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15552 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15553 aarch64_autovectorize_vector_sizes
15555 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15556 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15557 aarch64_atomic_assign_expand_fenv
15559 /* Section anchor support. */
15561 #undef TARGET_MIN_ANCHOR_OFFSET
15562 #define TARGET_MIN_ANCHOR_OFFSET -256
15564 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15565 byte offset; we can do much more for larger data types, but have no way
15566 to determine the size of the access. We assume accesses are aligned. */
15567 #undef TARGET_MAX_ANCHOR_OFFSET
15568 #define TARGET_MAX_ANCHOR_OFFSET 4095
15570 #undef TARGET_VECTOR_ALIGNMENT
15571 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15573 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15574 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15575 aarch64_simd_vector_alignment_reachable
15577 /* vec_perm support. */
15579 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15580 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15581 aarch64_vectorize_vec_perm_const_ok
15583 #undef TARGET_INIT_LIBFUNCS
15584 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15586 #undef TARGET_FIXED_CONDITION_CODE_REGS
15587 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15589 #undef TARGET_FLAGS_REGNUM
15590 #define TARGET_FLAGS_REGNUM CC_REGNUM
15592 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15593 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15595 #undef TARGET_ASAN_SHADOW_OFFSET
15596 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15598 #undef TARGET_LEGITIMIZE_ADDRESS
15599 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15601 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15602 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15603 aarch64_use_by_pieces_infrastructure_p
15605 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15606 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15608 #undef TARGET_CAN_USE_DOLOOP_P
15609 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15611 #undef TARGET_SCHED_ADJUST_PRIORITY
15612 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15614 #undef TARGET_SCHED_MACRO_FUSION_P
15615 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15617 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15618 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15620 #undef TARGET_SCHED_FUSION_PRIORITY
15621 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15623 #undef TARGET_UNSPEC_MAY_TRAP_P
15624 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15626 #undef TARGET_USE_PSEUDO_PIC_REG
15627 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15629 #undef TARGET_PRINT_OPERAND
15630 #define TARGET_PRINT_OPERAND aarch64_print_operand
15632 #undef TARGET_PRINT_OPERAND_ADDRESS
15633 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15635 #undef TARGET_OPTAB_SUPPORTED_P
15636 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15638 #undef TARGET_OMIT_STRUCT_RETURN_REG
15639 #define TARGET_OMIT_STRUCT_RETURN_REG true
15641 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15642 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15643 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15645 #if CHECKING_P
15646 #undef TARGET_RUN_TARGET_SELFTESTS
15647 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15648 #endif /* #if CHECKING_P */
15650 struct gcc_target targetm = TARGET_INITIALIZER;
15652 #include "gt-aarch64.h"