[AArch64] Set jump-align=4 for neoversen1
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blobe40750380cce202473da3cf572ebdbc28a4ecc06
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
104 union
106 /* For MOV and MVN. */
107 struct
109 /* The value of each element. */
110 rtx value;
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
118 /* For INDEX. */
119 struct
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
163 u.index.base = base_in;
164 u.index.step = step_in;
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
174 u.pattern = pattern_in;
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
232 const char* name;
233 unsigned int flag;
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
256 /* Tuning parameters. */
258 static const struct cpu_addrcost_table generic_addrcost_table =
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
354 static const struct cpu_regmove_cost generic_regmove_cost =
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
500 static const struct cpu_vector_cost tsv110_vector_cost =
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
539 static const struct cpu_vector_cost exynosm1_vector_cost =
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
601 1, /* Predictable. */
602 3 /* Unpredictable. */
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
718 static const struct tune_params generic_tunings =
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
744 static const struct tune_params cortexa35_tunings =
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
771 static const struct tune_params cortexa53_tunings =
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
798 static const struct tune_params cortexa57_tunings =
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
825 static const struct tune_params cortexa72_tunings =
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
852 static const struct tune_params cortexa73_tunings =
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
881 static const struct tune_params exynosm1_tunings =
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
907 static const struct tune_params thunderxt88_tunings =
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
933 static const struct tune_params thunderx_tunings =
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
960 static const struct tune_params tsv110_tunings =
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
987 static const struct tune_params xgene1_tunings =
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1013 static const struct tune_params emag_tunings =
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1039 static const struct tune_params qdf24xx_tunings =
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1095 static const struct tune_params thunderx2t99_tunings =
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1122 static const struct tune_params neoversen1_tunings =
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "4", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1168 /* A processor implementing AArch64. */
1169 struct processor
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1216 static tree
1217 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1218 int, bool *no_add_attrs)
1220 /* Since we set fn_type_req to true, the caller should have checked
1221 this for us. */
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1223 switch ((arm_pcs) fntype_abi (*node).id ())
1225 case ARM_PCS_AAPCS64:
1226 case ARM_PCS_SIMD:
1227 return NULL_TREE;
1229 case ARM_PCS_SVE:
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1231 name);
1232 *no_add_attrs = true;
1233 return NULL_TREE;
1235 case ARM_PCS_TLSDESC:
1236 case ARM_PCS_UNKNOWN:
1237 break;
1239 gcc_unreachable ();
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table[] =
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute, NULL },
1249 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1250 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1253 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1255 /* An ISA extension in the co-processor and main instruction set space. */
1256 struct aarch64_option_extension
1258 const char *const name;
1259 const unsigned long flags_on;
1260 const unsigned long flags_off;
1263 typedef enum aarch64_cond_code
1265 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1266 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1267 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1269 aarch64_cc;
1271 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1273 struct aarch64_branch_protect_type
1275 /* The type's name that the user passes to the branch-protection option
1276 string. */
1277 const char* name;
1278 /* Function to handle the protection type and set global variables.
1279 First argument is the string token corresponding with this type and the
1280 second argument is the next token in the option string.
1281 Return values:
1282 * AARCH64_PARSE_OK: Handling was sucessful.
1283 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1284 should print an error.
1285 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1286 own error. */
1287 enum aarch64_parse_opt_result (*handler)(char*, char*);
1288 /* A list of types that can follow this type in the option string. */
1289 const aarch64_branch_protect_type* subtypes;
1290 unsigned int num_subtypes;
1293 static enum aarch64_parse_opt_result
1294 aarch64_handle_no_branch_protection (char* str, char* rest)
1296 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1297 aarch64_enable_bti = 0;
1298 if (rest)
1300 error ("unexpected %<%s%> after %<%s%>", rest, str);
1301 return AARCH64_PARSE_INVALID_FEATURE;
1303 return AARCH64_PARSE_OK;
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_standard_branch_protection (char* str, char* rest)
1309 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1310 aarch64_ra_sign_key = AARCH64_KEY_A;
1311 aarch64_enable_bti = 1;
1312 if (rest)
1314 error ("unexpected %<%s%> after %<%s%>", rest, str);
1315 return AARCH64_PARSE_INVALID_FEATURE;
1317 return AARCH64_PARSE_OK;
1320 static enum aarch64_parse_opt_result
1321 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1322 char* rest ATTRIBUTE_UNUSED)
1324 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1325 aarch64_ra_sign_key = AARCH64_KEY_A;
1326 return AARCH64_PARSE_OK;
1329 static enum aarch64_parse_opt_result
1330 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1331 char* rest ATTRIBUTE_UNUSED)
1333 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1334 return AARCH64_PARSE_OK;
1337 static enum aarch64_parse_opt_result
1338 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1339 char* rest ATTRIBUTE_UNUSED)
1341 aarch64_ra_sign_key = AARCH64_KEY_B;
1342 return AARCH64_PARSE_OK;
1345 static enum aarch64_parse_opt_result
1346 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1347 char* rest ATTRIBUTE_UNUSED)
1349 aarch64_enable_bti = 1;
1350 return AARCH64_PARSE_OK;
1353 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1354 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1355 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1356 { NULL, NULL, NULL, 0 }
1359 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1360 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1361 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1362 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1363 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1364 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1365 { NULL, NULL, NULL, 0 }
1368 /* The condition codes of the processor, and the inverse function. */
1369 static const char * const aarch64_condition_codes[] =
1371 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1372 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1375 /* The preferred condition codes for SVE conditions. */
1376 static const char *const aarch64_sve_condition_codes[] =
1378 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1379 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1382 /* Return the assembly token for svpattern value VALUE. */
1384 static const char *
1385 svpattern_token (enum aarch64_svpattern pattern)
1387 switch (pattern)
1389 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1390 AARCH64_FOR_SVPATTERN (CASE)
1391 #undef CASE
1392 case AARCH64_NUM_SVPATTERNS:
1393 break;
1395 gcc_unreachable ();
1398 /* Return the descriptor of the SIMD ABI. */
1400 static const predefined_function_abi &
1401 aarch64_simd_abi (void)
1403 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1404 if (!simd_abi.initialized_p ())
1406 HARD_REG_SET full_reg_clobbers
1407 = default_function_abi.full_reg_clobbers ();
1408 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1409 if (FP_SIMD_SAVED_REGNUM_P (regno))
1410 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1411 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1413 return simd_abi;
1416 /* Return the descriptor of the SVE PCS. */
1418 static const predefined_function_abi &
1419 aarch64_sve_abi (void)
1421 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1422 if (!sve_abi.initialized_p ())
1424 HARD_REG_SET full_reg_clobbers
1425 = default_function_abi.full_reg_clobbers ();
1426 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1427 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1428 for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
1429 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1430 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1432 return sve_abi;
1435 /* Generate code to enable conditional branches in functions over 1 MiB. */
1436 const char *
1437 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1438 const char * branch_format)
1440 rtx_code_label * tmp_label = gen_label_rtx ();
1441 char label_buf[256];
1442 char buffer[128];
1443 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1444 CODE_LABEL_NUMBER (tmp_label));
1445 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1446 rtx dest_label = operands[pos_label];
1447 operands[pos_label] = tmp_label;
1449 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1450 output_asm_insn (buffer, operands);
1452 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1453 operands[pos_label] = dest_label;
1454 output_asm_insn (buffer, operands);
1455 return "";
1458 void
1459 aarch64_err_no_fpadvsimd (machine_mode mode)
1461 if (TARGET_GENERAL_REGS_ONLY)
1462 if (FLOAT_MODE_P (mode))
1463 error ("%qs is incompatible with the use of floating-point types",
1464 "-mgeneral-regs-only");
1465 else
1466 error ("%qs is incompatible with the use of vector types",
1467 "-mgeneral-regs-only");
1468 else
1469 if (FLOAT_MODE_P (mode))
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " floating-point types", "+nofp");
1472 else
1473 error ("%qs feature modifier is incompatible with the use of"
1474 " vector types", "+nofp");
1477 /* Report when we try to do something that requires SVE when SVE is disabled.
1478 This is an error of last resort and isn't very high-quality. It usually
1479 involves attempts to measure the vector length in some way. */
1480 static void
1481 aarch64_report_sve_required (void)
1483 static bool reported_p = false;
1485 /* Avoid reporting a slew of messages for a single oversight. */
1486 if (reported_p)
1487 return;
1489 error ("this operation requires the SVE ISA extension");
1490 inform (input_location, "you can enable SVE using the command-line"
1491 " option %<-march%>, or by using the %<target%>"
1492 " attribute or pragma");
1493 reported_p = true;
1496 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1497 registers. */
1498 inline bool
1499 pr_or_ffr_regnum_p (unsigned int regno)
1501 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1504 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1505 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1506 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1507 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1508 and GENERAL_REGS is lower than the memory cost (in this case the best class
1509 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1510 cost results in bad allocations with many redundant int<->FP moves which
1511 are expensive on various cores.
1512 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1513 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1514 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1515 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1516 The result of this is that it is no longer inefficient to have a higher
1517 memory move cost than the register move cost.
1520 static reg_class_t
1521 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1522 reg_class_t best_class)
1524 machine_mode mode;
1526 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1527 || !reg_class_subset_p (FP_REGS, allocno_class))
1528 return allocno_class;
1530 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1531 || !reg_class_subset_p (FP_REGS, best_class))
1532 return best_class;
1534 mode = PSEUDO_REGNO_MODE (regno);
1535 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1538 static unsigned int
1539 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1541 if (GET_MODE_UNIT_SIZE (mode) == 4)
1542 return aarch64_tune_params.min_div_recip_mul_sf;
1543 return aarch64_tune_params.min_div_recip_mul_df;
1546 /* Return the reassociation width of treeop OPC with mode MODE. */
1547 static int
1548 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1550 if (VECTOR_MODE_P (mode))
1551 return aarch64_tune_params.vec_reassoc_width;
1552 if (INTEGRAL_MODE_P (mode))
1553 return aarch64_tune_params.int_reassoc_width;
1554 /* Avoid reassociating floating point addition so we emit more FMAs. */
1555 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1556 return aarch64_tune_params.fp_reassoc_width;
1557 return 1;
1560 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1561 unsigned
1562 aarch64_dbx_register_number (unsigned regno)
1564 if (GP_REGNUM_P (regno))
1565 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1566 else if (regno == SP_REGNUM)
1567 return AARCH64_DWARF_SP;
1568 else if (FP_REGNUM_P (regno))
1569 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1570 else if (PR_REGNUM_P (regno))
1571 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1572 else if (regno == VG_REGNUM)
1573 return AARCH64_DWARF_VG;
1575 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1576 equivalent DWARF register. */
1577 return DWARF_FRAME_REGISTERS;
1580 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1581 integer, otherwise return X unmodified. */
1582 static rtx
1583 aarch64_bit_representation (rtx x)
1585 if (CONST_DOUBLE_P (x))
1586 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1587 return x;
1590 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1591 static bool
1592 aarch64_advsimd_struct_mode_p (machine_mode mode)
1594 return (TARGET_SIMD
1595 && (mode == OImode || mode == CImode || mode == XImode));
1598 /* Return true if MODE is an SVE predicate mode. */
1599 static bool
1600 aarch64_sve_pred_mode_p (machine_mode mode)
1602 return (TARGET_SVE
1603 && (mode == VNx16BImode
1604 || mode == VNx8BImode
1605 || mode == VNx4BImode
1606 || mode == VNx2BImode));
1609 /* Three mutually-exclusive flags describing a vector or predicate type. */
1610 const unsigned int VEC_ADVSIMD = 1;
1611 const unsigned int VEC_SVE_DATA = 2;
1612 const unsigned int VEC_SVE_PRED = 4;
1613 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1614 a structure of 2, 3 or 4 vectors. */
1615 const unsigned int VEC_STRUCT = 8;
1616 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1617 vector has fewer significant bytes than a full SVE vector. */
1618 const unsigned int VEC_PARTIAL = 16;
1619 /* Useful combinations of the above. */
1620 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1621 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1623 /* Return a set of flags describing the vector properties of mode MODE.
1624 Ignore modes that are not supported by the current target. */
1625 static unsigned int
1626 aarch64_classify_vector_mode (machine_mode mode)
1628 if (aarch64_advsimd_struct_mode_p (mode))
1629 return VEC_ADVSIMD | VEC_STRUCT;
1631 if (aarch64_sve_pred_mode_p (mode))
1632 return VEC_SVE_PRED;
1634 /* Make the decision based on the mode's enum value rather than its
1635 properties, so that we keep the correct classification regardless
1636 of -msve-vector-bits. */
1637 switch (mode)
1639 /* Partial SVE QI vectors. */
1640 case E_VNx2QImode:
1641 case E_VNx4QImode:
1642 case E_VNx8QImode:
1643 /* Partial SVE HI vectors. */
1644 case E_VNx2HImode:
1645 case E_VNx4HImode:
1646 /* Partial SVE SI vector. */
1647 case E_VNx2SImode:
1648 /* Partial SVE HF vectors. */
1649 case E_VNx2HFmode:
1650 case E_VNx4HFmode:
1651 /* Partial SVE SF vector. */
1652 case E_VNx2SFmode:
1653 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1655 case E_VNx16QImode:
1656 case E_VNx8HImode:
1657 case E_VNx4SImode:
1658 case E_VNx2DImode:
1659 case E_VNx8HFmode:
1660 case E_VNx4SFmode:
1661 case E_VNx2DFmode:
1662 return TARGET_SVE ? VEC_SVE_DATA : 0;
1664 /* x2 SVE vectors. */
1665 case E_VNx32QImode:
1666 case E_VNx16HImode:
1667 case E_VNx8SImode:
1668 case E_VNx4DImode:
1669 case E_VNx16HFmode:
1670 case E_VNx8SFmode:
1671 case E_VNx4DFmode:
1672 /* x3 SVE vectors. */
1673 case E_VNx48QImode:
1674 case E_VNx24HImode:
1675 case E_VNx12SImode:
1676 case E_VNx6DImode:
1677 case E_VNx24HFmode:
1678 case E_VNx12SFmode:
1679 case E_VNx6DFmode:
1680 /* x4 SVE vectors. */
1681 case E_VNx64QImode:
1682 case E_VNx32HImode:
1683 case E_VNx16SImode:
1684 case E_VNx8DImode:
1685 case E_VNx32HFmode:
1686 case E_VNx16SFmode:
1687 case E_VNx8DFmode:
1688 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1690 /* 64-bit Advanced SIMD vectors. */
1691 case E_V8QImode:
1692 case E_V4HImode:
1693 case E_V2SImode:
1694 /* ...E_V1DImode doesn't exist. */
1695 case E_V4HFmode:
1696 case E_V4BFmode:
1697 case E_V2SFmode:
1698 case E_V1DFmode:
1699 /* 128-bit Advanced SIMD vectors. */
1700 case E_V16QImode:
1701 case E_V8HImode:
1702 case E_V4SImode:
1703 case E_V2DImode:
1704 case E_V8HFmode:
1705 case E_V8BFmode:
1706 case E_V4SFmode:
1707 case E_V2DFmode:
1708 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1710 default:
1711 return 0;
1715 /* Return true if MODE is any of the data vector modes, including
1716 structure modes. */
1717 static bool
1718 aarch64_vector_data_mode_p (machine_mode mode)
1720 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1723 /* Return true if MODE is any form of SVE mode, including predicates,
1724 vectors and structures. */
1725 bool
1726 aarch64_sve_mode_p (machine_mode mode)
1728 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1731 /* Return true if MODE is an SVE data vector mode; either a single vector
1732 or a structure of vectors. */
1733 static bool
1734 aarch64_sve_data_mode_p (machine_mode mode)
1736 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1739 /* Return the number of defined bytes in one constituent vector of
1740 SVE mode MODE, which has vector flags VEC_FLAGS. */
1741 static poly_int64
1742 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1744 if (vec_flags & VEC_PARTIAL)
1745 /* A single partial vector. */
1746 return GET_MODE_SIZE (mode);
1748 if (vec_flags & VEC_SVE_DATA)
1749 /* A single vector or a tuple. */
1750 return BYTES_PER_SVE_VECTOR;
1752 /* A single predicate. */
1753 gcc_assert (vec_flags & VEC_SVE_PRED);
1754 return BYTES_PER_SVE_PRED;
1757 /* Implement target hook TARGET_ARRAY_MODE. */
1758 static opt_machine_mode
1759 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1761 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1762 && IN_RANGE (nelems, 2, 4))
1763 return mode_for_vector (GET_MODE_INNER (mode),
1764 GET_MODE_NUNITS (mode) * nelems);
1766 return opt_machine_mode ();
1769 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1770 static bool
1771 aarch64_array_mode_supported_p (machine_mode mode,
1772 unsigned HOST_WIDE_INT nelems)
1774 if (TARGET_SIMD
1775 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1776 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1777 && (nelems >= 2 && nelems <= 4))
1778 return true;
1780 return false;
1783 /* MODE is some form of SVE vector mode. For data modes, return the number
1784 of vector register bits that each element of MODE occupies, such as 64
1785 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1786 in a 64-bit container). For predicate modes, return the number of
1787 data bits controlled by each significant predicate bit. */
1789 static unsigned int
1790 aarch64_sve_container_bits (machine_mode mode)
1792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1793 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1794 ? BITS_PER_SVE_VECTOR
1795 : GET_MODE_BITSIZE (mode));
1796 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1799 /* Return the SVE predicate mode to use for elements that have
1800 ELEM_NBYTES bytes, if such a mode exists. */
1802 opt_machine_mode
1803 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1805 if (TARGET_SVE)
1807 if (elem_nbytes == 1)
1808 return VNx16BImode;
1809 if (elem_nbytes == 2)
1810 return VNx8BImode;
1811 if (elem_nbytes == 4)
1812 return VNx4BImode;
1813 if (elem_nbytes == 8)
1814 return VNx2BImode;
1816 return opt_machine_mode ();
1819 /* Return the SVE predicate mode that should be used to control
1820 SVE mode MODE. */
1822 machine_mode
1823 aarch64_sve_pred_mode (machine_mode mode)
1825 unsigned int bits = aarch64_sve_container_bits (mode);
1826 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1829 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1831 static opt_machine_mode
1832 aarch64_get_mask_mode (machine_mode mode)
1834 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1835 if (vec_flags & VEC_SVE_DATA)
1836 return aarch64_sve_pred_mode (mode);
1838 return default_get_mask_mode (mode);
1841 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1843 opt_machine_mode
1844 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1846 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1847 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1848 machine_mode mode;
1849 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1850 if (inner_mode == GET_MODE_INNER (mode)
1851 && known_eq (nunits, GET_MODE_NUNITS (mode))
1852 && aarch64_sve_data_mode_p (mode))
1853 return mode;
1854 return opt_machine_mode ();
1857 /* Return the integer element mode associated with SVE mode MODE. */
1859 static scalar_int_mode
1860 aarch64_sve_element_int_mode (machine_mode mode)
1862 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1863 ? BITS_PER_SVE_VECTOR
1864 : GET_MODE_BITSIZE (mode));
1865 unsigned int elt_bits = vector_element_size (vector_bits,
1866 GET_MODE_NUNITS (mode));
1867 return int_mode_for_size (elt_bits, 0).require ();
1870 /* Return an integer element mode that contains exactly
1871 aarch64_sve_container_bits (MODE) bits. This is wider than
1872 aarch64_sve_element_int_mode if MODE is a partial vector,
1873 otherwise it's the same. */
1875 static scalar_int_mode
1876 aarch64_sve_container_int_mode (machine_mode mode)
1878 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1881 /* Return the integer vector mode associated with SVE mode MODE.
1882 Unlike related_int_vector_mode, this can handle the case in which
1883 MODE is a predicate (and thus has a different total size). */
1885 machine_mode
1886 aarch64_sve_int_mode (machine_mode mode)
1888 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1889 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1892 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1894 static opt_machine_mode
1895 aarch64_vectorize_related_mode (machine_mode vector_mode,
1896 scalar_mode element_mode,
1897 poly_uint64 nunits)
1899 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1901 /* If we're operating on SVE vectors, try to return an SVE mode. */
1902 poly_uint64 sve_nunits;
1903 if ((vec_flags & VEC_SVE_DATA)
1904 && multiple_p (BYTES_PER_SVE_VECTOR,
1905 GET_MODE_SIZE (element_mode), &sve_nunits))
1907 machine_mode sve_mode;
1908 if (maybe_ne (nunits, 0U))
1910 /* Try to find a full or partial SVE mode with exactly
1911 NUNITS units. */
1912 if (multiple_p (sve_nunits, nunits)
1913 && aarch64_sve_data_mode (element_mode,
1914 nunits).exists (&sve_mode))
1915 return sve_mode;
1917 else
1919 /* Take the preferred number of units from the number of bytes
1920 that fit in VECTOR_MODE. We always start by "autodetecting"
1921 a full vector mode with preferred_simd_mode, so vectors
1922 chosen here will also be full vector modes. Then
1923 autovectorize_vector_modes tries smaller starting modes
1924 and thus smaller preferred numbers of units. */
1925 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1926 if (aarch64_sve_data_mode (element_mode,
1927 sve_nunits).exists (&sve_mode))
1928 return sve_mode;
1932 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1933 if ((vec_flags & VEC_ADVSIMD)
1934 && known_eq (nunits, 0U)
1935 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1936 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1937 * GET_MODE_NUNITS (vector_mode), 128U))
1939 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1940 if (VECTOR_MODE_P (res))
1941 return res;
1944 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1947 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1948 prefer to use the first arithmetic operand as the else value if
1949 the else value doesn't matter, since that exactly matches the SVE
1950 destructive merging form. For ternary operations we could either
1951 pick the first operand and use FMAD-like instructions or the last
1952 operand and use FMLA-like instructions; the latter seems more
1953 natural. */
1955 static tree
1956 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1958 return nops == 3 ? ops[2] : ops[0];
1961 /* Implement TARGET_HARD_REGNO_NREGS. */
1963 static unsigned int
1964 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1966 /* ??? Logically we should only need to provide a value when
1967 HARD_REGNO_MODE_OK says that the combination is valid,
1968 but at the moment we need to handle all modes. Just ignore
1969 any runtime parts for registers that can't store them. */
1970 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1971 switch (aarch64_regno_regclass (regno))
1973 case FP_REGS:
1974 case FP_LO_REGS:
1975 case FP_LO8_REGS:
1977 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1978 if (vec_flags & VEC_SVE_DATA)
1979 return exact_div (GET_MODE_SIZE (mode),
1980 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1981 return CEIL (lowest_size, UNITS_PER_VREG);
1983 case PR_REGS:
1984 case PR_LO_REGS:
1985 case PR_HI_REGS:
1986 case FFR_REGS:
1987 case PR_AND_FFR_REGS:
1988 return 1;
1989 default:
1990 return CEIL (lowest_size, UNITS_PER_WORD);
1992 gcc_unreachable ();
1995 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1997 static bool
1998 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2000 if (GET_MODE_CLASS (mode) == MODE_CC)
2001 return regno == CC_REGNUM;
2003 if (regno == VG_REGNUM)
2004 /* This must have the same size as _Unwind_Word. */
2005 return mode == DImode;
2007 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2008 if (vec_flags & VEC_SVE_PRED)
2009 return pr_or_ffr_regnum_p (regno);
2011 if (pr_or_ffr_regnum_p (regno))
2012 return false;
2014 if (regno == SP_REGNUM)
2015 /* The purpose of comparing with ptr_mode is to support the
2016 global register variable associated with the stack pointer
2017 register via the syntax of asm ("wsp") in ILP32. */
2018 return mode == Pmode || mode == ptr_mode;
2020 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2021 return mode == Pmode;
2023 if (GP_REGNUM_P (regno))
2025 if (vec_flags & VEC_ANY_SVE)
2026 return false;
2027 if (known_le (GET_MODE_SIZE (mode), 8))
2028 return true;
2029 if (known_le (GET_MODE_SIZE (mode), 16))
2030 return (regno & 1) == 0;
2032 else if (FP_REGNUM_P (regno))
2034 if (vec_flags & VEC_STRUCT)
2035 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2036 else
2037 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2040 return false;
2043 /* Return true if TYPE is a type that should be passed or returned in
2044 SVE registers, assuming enough registers are available. When returning
2045 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
2046 respectively. */
2048 /* Return true if a function with type FNTYPE returns its value in
2049 SVE vector or predicate registers. */
2051 static bool
2052 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2054 tree return_type = TREE_TYPE (fntype);
2055 return (return_type != error_mark_node
2056 && aarch64_sve::builtin_type_p (return_type));
2059 /* Return true if a function with type FNTYPE takes arguments in
2060 SVE vector or predicate registers. */
2062 static bool
2063 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2065 CUMULATIVE_ARGS args_so_far_v;
2066 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2067 NULL_TREE, 0, true);
2068 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2070 for (tree chain = TYPE_ARG_TYPES (fntype);
2071 chain && chain != void_list_node;
2072 chain = TREE_CHAIN (chain))
2074 tree arg_type = TREE_VALUE (chain);
2075 if (arg_type == error_mark_node)
2076 return false;
2078 function_arg_info arg (arg_type, /*named=*/true);
2079 apply_pass_by_reference_rules (&args_so_far_v, arg);
2080 if (aarch64_sve::builtin_type_p (arg.type))
2081 return true;
2083 targetm.calls.function_arg_advance (args_so_far, arg);
2085 return false;
2088 /* Implement TARGET_FNTYPE_ABI. */
2090 static const predefined_function_abi &
2091 aarch64_fntype_abi (const_tree fntype)
2093 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2094 return aarch64_simd_abi ();
2096 if (aarch64_returns_value_in_sve_regs_p (fntype)
2097 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2098 return aarch64_sve_abi ();
2100 return default_function_abi;
2103 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2105 static bool
2106 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2108 return (aarch64_sve::builtin_type_p (type1)
2109 == aarch64_sve::builtin_type_p (type2));
2112 /* Return true if we should emit CFI for register REGNO. */
2114 static bool
2115 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2117 return (GP_REGNUM_P (regno)
2118 || !default_function_abi.clobbers_full_reg_p (regno));
2121 /* Return the mode we should use to save and restore register REGNO. */
2123 static machine_mode
2124 aarch64_reg_save_mode (unsigned int regno)
2126 if (GP_REGNUM_P (regno))
2127 return DImode;
2129 if (FP_REGNUM_P (regno))
2130 switch (crtl->abi->id ())
2132 case ARM_PCS_AAPCS64:
2133 /* Only the low 64 bits are saved by the base PCS. */
2134 return DFmode;
2136 case ARM_PCS_SIMD:
2137 /* The vector PCS saves the low 128 bits (which is the full
2138 register on non-SVE targets). */
2139 return TFmode;
2141 case ARM_PCS_SVE:
2142 /* Use vectors of DImode for registers that need frame
2143 information, so that the first 64 bytes of the save slot
2144 are always the equivalent of what storing D<n> would give. */
2145 if (aarch64_emit_cfi_for_reg_p (regno))
2146 return VNx2DImode;
2148 /* Use vectors of bytes otherwise, so that the layout is
2149 endian-agnostic, and so that we can use LDR and STR for
2150 big-endian targets. */
2151 return VNx16QImode;
2153 case ARM_PCS_TLSDESC:
2154 case ARM_PCS_UNKNOWN:
2155 break;
2158 if (PR_REGNUM_P (regno))
2159 /* Save the full predicate register. */
2160 return VNx16BImode;
2162 gcc_unreachable ();
2165 /* Implement TARGET_INSN_CALLEE_ABI. */
2167 const predefined_function_abi &
2168 aarch64_insn_callee_abi (const rtx_insn *insn)
2170 rtx pat = PATTERN (insn);
2171 gcc_assert (GET_CODE (pat) == PARALLEL);
2172 rtx unspec = XVECEXP (pat, 0, 1);
2173 gcc_assert (GET_CODE (unspec) == UNSPEC
2174 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2175 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2178 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2179 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2180 clobbers the top 64 bits when restoring the bottom 64 bits. */
2182 static bool
2183 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2184 unsigned int regno,
2185 machine_mode mode)
2187 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2189 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2190 unsigned int nregs = hard_regno_nregs (regno, mode);
2191 if (nregs > 1)
2192 per_register_size = exact_div (per_register_size, nregs);
2193 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2194 return maybe_gt (per_register_size, 16);
2195 return maybe_gt (per_register_size, 8);
2197 return false;
2200 /* Implement REGMODE_NATURAL_SIZE. */
2201 poly_uint64
2202 aarch64_regmode_natural_size (machine_mode mode)
2204 /* The natural size for SVE data modes is one SVE data vector,
2205 and similarly for predicates. We can't independently modify
2206 anything smaller than that. */
2207 /* ??? For now, only do this for variable-width SVE registers.
2208 Doing it for constant-sized registers breaks lower-subreg.c. */
2209 /* ??? And once that's fixed, we should probably have similar
2210 code for Advanced SIMD. */
2211 if (!aarch64_sve_vg.is_constant ())
2213 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2214 if (vec_flags & VEC_SVE_PRED)
2215 return BYTES_PER_SVE_PRED;
2216 if (vec_flags & VEC_SVE_DATA)
2217 return BYTES_PER_SVE_VECTOR;
2219 return UNITS_PER_WORD;
2222 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2223 machine_mode
2224 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2225 machine_mode mode)
2227 /* The predicate mode determines which bits are significant and
2228 which are "don't care". Decreasing the number of lanes would
2229 lose data while increasing the number of lanes would make bits
2230 unnecessarily significant. */
2231 if (PR_REGNUM_P (regno))
2232 return mode;
2233 if (known_ge (GET_MODE_SIZE (mode), 4))
2234 return mode;
2235 else
2236 return SImode;
2239 /* Return true if I's bits are consecutive ones from the MSB. */
2240 bool
2241 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2243 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2246 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2247 that strcpy from constants will be faster. */
2249 static HOST_WIDE_INT
2250 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2252 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2253 return MAX (align, BITS_PER_WORD);
2254 return align;
2257 /* Return true if calls to DECL should be treated as
2258 long-calls (ie called via a register). */
2259 static bool
2260 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2262 return false;
2265 /* Return true if calls to symbol-ref SYM should be treated as
2266 long-calls (ie called via a register). */
2267 bool
2268 aarch64_is_long_call_p (rtx sym)
2270 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2273 /* Return true if calls to symbol-ref SYM should not go through
2274 plt stubs. */
2276 bool
2277 aarch64_is_noplt_call_p (rtx sym)
2279 const_tree decl = SYMBOL_REF_DECL (sym);
2281 if (flag_pic
2282 && decl
2283 && (!flag_plt
2284 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2285 && !targetm.binds_local_p (decl))
2286 return true;
2288 return false;
2291 /* Return true if the offsets to a zero/sign-extract operation
2292 represent an expression that matches an extend operation. The
2293 operands represent the paramters from
2295 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2296 bool
2297 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2298 rtx extract_imm)
2300 HOST_WIDE_INT mult_val, extract_val;
2302 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2303 return false;
2305 mult_val = INTVAL (mult_imm);
2306 extract_val = INTVAL (extract_imm);
2308 if (extract_val > 8
2309 && extract_val < GET_MODE_BITSIZE (mode)
2310 && exact_log2 (extract_val & ~7) > 0
2311 && (extract_val & 7) <= 4
2312 && mult_val == (1 << (extract_val & 7)))
2313 return true;
2315 return false;
2318 /* Emit an insn that's a simple single-set. Both the operands must be
2319 known to be valid. */
2320 inline static rtx_insn *
2321 emit_set_insn (rtx x, rtx y)
2323 return emit_insn (gen_rtx_SET (x, y));
2326 /* X and Y are two things to compare using CODE. Emit the compare insn and
2327 return the rtx for register 0 in the proper mode. */
2329 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2331 machine_mode cmp_mode = GET_MODE (x);
2332 machine_mode cc_mode;
2333 rtx cc_reg;
2335 if (cmp_mode == TImode)
2337 gcc_assert (code == NE);
2339 cc_mode = CCmode;
2340 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2342 rtx x_lo = operand_subword (x, 0, 0, TImode);
2343 rtx y_lo = operand_subword (y, 0, 0, TImode);
2344 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2346 rtx x_hi = operand_subword (x, 1, 0, TImode);
2347 rtx y_hi = operand_subword (y, 1, 0, TImode);
2348 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2349 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2350 GEN_INT (AARCH64_EQ)));
2352 else
2354 cc_mode = SELECT_CC_MODE (code, x, y);
2355 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2356 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2358 return cc_reg;
2361 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2363 static rtx
2364 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2365 machine_mode y_mode)
2367 if (y_mode == E_QImode || y_mode == E_HImode)
2369 if (CONST_INT_P (y))
2370 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2371 else
2373 rtx t, cc_reg;
2374 machine_mode cc_mode;
2376 t = gen_rtx_ZERO_EXTEND (SImode, y);
2377 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2378 cc_mode = CC_SWPmode;
2379 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2380 emit_set_insn (cc_reg, t);
2381 return cc_reg;
2385 if (!aarch64_plus_operand (y, y_mode))
2386 y = force_reg (y_mode, y);
2388 return aarch64_gen_compare_reg (code, x, y);
2391 /* Build the SYMBOL_REF for __tls_get_addr. */
2393 static GTY(()) rtx tls_get_addr_libfunc;
2396 aarch64_tls_get_addr (void)
2398 if (!tls_get_addr_libfunc)
2399 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2400 return tls_get_addr_libfunc;
2403 /* Return the TLS model to use for ADDR. */
2405 static enum tls_model
2406 tls_symbolic_operand_type (rtx addr)
2408 enum tls_model tls_kind = TLS_MODEL_NONE;
2409 if (GET_CODE (addr) == CONST)
2411 poly_int64 addend;
2412 rtx sym = strip_offset (addr, &addend);
2413 if (GET_CODE (sym) == SYMBOL_REF)
2414 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2416 else if (GET_CODE (addr) == SYMBOL_REF)
2417 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2419 return tls_kind;
2422 /* We'll allow lo_sum's in addresses in our legitimate addresses
2423 so that combine would take care of combining addresses where
2424 necessary, but for generation purposes, we'll generate the address
2425 as :
2426 RTL Absolute
2427 tmp = hi (symbol_ref); adrp x1, foo
2428 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2431 PIC TLS
2432 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2433 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2434 bl __tls_get_addr
2437 Load TLS symbol, depending on TLS mechanism and TLS access model.
2439 Global Dynamic - Traditional TLS:
2440 adrp tmp, :tlsgd:imm
2441 add dest, tmp, #:tlsgd_lo12:imm
2442 bl __tls_get_addr
2444 Global Dynamic - TLS Descriptors:
2445 adrp dest, :tlsdesc:imm
2446 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2447 add dest, dest, #:tlsdesc_lo12:imm
2448 blr tmp
2449 mrs tp, tpidr_el0
2450 add dest, dest, tp
2452 Initial Exec:
2453 mrs tp, tpidr_el0
2454 adrp tmp, :gottprel:imm
2455 ldr dest, [tmp, #:gottprel_lo12:imm]
2456 add dest, dest, tp
2458 Local Exec:
2459 mrs tp, tpidr_el0
2460 add t0, tp, #:tprel_hi12:imm, lsl #12
2461 add t0, t0, #:tprel_lo12_nc:imm
2464 static void
2465 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2466 enum aarch64_symbol_type type)
2468 switch (type)
2470 case SYMBOL_SMALL_ABSOLUTE:
2472 /* In ILP32, the mode of dest can be either SImode or DImode. */
2473 rtx tmp_reg = dest;
2474 machine_mode mode = GET_MODE (dest);
2476 gcc_assert (mode == Pmode || mode == ptr_mode);
2478 if (can_create_pseudo_p ())
2479 tmp_reg = gen_reg_rtx (mode);
2481 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2482 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2483 return;
2486 case SYMBOL_TINY_ABSOLUTE:
2487 emit_insn (gen_rtx_SET (dest, imm));
2488 return;
2490 case SYMBOL_SMALL_GOT_28K:
2492 machine_mode mode = GET_MODE (dest);
2493 rtx gp_rtx = pic_offset_table_rtx;
2494 rtx insn;
2495 rtx mem;
2497 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2498 here before rtl expand. Tree IVOPT will generate rtl pattern to
2499 decide rtx costs, in which case pic_offset_table_rtx is not
2500 initialized. For that case no need to generate the first adrp
2501 instruction as the final cost for global variable access is
2502 one instruction. */
2503 if (gp_rtx != NULL)
2505 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2506 using the page base as GOT base, the first page may be wasted,
2507 in the worst scenario, there is only 28K space for GOT).
2509 The generate instruction sequence for accessing global variable
2512 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2514 Only one instruction needed. But we must initialize
2515 pic_offset_table_rtx properly. We generate initialize insn for
2516 every global access, and allow CSE to remove all redundant.
2518 The final instruction sequences will look like the following
2519 for multiply global variables access.
2521 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2523 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2524 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2525 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2526 ... */
2528 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2529 crtl->uses_pic_offset_table = 1;
2530 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2532 if (mode != GET_MODE (gp_rtx))
2533 gp_rtx = gen_lowpart (mode, gp_rtx);
2537 if (mode == ptr_mode)
2539 if (mode == DImode)
2540 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2541 else
2542 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2544 mem = XVECEXP (SET_SRC (insn), 0, 0);
2546 else
2548 gcc_assert (mode == Pmode);
2550 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2551 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2554 /* The operand is expected to be MEM. Whenever the related insn
2555 pattern changed, above code which calculate mem should be
2556 updated. */
2557 gcc_assert (GET_CODE (mem) == MEM);
2558 MEM_READONLY_P (mem) = 1;
2559 MEM_NOTRAP_P (mem) = 1;
2560 emit_insn (insn);
2561 return;
2564 case SYMBOL_SMALL_GOT_4G:
2566 /* In ILP32, the mode of dest can be either SImode or DImode,
2567 while the got entry is always of SImode size. The mode of
2568 dest depends on how dest is used: if dest is assigned to a
2569 pointer (e.g. in the memory), it has SImode; it may have
2570 DImode if dest is dereferenced to access the memeory.
2571 This is why we have to handle three different ldr_got_small
2572 patterns here (two patterns for ILP32). */
2574 rtx insn;
2575 rtx mem;
2576 rtx tmp_reg = dest;
2577 machine_mode mode = GET_MODE (dest);
2579 if (can_create_pseudo_p ())
2580 tmp_reg = gen_reg_rtx (mode);
2582 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2583 if (mode == ptr_mode)
2585 if (mode == DImode)
2586 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2587 else
2588 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2590 mem = XVECEXP (SET_SRC (insn), 0, 0);
2592 else
2594 gcc_assert (mode == Pmode);
2596 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2597 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2600 gcc_assert (GET_CODE (mem) == MEM);
2601 MEM_READONLY_P (mem) = 1;
2602 MEM_NOTRAP_P (mem) = 1;
2603 emit_insn (insn);
2604 return;
2607 case SYMBOL_SMALL_TLSGD:
2609 rtx_insn *insns;
2610 machine_mode mode = GET_MODE (dest);
2611 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2613 start_sequence ();
2614 if (TARGET_ILP32)
2615 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2616 else
2617 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2618 insns = get_insns ();
2619 end_sequence ();
2621 RTL_CONST_CALL_P (insns) = 1;
2622 emit_libcall_block (insns, dest, result, imm);
2623 return;
2626 case SYMBOL_SMALL_TLSDESC:
2628 machine_mode mode = GET_MODE (dest);
2629 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2630 rtx tp;
2632 gcc_assert (mode == Pmode || mode == ptr_mode);
2634 /* In ILP32, the got entry is always of SImode size. Unlike
2635 small GOT, the dest is fixed at reg 0. */
2636 if (TARGET_ILP32)
2637 emit_insn (gen_tlsdesc_small_si (imm));
2638 else
2639 emit_insn (gen_tlsdesc_small_di (imm));
2640 tp = aarch64_load_tp (NULL);
2642 if (mode != Pmode)
2643 tp = gen_lowpart (mode, tp);
2645 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2646 if (REG_P (dest))
2647 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2648 return;
2651 case SYMBOL_SMALL_TLSIE:
2653 /* In ILP32, the mode of dest can be either SImode or DImode,
2654 while the got entry is always of SImode size. The mode of
2655 dest depends on how dest is used: if dest is assigned to a
2656 pointer (e.g. in the memory), it has SImode; it may have
2657 DImode if dest is dereferenced to access the memeory.
2658 This is why we have to handle three different tlsie_small
2659 patterns here (two patterns for ILP32). */
2660 machine_mode mode = GET_MODE (dest);
2661 rtx tmp_reg = gen_reg_rtx (mode);
2662 rtx tp = aarch64_load_tp (NULL);
2664 if (mode == ptr_mode)
2666 if (mode == DImode)
2667 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2668 else
2670 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2671 tp = gen_lowpart (mode, tp);
2674 else
2676 gcc_assert (mode == Pmode);
2677 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2680 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2681 if (REG_P (dest))
2682 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2683 return;
2686 case SYMBOL_TLSLE12:
2687 case SYMBOL_TLSLE24:
2688 case SYMBOL_TLSLE32:
2689 case SYMBOL_TLSLE48:
2691 machine_mode mode = GET_MODE (dest);
2692 rtx tp = aarch64_load_tp (NULL);
2694 if (mode != Pmode)
2695 tp = gen_lowpart (mode, tp);
2697 switch (type)
2699 case SYMBOL_TLSLE12:
2700 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2701 (dest, tp, imm));
2702 break;
2703 case SYMBOL_TLSLE24:
2704 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2705 (dest, tp, imm));
2706 break;
2707 case SYMBOL_TLSLE32:
2708 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2709 (dest, imm));
2710 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2711 (dest, dest, tp));
2712 break;
2713 case SYMBOL_TLSLE48:
2714 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2715 (dest, imm));
2716 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2717 (dest, dest, tp));
2718 break;
2719 default:
2720 gcc_unreachable ();
2723 if (REG_P (dest))
2724 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2725 return;
2728 case SYMBOL_TINY_GOT:
2729 emit_insn (gen_ldr_got_tiny (dest, imm));
2730 return;
2732 case SYMBOL_TINY_TLSIE:
2734 machine_mode mode = GET_MODE (dest);
2735 rtx tp = aarch64_load_tp (NULL);
2737 if (mode == ptr_mode)
2739 if (mode == DImode)
2740 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2741 else
2743 tp = gen_lowpart (mode, tp);
2744 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2747 else
2749 gcc_assert (mode == Pmode);
2750 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2753 if (REG_P (dest))
2754 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2755 return;
2758 default:
2759 gcc_unreachable ();
2763 /* Emit a move from SRC to DEST. Assume that the move expanders can
2764 handle all moves if !can_create_pseudo_p (). The distinction is
2765 important because, unlike emit_move_insn, the move expanders know
2766 how to force Pmode objects into the constant pool even when the
2767 constant pool address is not itself legitimate. */
2768 static rtx
2769 aarch64_emit_move (rtx dest, rtx src)
2771 return (can_create_pseudo_p ()
2772 ? emit_move_insn (dest, src)
2773 : emit_move_insn_1 (dest, src));
2776 /* Apply UNOPTAB to OP and store the result in DEST. */
2778 static void
2779 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2781 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2782 if (dest != tmp)
2783 emit_move_insn (dest, tmp);
2786 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2788 static void
2789 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2791 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2792 OPTAB_DIRECT);
2793 if (dest != tmp)
2794 emit_move_insn (dest, tmp);
2797 /* Split a 128-bit move operation into two 64-bit move operations,
2798 taking care to handle partial overlap of register to register
2799 copies. Special cases are needed when moving between GP regs and
2800 FP regs. SRC can be a register, constant or memory; DST a register
2801 or memory. If either operand is memory it must not have any side
2802 effects. */
2803 void
2804 aarch64_split_128bit_move (rtx dst, rtx src)
2806 rtx dst_lo, dst_hi;
2807 rtx src_lo, src_hi;
2809 machine_mode mode = GET_MODE (dst);
2811 gcc_assert (mode == TImode || mode == TFmode);
2812 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2813 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2815 if (REG_P (dst) && REG_P (src))
2817 int src_regno = REGNO (src);
2818 int dst_regno = REGNO (dst);
2820 /* Handle FP <-> GP regs. */
2821 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2823 src_lo = gen_lowpart (word_mode, src);
2824 src_hi = gen_highpart (word_mode, src);
2826 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2827 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2828 return;
2830 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2832 dst_lo = gen_lowpart (word_mode, dst);
2833 dst_hi = gen_highpart (word_mode, dst);
2835 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2836 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2837 return;
2841 dst_lo = gen_lowpart (word_mode, dst);
2842 dst_hi = gen_highpart (word_mode, dst);
2843 src_lo = gen_lowpart (word_mode, src);
2844 src_hi = gen_highpart_mode (word_mode, mode, src);
2846 /* At most one pairing may overlap. */
2847 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2849 aarch64_emit_move (dst_hi, src_hi);
2850 aarch64_emit_move (dst_lo, src_lo);
2852 else
2854 aarch64_emit_move (dst_lo, src_lo);
2855 aarch64_emit_move (dst_hi, src_hi);
2859 bool
2860 aarch64_split_128bit_move_p (rtx dst, rtx src)
2862 return (! REG_P (src)
2863 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2866 /* Split a complex SIMD combine. */
2868 void
2869 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2871 machine_mode src_mode = GET_MODE (src1);
2872 machine_mode dst_mode = GET_MODE (dst);
2874 gcc_assert (VECTOR_MODE_P (dst_mode));
2875 gcc_assert (register_operand (dst, dst_mode)
2876 && register_operand (src1, src_mode)
2877 && register_operand (src2, src_mode));
2879 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2880 return;
2883 /* Split a complex SIMD move. */
2885 void
2886 aarch64_split_simd_move (rtx dst, rtx src)
2888 machine_mode src_mode = GET_MODE (src);
2889 machine_mode dst_mode = GET_MODE (dst);
2891 gcc_assert (VECTOR_MODE_P (dst_mode));
2893 if (REG_P (dst) && REG_P (src))
2895 gcc_assert (VECTOR_MODE_P (src_mode));
2896 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2900 bool
2901 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2902 machine_mode ymode, rtx y)
2904 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2905 gcc_assert (r != NULL);
2906 return rtx_equal_p (x, r);
2909 /* Return TARGET if it is nonnull and a register of mode MODE.
2910 Otherwise, return a fresh register of mode MODE if we can,
2911 or TARGET reinterpreted as MODE if we can't. */
2913 static rtx
2914 aarch64_target_reg (rtx target, machine_mode mode)
2916 if (target && REG_P (target) && GET_MODE (target) == mode)
2917 return target;
2918 if (!can_create_pseudo_p ())
2920 gcc_assert (target);
2921 return gen_lowpart (mode, target);
2923 return gen_reg_rtx (mode);
2926 /* Return a register that contains the constant in BUILDER, given that
2927 the constant is a legitimate move operand. Use TARGET as the register
2928 if it is nonnull and convenient. */
2930 static rtx
2931 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2933 rtx src = builder.build ();
2934 target = aarch64_target_reg (target, GET_MODE (src));
2935 emit_insn (gen_rtx_SET (target, src));
2936 return target;
2939 static rtx
2940 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2942 if (can_create_pseudo_p ())
2943 return force_reg (mode, value);
2944 else
2946 gcc_assert (x);
2947 aarch64_emit_move (x, value);
2948 return x;
2952 /* Return true if predicate value X is a constant in which every element
2953 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2954 value, i.e. as a predicate in which all bits are significant. */
2956 static bool
2957 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2959 if (GET_CODE (x) != CONST_VECTOR)
2960 return false;
2962 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2963 GET_MODE_NUNITS (GET_MODE (x)));
2964 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2965 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2966 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2968 unsigned int nelts = const_vector_encoded_nelts (x);
2969 for (unsigned int i = 0; i < nelts; ++i)
2971 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2972 if (!CONST_INT_P (elt))
2973 return false;
2975 builder.quick_push (elt);
2976 for (unsigned int j = 1; j < factor; ++j)
2977 builder.quick_push (const0_rtx);
2979 builder.finalize ();
2980 return true;
2983 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2984 widest predicate element size it can have (that is, the largest size
2985 for which each element would still be 0 or 1). */
2987 unsigned int
2988 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2990 /* Start with the most optimistic assumption: that we only need
2991 one bit per pattern. This is what we will use if only the first
2992 bit in each pattern is ever set. */
2993 unsigned int mask = GET_MODE_SIZE (DImode);
2994 mask |= builder.npatterns ();
2996 /* Look for set bits. */
2997 unsigned int nelts = builder.encoded_nelts ();
2998 for (unsigned int i = 1; i < nelts; ++i)
2999 if (INTVAL (builder.elt (i)) != 0)
3001 if (i & 1)
3002 return 1;
3003 mask |= i;
3005 return mask & -mask;
3008 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3009 return that predicate mode, otherwise return opt_machine_mode (). */
3011 opt_machine_mode
3012 aarch64_ptrue_all_mode (rtx x)
3014 gcc_assert (GET_MODE (x) == VNx16BImode);
3015 if (GET_CODE (x) != CONST_VECTOR
3016 || !CONST_VECTOR_DUPLICATE_P (x)
3017 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3018 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3019 return opt_machine_mode ();
3021 unsigned int nelts = const_vector_encoded_nelts (x);
3022 for (unsigned int i = 1; i < nelts; ++i)
3023 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3024 return opt_machine_mode ();
3026 return aarch64_sve_pred_mode (nelts);
3029 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3030 that the constant would have with predicate element size ELT_SIZE
3031 (ignoring the upper bits in each element) and return:
3033 * -1 if all bits are set
3034 * N if the predicate has N leading set bits followed by all clear bits
3035 * 0 if the predicate does not have any of these forms. */
3038 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3039 unsigned int elt_size)
3041 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3042 followed by set bits. */
3043 if (builder.nelts_per_pattern () == 3)
3044 return 0;
3046 /* Skip over leading set bits. */
3047 unsigned int nelts = builder.encoded_nelts ();
3048 unsigned int i = 0;
3049 for (; i < nelts; i += elt_size)
3050 if (INTVAL (builder.elt (i)) == 0)
3051 break;
3052 unsigned int vl = i / elt_size;
3054 /* Check for the all-true case. */
3055 if (i == nelts)
3056 return -1;
3058 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3059 repeating pattern of set bits followed by clear bits. */
3060 if (builder.nelts_per_pattern () != 2)
3061 return 0;
3063 /* We have a "foreground" value and a duplicated "background" value.
3064 If the background might repeat and the last set bit belongs to it,
3065 we might have set bits followed by clear bits followed by set bits. */
3066 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3067 return 0;
3069 /* Make sure that the rest are all clear. */
3070 for (; i < nelts; i += elt_size)
3071 if (INTVAL (builder.elt (i)) != 0)
3072 return 0;
3074 return vl;
3077 /* See if there is an svpattern that encodes an SVE predicate of mode
3078 PRED_MODE in which the first VL bits are set and the rest are clear.
3079 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3080 A VL of -1 indicates an all-true vector. */
3082 aarch64_svpattern
3083 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3085 if (vl < 0)
3086 return AARCH64_SV_ALL;
3088 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3089 return AARCH64_NUM_SVPATTERNS;
3091 if (vl >= 1 && vl <= 8)
3092 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3094 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3095 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3097 int max_vl;
3098 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3100 if (vl == (max_vl / 3) * 3)
3101 return AARCH64_SV_MUL3;
3102 /* These would only trigger for non-power-of-2 lengths. */
3103 if (vl == (max_vl & -4))
3104 return AARCH64_SV_MUL4;
3105 if (vl == (1 << floor_log2 (max_vl)))
3106 return AARCH64_SV_POW2;
3107 if (vl == max_vl)
3108 return AARCH64_SV_ALL;
3110 return AARCH64_NUM_SVPATTERNS;
3113 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3114 bits has the lowest bit set and the upper bits clear. This is the
3115 VNx16BImode equivalent of a PTRUE for controlling elements of
3116 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3117 all bits are significant, even the upper zeros. */
3120 aarch64_ptrue_all (unsigned int elt_size)
3122 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3123 builder.quick_push (const1_rtx);
3124 for (unsigned int i = 1; i < elt_size; ++i)
3125 builder.quick_push (const0_rtx);
3126 return builder.build ();
3129 /* Return an all-true predicate register of mode MODE. */
3132 aarch64_ptrue_reg (machine_mode mode)
3134 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3135 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3136 return gen_lowpart (mode, reg);
3139 /* Return an all-false predicate register of mode MODE. */
3142 aarch64_pfalse_reg (machine_mode mode)
3144 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3145 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3146 return gen_lowpart (mode, reg);
3149 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3150 true, or alternatively if we know that the operation predicated by
3151 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3152 aarch64_sve_gp_strictness operand that describes the operation
3153 predicated by PRED1[0]. */
3155 bool
3156 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3158 machine_mode mode = GET_MODE (pred2);
3159 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3160 && mode == GET_MODE (pred1[0])
3161 && aarch64_sve_gp_strictness (pred1[1], SImode));
3162 return (pred1[0] == CONSTM1_RTX (mode)
3163 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3164 || rtx_equal_p (pred1[0], pred2));
3167 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3168 for it. PRED2[0] is the predicate for the instruction whose result
3169 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3170 for it. Return true if we can prove that the two predicates are
3171 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3172 with PRED1[0] without changing behavior. */
3174 bool
3175 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3177 machine_mode mode = GET_MODE (pred1[0]);
3178 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3179 && mode == GET_MODE (pred2[0])
3180 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3181 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3183 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3184 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3185 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3186 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3187 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3190 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3191 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3192 Use TARGET as the target register if nonnull and convenient. */
3194 static rtx
3195 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3196 machine_mode data_mode, rtx op1, rtx op2)
3198 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3199 expand_operand ops[5];
3200 create_output_operand (&ops[0], target, pred_mode);
3201 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3202 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3203 create_input_operand (&ops[3], op1, data_mode);
3204 create_input_operand (&ops[4], op2, data_mode);
3205 expand_insn (icode, 5, ops);
3206 return ops[0].value;
3209 /* Use a comparison to convert integer vector SRC into MODE, which is
3210 the corresponding SVE predicate mode. Use TARGET for the result
3211 if it's nonnull and convenient. */
3214 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3216 machine_mode src_mode = GET_MODE (src);
3217 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3218 src, CONST0_RTX (src_mode));
3221 /* Return the assembly token for svprfop value PRFOP. */
3223 static const char *
3224 svprfop_token (enum aarch64_svprfop prfop)
3226 switch (prfop)
3228 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3229 AARCH64_FOR_SVPRFOP (CASE)
3230 #undef CASE
3231 case AARCH64_NUM_SVPRFOPS:
3232 break;
3234 gcc_unreachable ();
3237 /* Return the assembly string for an SVE prefetch operation with
3238 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3239 and that SUFFIX is the format for the remaining operands. */
3241 char *
3242 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3243 const char *suffix)
3245 static char buffer[128];
3246 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3247 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3248 mnemonic, svprfop_token (prfop), suffix);
3249 gcc_assert (written < sizeof (buffer));
3250 return buffer;
3253 /* Check whether we can calculate the number of elements in PATTERN
3254 at compile time, given that there are NELTS_PER_VQ elements per
3255 128-bit block. Return the value if so, otherwise return -1. */
3257 HOST_WIDE_INT
3258 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3260 unsigned int vl, const_vg;
3261 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3262 vl = 1 + (pattern - AARCH64_SV_VL1);
3263 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3264 vl = 16 << (pattern - AARCH64_SV_VL16);
3265 else if (aarch64_sve_vg.is_constant (&const_vg))
3267 /* There are two vector granules per quadword. */
3268 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3269 switch (pattern)
3271 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3272 case AARCH64_SV_MUL4: return nelts & -4;
3273 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3274 case AARCH64_SV_ALL: return nelts;
3275 default: gcc_unreachable ();
3278 else
3279 return -1;
3281 /* There are two vector granules per quadword. */
3282 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3283 if (known_le (vl, nelts_all))
3284 return vl;
3286 /* Requesting more elements than are available results in a PFALSE. */
3287 if (known_gt (vl, nelts_all))
3288 return 0;
3290 return -1;
3293 /* Return true if we can move VALUE into a register using a single
3294 CNT[BHWD] instruction. */
3296 static bool
3297 aarch64_sve_cnt_immediate_p (poly_int64 value)
3299 HOST_WIDE_INT factor = value.coeffs[0];
3300 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3301 return (value.coeffs[1] == factor
3302 && IN_RANGE (factor, 2, 16 * 16)
3303 && (factor & 1) == 0
3304 && factor <= 16 * (factor & -factor));
3307 /* Likewise for rtx X. */
3309 bool
3310 aarch64_sve_cnt_immediate_p (rtx x)
3312 poly_int64 value;
3313 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3316 /* Return the asm string for an instruction with a CNT-like vector size
3317 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3318 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3319 first part of the operands template (the part that comes before the
3320 vector size itself). PATTERN is the pattern to use. FACTOR is the
3321 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3322 in each quadword. If it is zero, we can use any element size. */
3324 static char *
3325 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3326 aarch64_svpattern pattern,
3327 unsigned int factor,
3328 unsigned int nelts_per_vq)
3330 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3332 if (nelts_per_vq == 0)
3333 /* There is some overlap in the ranges of the four CNT instructions.
3334 Here we always use the smallest possible element size, so that the
3335 multiplier is 1 whereever possible. */
3336 nelts_per_vq = factor & -factor;
3337 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3338 gcc_assert (IN_RANGE (shift, 1, 4));
3339 char suffix = "dwhb"[shift - 1];
3341 factor >>= shift;
3342 unsigned int written;
3343 if (pattern == AARCH64_SV_ALL && factor == 1)
3344 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3345 prefix, suffix, operands);
3346 else if (factor == 1)
3347 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3348 prefix, suffix, operands, svpattern_token (pattern));
3349 else
3350 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3351 prefix, suffix, operands, svpattern_token (pattern),
3352 factor);
3353 gcc_assert (written < sizeof (buffer));
3354 return buffer;
3357 /* Return the asm string for an instruction with a CNT-like vector size
3358 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3359 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3360 first part of the operands template (the part that comes before the
3361 vector size itself). X is the value of the vector size operand,
3362 as a polynomial integer rtx; we need to convert this into an "all"
3363 pattern with a multiplier. */
3365 char *
3366 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3367 rtx x)
3369 poly_int64 value = rtx_to_poly_int64 (x);
3370 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3371 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3372 value.coeffs[1], 0);
3375 /* Return the asm string for an instruction with a CNT-like vector size
3376 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3377 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3378 first part of the operands template (the part that comes before the
3379 vector size itself). CNT_PAT[0..2] are the operands of the
3380 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3382 char *
3383 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3384 const char *operands, rtx *cnt_pat)
3386 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3387 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3388 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3389 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3390 factor, nelts_per_vq);
3393 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3395 bool
3396 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3398 poly_int64 value;
3399 return (poly_int_rtx_p (x, &value)
3400 && (aarch64_sve_cnt_immediate_p (value)
3401 || aarch64_sve_cnt_immediate_p (-value)));
3404 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3405 operand 0. */
3407 char *
3408 aarch64_output_sve_scalar_inc_dec (rtx offset)
3410 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3411 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3412 if (offset_value.coeffs[1] > 0)
3413 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3414 offset_value.coeffs[1], 0);
3415 else
3416 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3417 -offset_value.coeffs[1], 0);
3420 /* Return true if we can add VALUE to a register using a single ADDVL
3421 or ADDPL instruction. */
3423 static bool
3424 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3426 HOST_WIDE_INT factor = value.coeffs[0];
3427 if (factor == 0 || value.coeffs[1] != factor)
3428 return false;
3429 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3430 and a value of 16 is one vector width. */
3431 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3432 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3435 /* Likewise for rtx X. */
3437 bool
3438 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3440 poly_int64 value;
3441 return (poly_int_rtx_p (x, &value)
3442 && aarch64_sve_addvl_addpl_immediate_p (value));
3445 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3446 to operand 1 and storing the result in operand 0. */
3448 char *
3449 aarch64_output_sve_addvl_addpl (rtx offset)
3451 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3452 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3453 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3455 int factor = offset_value.coeffs[1];
3456 if ((factor & 15) == 0)
3457 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3458 else
3459 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3460 return buffer;
3463 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3464 instruction. If it is, store the number of elements in each vector
3465 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3466 factor in *FACTOR_OUT (if nonnull). */
3468 bool
3469 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3470 unsigned int *nelts_per_vq_out)
3472 rtx elt;
3473 poly_int64 value;
3475 if (!const_vec_duplicate_p (x, &elt)
3476 || !poly_int_rtx_p (elt, &value))
3477 return false;
3479 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3480 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3481 /* There's no vector INCB. */
3482 return false;
3484 HOST_WIDE_INT factor = value.coeffs[0];
3485 if (value.coeffs[1] != factor)
3486 return false;
3488 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3489 if ((factor % nelts_per_vq) != 0
3490 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3491 return false;
3493 if (factor_out)
3494 *factor_out = factor;
3495 if (nelts_per_vq_out)
3496 *nelts_per_vq_out = nelts_per_vq;
3497 return true;
3500 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3501 instruction. */
3503 bool
3504 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3506 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3509 /* Return the asm template for an SVE vector INC or DEC instruction.
3510 OPERANDS gives the operands before the vector count and X is the
3511 value of the vector count operand itself. */
3513 char *
3514 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3516 int factor;
3517 unsigned int nelts_per_vq;
3518 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3519 gcc_unreachable ();
3520 if (factor < 0)
3521 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3522 -factor, nelts_per_vq);
3523 else
3524 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3525 factor, nelts_per_vq);
3528 static int
3529 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3530 scalar_int_mode mode)
3532 int i;
3533 unsigned HOST_WIDE_INT val, val2, mask;
3534 int one_match, zero_match;
3535 int num_insns;
3537 val = INTVAL (imm);
3539 if (aarch64_move_imm (val, mode))
3541 if (generate)
3542 emit_insn (gen_rtx_SET (dest, imm));
3543 return 1;
3546 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3547 (with XXXX non-zero). In that case check to see if the move can be done in
3548 a smaller mode. */
3549 val2 = val & 0xffffffff;
3550 if (mode == DImode
3551 && aarch64_move_imm (val2, SImode)
3552 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3554 if (generate)
3555 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3557 /* Check if we have to emit a second instruction by checking to see
3558 if any of the upper 32 bits of the original DI mode value is set. */
3559 if (val == val2)
3560 return 1;
3562 i = (val >> 48) ? 48 : 32;
3564 if (generate)
3565 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3566 GEN_INT ((val >> i) & 0xffff)));
3568 return 2;
3571 if ((val >> 32) == 0 || mode == SImode)
3573 if (generate)
3575 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3576 if (mode == SImode)
3577 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3578 GEN_INT ((val >> 16) & 0xffff)));
3579 else
3580 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3581 GEN_INT ((val >> 16) & 0xffff)));
3583 return 2;
3586 /* Remaining cases are all for DImode. */
3588 mask = 0xffff;
3589 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3590 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3591 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3592 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3594 if (zero_match != 2 && one_match != 2)
3596 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3597 For a 64-bit bitmask try whether changing 16 bits to all ones or
3598 zeroes creates a valid bitmask. To check any repeated bitmask,
3599 try using 16 bits from the other 32-bit half of val. */
3601 for (i = 0; i < 64; i += 16, mask <<= 16)
3603 val2 = val & ~mask;
3604 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3605 break;
3606 val2 = val | mask;
3607 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3608 break;
3609 val2 = val2 & ~mask;
3610 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3611 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3612 break;
3614 if (i != 64)
3616 if (generate)
3618 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3619 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3620 GEN_INT ((val >> i) & 0xffff)));
3622 return 2;
3626 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3627 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3628 otherwise skip zero bits. */
3630 num_insns = 1;
3631 mask = 0xffff;
3632 val2 = one_match > zero_match ? ~val : val;
3633 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3635 if (generate)
3636 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3637 ? (val | ~(mask << i))
3638 : (val & (mask << i)))));
3639 for (i += 16; i < 64; i += 16)
3641 if ((val2 & (mask << i)) == 0)
3642 continue;
3643 if (generate)
3644 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3645 GEN_INT ((val >> i) & 0xffff)));
3646 num_insns ++;
3649 return num_insns;
3652 /* Return whether imm is a 128-bit immediate which is simple enough to
3653 expand inline. */
3654 bool
3655 aarch64_mov128_immediate (rtx imm)
3657 if (GET_CODE (imm) == CONST_INT)
3658 return true;
3660 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3662 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3663 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3665 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3666 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3670 /* Return the number of temporary registers that aarch64_add_offset_1
3671 would need to add OFFSET to a register. */
3673 static unsigned int
3674 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3676 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3679 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3680 a non-polynomial OFFSET. MODE is the mode of the addition.
3681 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3682 be set and CFA adjustments added to the generated instructions.
3684 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3685 temporary if register allocation is already complete. This temporary
3686 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3687 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3688 the immediate again.
3690 Since this function may be used to adjust the stack pointer, we must
3691 ensure that it cannot cause transient stack deallocation (for example
3692 by first incrementing SP and then decrementing when adjusting by a
3693 large immediate). */
3695 static void
3696 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3697 rtx src, HOST_WIDE_INT offset, rtx temp1,
3698 bool frame_related_p, bool emit_move_imm)
3700 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3701 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3703 HOST_WIDE_INT moffset = abs_hwi (offset);
3704 rtx_insn *insn;
3706 if (!moffset)
3708 if (!rtx_equal_p (dest, src))
3710 insn = emit_insn (gen_rtx_SET (dest, src));
3711 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3713 return;
3716 /* Single instruction adjustment. */
3717 if (aarch64_uimm12_shift (moffset))
3719 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3720 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3721 return;
3724 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3725 and either:
3727 a) the offset cannot be loaded by a 16-bit move or
3728 b) there is no spare register into which we can move it. */
3729 if (moffset < 0x1000000
3730 && ((!temp1 && !can_create_pseudo_p ())
3731 || !aarch64_move_imm (moffset, mode)))
3733 HOST_WIDE_INT low_off = moffset & 0xfff;
3735 low_off = offset < 0 ? -low_off : low_off;
3736 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3737 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3738 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3739 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3740 return;
3743 /* Emit a move immediate if required and an addition/subtraction. */
3744 if (emit_move_imm)
3746 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3747 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3749 insn = emit_insn (offset < 0
3750 ? gen_sub3_insn (dest, src, temp1)
3751 : gen_add3_insn (dest, src, temp1));
3752 if (frame_related_p)
3754 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3755 rtx adj = plus_constant (mode, src, offset);
3756 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3760 /* Return the number of temporary registers that aarch64_add_offset
3761 would need to move OFFSET into a register or add OFFSET to a register;
3762 ADD_P is true if we want the latter rather than the former. */
3764 static unsigned int
3765 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3767 /* This follows the same structure as aarch64_add_offset. */
3768 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3769 return 0;
3771 unsigned int count = 0;
3772 HOST_WIDE_INT factor = offset.coeffs[1];
3773 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3774 poly_int64 poly_offset (factor, factor);
3775 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3776 /* Need one register for the ADDVL/ADDPL result. */
3777 count += 1;
3778 else if (factor != 0)
3780 factor = abs (factor);
3781 if (factor > 16 * (factor & -factor))
3782 /* Need one register for the CNT result and one for the multiplication
3783 factor. If necessary, the second temporary can be reused for the
3784 constant part of the offset. */
3785 return 2;
3786 /* Need one register for the CNT result (which might then
3787 be shifted). */
3788 count += 1;
3790 return count + aarch64_add_offset_1_temporaries (constant);
3793 /* If X can be represented as a poly_int64, return the number
3794 of temporaries that are required to add it to a register.
3795 Return -1 otherwise. */
3798 aarch64_add_offset_temporaries (rtx x)
3800 poly_int64 offset;
3801 if (!poly_int_rtx_p (x, &offset))
3802 return -1;
3803 return aarch64_offset_temporaries (true, offset);
3806 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3807 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3808 be set and CFA adjustments added to the generated instructions.
3810 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3811 temporary if register allocation is already complete. This temporary
3812 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3813 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3814 false to avoid emitting the immediate again.
3816 TEMP2, if nonnull, is a second temporary register that doesn't
3817 overlap either DEST or REG.
3819 Since this function may be used to adjust the stack pointer, we must
3820 ensure that it cannot cause transient stack deallocation (for example
3821 by first incrementing SP and then decrementing when adjusting by a
3822 large immediate). */
3824 static void
3825 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3826 poly_int64 offset, rtx temp1, rtx temp2,
3827 bool frame_related_p, bool emit_move_imm = true)
3829 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3830 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3831 gcc_assert (temp1 == NULL_RTX
3832 || !frame_related_p
3833 || !reg_overlap_mentioned_p (temp1, dest));
3834 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3836 /* Try using ADDVL or ADDPL to add the whole value. */
3837 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3839 rtx offset_rtx = gen_int_mode (offset, mode);
3840 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3841 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3842 return;
3845 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3846 SVE vector register, over and above the minimum size of 128 bits.
3847 This is equivalent to half the value returned by CNTD with a
3848 vector shape of ALL. */
3849 HOST_WIDE_INT factor = offset.coeffs[1];
3850 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3852 /* Try using ADDVL or ADDPL to add the VG-based part. */
3853 poly_int64 poly_offset (factor, factor);
3854 if (src != const0_rtx
3855 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3857 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3858 if (frame_related_p)
3860 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3861 RTX_FRAME_RELATED_P (insn) = true;
3862 src = dest;
3864 else
3866 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3867 src = aarch64_force_temporary (mode, temp1, addr);
3868 temp1 = temp2;
3869 temp2 = NULL_RTX;
3872 /* Otherwise use a CNT-based sequence. */
3873 else if (factor != 0)
3875 /* Use a subtraction if we have a negative factor. */
3876 rtx_code code = PLUS;
3877 if (factor < 0)
3879 factor = -factor;
3880 code = MINUS;
3883 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3884 into the multiplication. */
3885 rtx val;
3886 int shift = 0;
3887 if (factor & 1)
3888 /* Use a right shift by 1. */
3889 shift = -1;
3890 else
3891 factor /= 2;
3892 HOST_WIDE_INT low_bit = factor & -factor;
3893 if (factor <= 16 * low_bit)
3895 if (factor > 16 * 8)
3897 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3898 the value with the minimum multiplier and shift it into
3899 position. */
3900 int extra_shift = exact_log2 (low_bit);
3901 shift += extra_shift;
3902 factor >>= extra_shift;
3904 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3906 else
3908 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3909 directly, since that should increase the chances of being
3910 able to use a shift and add sequence. If LOW_BIT itself
3911 is out of range, just use CNTD. */
3912 if (low_bit <= 16 * 8)
3913 factor /= low_bit;
3914 else
3915 low_bit = 1;
3917 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3918 val = aarch64_force_temporary (mode, temp1, val);
3920 if (can_create_pseudo_p ())
3922 rtx coeff1 = gen_int_mode (factor, mode);
3923 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3925 else
3927 /* Go back to using a negative multiplication factor if we have
3928 no register from which to subtract. */
3929 if (code == MINUS && src == const0_rtx)
3931 factor = -factor;
3932 code = PLUS;
3934 rtx coeff1 = gen_int_mode (factor, mode);
3935 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3936 val = gen_rtx_MULT (mode, val, coeff1);
3940 if (shift > 0)
3942 /* Multiply by 1 << SHIFT. */
3943 val = aarch64_force_temporary (mode, temp1, val);
3944 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3946 else if (shift == -1)
3948 /* Divide by 2. */
3949 val = aarch64_force_temporary (mode, temp1, val);
3950 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3953 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3954 if (src != const0_rtx)
3956 val = aarch64_force_temporary (mode, temp1, val);
3957 val = gen_rtx_fmt_ee (code, mode, src, val);
3959 else if (code == MINUS)
3961 val = aarch64_force_temporary (mode, temp1, val);
3962 val = gen_rtx_NEG (mode, val);
3965 if (constant == 0 || frame_related_p)
3967 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3968 if (frame_related_p)
3970 RTX_FRAME_RELATED_P (insn) = true;
3971 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3972 gen_rtx_SET (dest, plus_constant (Pmode, src,
3973 poly_offset)));
3975 src = dest;
3976 if (constant == 0)
3977 return;
3979 else
3981 src = aarch64_force_temporary (mode, temp1, val);
3982 temp1 = temp2;
3983 temp2 = NULL_RTX;
3986 emit_move_imm = true;
3989 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3990 frame_related_p, emit_move_imm);
3993 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3994 than a poly_int64. */
3996 void
3997 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3998 rtx offset_rtx, rtx temp1, rtx temp2)
4000 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4001 temp1, temp2, false);
4004 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4005 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4006 if TEMP1 already contains abs (DELTA). */
4008 static inline void
4009 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4011 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4012 temp1, temp2, true, emit_move_imm);
4015 /* Subtract DELTA from the stack pointer, marking the instructions
4016 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4017 if nonnull. */
4019 static inline void
4020 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4021 bool emit_move_imm = true)
4023 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4024 temp1, temp2, frame_related_p, emit_move_imm);
4027 /* Set DEST to (vec_series BASE STEP). */
4029 static void
4030 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4032 machine_mode mode = GET_MODE (dest);
4033 scalar_mode inner = GET_MODE_INNER (mode);
4035 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4036 if (!aarch64_sve_index_immediate_p (base))
4037 base = force_reg (inner, base);
4038 if (!aarch64_sve_index_immediate_p (step))
4039 step = force_reg (inner, step);
4041 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4044 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4045 register of mode MODE. Use TARGET for the result if it's nonnull
4046 and convenient.
4048 The two vector modes must have the same element mode. The behavior
4049 is to duplicate architectural lane N of SRC into architectural lanes
4050 N + I * STEP of the result. On big-endian targets, architectural
4051 lane 0 of an Advanced SIMD vector is the last element of the vector
4052 in memory layout, so for big-endian targets this operation has the
4053 effect of reversing SRC before duplicating it. Callers need to
4054 account for this. */
4057 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4059 machine_mode src_mode = GET_MODE (src);
4060 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4061 insn_code icode = (BYTES_BIG_ENDIAN
4062 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4063 : code_for_aarch64_vec_duplicate_vq_le (mode));
4065 unsigned int i = 0;
4066 expand_operand ops[3];
4067 create_output_operand (&ops[i++], target, mode);
4068 create_output_operand (&ops[i++], src, src_mode);
4069 if (BYTES_BIG_ENDIAN)
4071 /* Create a PARALLEL describing the reversal of SRC. */
4072 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4073 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4074 nelts_per_vq - 1, -1);
4075 create_fixed_operand (&ops[i++], sel);
4077 expand_insn (icode, i, ops);
4078 return ops[0].value;
4081 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4082 the memory image into DEST. Return true on success. */
4084 static bool
4085 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4087 src = force_const_mem (GET_MODE (src), src);
4088 if (!src)
4089 return false;
4091 /* Make sure that the address is legitimate. */
4092 if (!aarch64_sve_ld1rq_operand_p (src))
4094 rtx addr = force_reg (Pmode, XEXP (src, 0));
4095 src = replace_equiv_address (src, addr);
4098 machine_mode mode = GET_MODE (dest);
4099 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4100 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4101 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4102 return true;
4105 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4106 SVE data mode and isn't a legitimate constant. Use TARGET for the
4107 result if convenient.
4109 The returned register can have whatever mode seems most natural
4110 given the contents of SRC. */
4112 static rtx
4113 aarch64_expand_sve_const_vector (rtx target, rtx src)
4115 machine_mode mode = GET_MODE (src);
4116 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4117 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4118 scalar_mode elt_mode = GET_MODE_INNER (mode);
4119 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4120 unsigned int container_bits = aarch64_sve_container_bits (mode);
4121 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4123 if (nelts_per_pattern == 1
4124 && encoded_bits <= 128
4125 && container_bits != elt_bits)
4127 /* We have a partial vector mode and a constant whose full-vector
4128 equivalent would occupy a repeating 128-bit sequence. Build that
4129 full-vector equivalent instead, so that we have the option of
4130 using LD1RQ and Advanced SIMD operations. */
4131 unsigned int repeat = container_bits / elt_bits;
4132 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4133 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4134 for (unsigned int i = 0; i < npatterns; ++i)
4135 for (unsigned int j = 0; j < repeat; ++j)
4136 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4137 target = aarch64_target_reg (target, full_mode);
4138 return aarch64_expand_sve_const_vector (target, builder.build ());
4141 if (nelts_per_pattern == 1 && encoded_bits == 128)
4143 /* The constant is a duplicated quadword but can't be narrowed
4144 beyond a quadword. Get the memory image of the first quadword
4145 as a 128-bit vector and try using LD1RQ to load it from memory.
4147 The effect for both endiannesses is to load memory lane N into
4148 architectural lanes N + I * STEP of the result. On big-endian
4149 targets, the layout of the 128-bit vector in an Advanced SIMD
4150 register would be different from its layout in an SVE register,
4151 but this 128-bit vector is a memory value only. */
4152 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4153 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4154 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4155 return target;
4158 if (nelts_per_pattern == 1 && encoded_bits < 128)
4160 /* The vector is a repeating sequence of 64 bits or fewer.
4161 See if we can load them using an Advanced SIMD move and then
4162 duplicate it to fill a vector. This is better than using a GPR
4163 move because it keeps everything in the same register file. */
4164 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4165 rtx_vector_builder builder (vq_mode, npatterns, 1);
4166 for (unsigned int i = 0; i < npatterns; ++i)
4168 /* We want memory lane N to go into architectural lane N,
4169 so reverse for big-endian targets. The DUP .Q pattern
4170 has a compensating reverse built-in. */
4171 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4172 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4174 rtx vq_src = builder.build ();
4175 if (aarch64_simd_valid_immediate (vq_src, NULL))
4177 vq_src = force_reg (vq_mode, vq_src);
4178 return aarch64_expand_sve_dupq (target, mode, vq_src);
4181 /* Get an integer representation of the repeating part of Advanced
4182 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4183 which for big-endian targets is lane-swapped wrt a normal
4184 Advanced SIMD vector. This means that for both endiannesses,
4185 memory lane N of SVE vector SRC corresponds to architectural
4186 lane N of a register holding VQ_SRC. This in turn means that
4187 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4188 as a single 128-bit value) and thus that memory lane 0 of SRC is
4189 in the lsb of the integer. Duplicating the integer therefore
4190 ensures that memory lane N of SRC goes into architectural lane
4191 N + I * INDEX of the SVE register. */
4192 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4193 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4194 if (elt_value)
4196 /* Pretend that we had a vector of INT_MODE to start with. */
4197 elt_mode = int_mode;
4198 mode = aarch64_full_sve_mode (int_mode).require ();
4200 /* If the integer can be moved into a general register by a
4201 single instruction, do that and duplicate the result. */
4202 if (CONST_INT_P (elt_value)
4203 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4205 elt_value = force_reg (elt_mode, elt_value);
4206 return expand_vector_broadcast (mode, elt_value);
4209 else if (npatterns == 1)
4210 /* We're duplicating a single value, but can't do better than
4211 force it to memory and load from there. This handles things
4212 like symbolic constants. */
4213 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4215 if (elt_value)
4217 /* Load the element from memory if we can, otherwise move it into
4218 a register and use a DUP. */
4219 rtx op = force_const_mem (elt_mode, elt_value);
4220 if (!op)
4221 op = force_reg (elt_mode, elt_value);
4222 return expand_vector_broadcast (mode, op);
4226 /* Try using INDEX. */
4227 rtx base, step;
4228 if (const_vec_series_p (src, &base, &step))
4230 aarch64_expand_vec_series (target, base, step);
4231 return target;
4234 /* From here on, it's better to force the whole constant to memory
4235 if we can. */
4236 if (GET_MODE_NUNITS (mode).is_constant ())
4237 return NULL_RTX;
4239 /* Expand each pattern individually. */
4240 gcc_assert (npatterns > 1);
4241 rtx_vector_builder builder;
4242 auto_vec<rtx, 16> vectors (npatterns);
4243 for (unsigned int i = 0; i < npatterns; ++i)
4245 builder.new_vector (mode, 1, nelts_per_pattern);
4246 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4247 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4248 vectors.quick_push (force_reg (mode, builder.build ()));
4251 /* Use permutes to interleave the separate vectors. */
4252 while (npatterns > 1)
4254 npatterns /= 2;
4255 for (unsigned int i = 0; i < npatterns; ++i)
4257 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4258 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4259 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4260 vectors[i] = tmp;
4263 gcc_assert (vectors[0] == target);
4264 return target;
4267 /* Use WHILE to set a predicate register of mode MODE in which the first
4268 VL bits are set and the rest are clear. Use TARGET for the register
4269 if it's nonnull and convenient. */
4271 static rtx
4272 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4273 unsigned int vl)
4275 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4276 target = aarch64_target_reg (target, mode);
4277 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4278 target, const0_rtx, limit));
4279 return target;
4282 static rtx
4283 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4285 /* BUILDER is a constant predicate in which the index of every set bit
4286 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4287 by inverting every element at a multiple of ELT_SIZE and EORing the
4288 result with an ELT_SIZE PTRUE.
4290 Return a register that contains the constant on success, otherwise
4291 return null. Use TARGET as the register if it is nonnull and
4292 convenient. */
4294 static rtx
4295 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4296 unsigned int elt_size)
4298 /* Invert every element at a multiple of ELT_SIZE, keeping the
4299 other bits zero. */
4300 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4301 builder.nelts_per_pattern ());
4302 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4303 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4304 inv_builder.quick_push (const1_rtx);
4305 else
4306 inv_builder.quick_push (const0_rtx);
4307 inv_builder.finalize ();
4309 /* See if we can load the constant cheaply. */
4310 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4311 if (!inv)
4312 return NULL_RTX;
4314 /* EOR the result with an ELT_SIZE PTRUE. */
4315 rtx mask = aarch64_ptrue_all (elt_size);
4316 mask = force_reg (VNx16BImode, mask);
4317 target = aarch64_target_reg (target, VNx16BImode);
4318 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4319 return target;
4322 /* BUILDER is a constant predicate in which the index of every set bit
4323 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4324 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4325 register on success, otherwise return null. Use TARGET as the register
4326 if nonnull and convenient. */
4328 static rtx
4329 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4330 unsigned int elt_size,
4331 unsigned int permute_size)
4333 /* We're going to split the constant into two new constants A and B,
4334 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4335 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4337 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4338 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4340 where _ indicates elements that will be discarded by the permute.
4342 First calculate the ELT_SIZEs for A and B. */
4343 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4344 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4345 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4346 if (INTVAL (builder.elt (i)) != 0)
4348 if (i & permute_size)
4349 b_elt_size |= i - permute_size;
4350 else
4351 a_elt_size |= i;
4353 a_elt_size &= -a_elt_size;
4354 b_elt_size &= -b_elt_size;
4356 /* Now construct the vectors themselves. */
4357 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4358 builder.nelts_per_pattern ());
4359 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4360 builder.nelts_per_pattern ());
4361 unsigned int nelts = builder.encoded_nelts ();
4362 for (unsigned int i = 0; i < nelts; ++i)
4363 if (i & (elt_size - 1))
4365 a_builder.quick_push (const0_rtx);
4366 b_builder.quick_push (const0_rtx);
4368 else if ((i & permute_size) == 0)
4370 /* The A and B elements are significant. */
4371 a_builder.quick_push (builder.elt (i));
4372 b_builder.quick_push (builder.elt (i + permute_size));
4374 else
4376 /* The A and B elements are going to be discarded, so pick whatever
4377 is likely to give a nice constant. We are targeting element
4378 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4379 with the aim of each being a sequence of ones followed by
4380 a sequence of zeros. So:
4382 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4383 duplicate the last X_ELT_SIZE element, to extend the
4384 current sequence of ones or zeros.
4386 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4387 zero, so that the constant really does have X_ELT_SIZE and
4388 not a smaller size. */
4389 if (a_elt_size > permute_size)
4390 a_builder.quick_push (const0_rtx);
4391 else
4392 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4393 if (b_elt_size > permute_size)
4394 b_builder.quick_push (const0_rtx);
4395 else
4396 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4398 a_builder.finalize ();
4399 b_builder.finalize ();
4401 /* Try loading A into a register. */
4402 rtx_insn *last = get_last_insn ();
4403 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4404 if (!a)
4405 return NULL_RTX;
4407 /* Try loading B into a register. */
4408 rtx b = a;
4409 if (a_builder != b_builder)
4411 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4412 if (!b)
4414 delete_insns_since (last);
4415 return NULL_RTX;
4419 /* Emit the TRN1 itself. */
4420 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4421 target = aarch64_target_reg (target, mode);
4422 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4423 gen_lowpart (mode, a),
4424 gen_lowpart (mode, b)));
4425 return target;
4428 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4429 constant in BUILDER into an SVE predicate register. Return the register
4430 on success, otherwise return null. Use TARGET for the register if
4431 nonnull and convenient.
4433 ALLOW_RECURSE_P is true if we can use methods that would call this
4434 function recursively. */
4436 static rtx
4437 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4438 bool allow_recurse_p)
4440 if (builder.encoded_nelts () == 1)
4441 /* A PFALSE or a PTRUE .B ALL. */
4442 return aarch64_emit_set_immediate (target, builder);
4444 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4445 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4447 /* If we can load the constant using PTRUE, use it as-is. */
4448 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4449 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4450 return aarch64_emit_set_immediate (target, builder);
4452 /* Otherwise use WHILE to set the first VL bits. */
4453 return aarch64_sve_move_pred_via_while (target, mode, vl);
4456 if (!allow_recurse_p)
4457 return NULL_RTX;
4459 /* Try inverting the vector in element size ELT_SIZE and then EORing
4460 the result with an ELT_SIZE PTRUE. */
4461 if (INTVAL (builder.elt (0)) == 0)
4462 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4463 elt_size))
4464 return res;
4466 /* Try using TRN1 to permute two simpler constants. */
4467 for (unsigned int i = elt_size; i <= 8; i *= 2)
4468 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4469 elt_size, i))
4470 return res;
4472 return NULL_RTX;
4475 /* Return an SVE predicate register that contains the VNx16BImode
4476 constant in BUILDER, without going through the move expanders.
4478 The returned register can have whatever mode seems most natural
4479 given the contents of BUILDER. Use TARGET for the result if
4480 convenient. */
4482 static rtx
4483 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4485 /* Try loading the constant using pure predicate operations. */
4486 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4487 return res;
4489 /* Try forcing the constant to memory. */
4490 if (builder.full_nelts ().is_constant ())
4491 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4493 target = aarch64_target_reg (target, VNx16BImode);
4494 emit_move_insn (target, mem);
4495 return target;
4498 /* The last resort is to load the constant as an integer and then
4499 compare it against zero. Use -1 for set bits in order to increase
4500 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
4501 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4502 builder.nelts_per_pattern ());
4503 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4504 int_builder.quick_push (INTVAL (builder.elt (i))
4505 ? constm1_rtx : const0_rtx);
4506 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4507 int_builder.build ());
4510 /* Set DEST to immediate IMM. */
4512 void
4513 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4515 machine_mode mode = GET_MODE (dest);
4517 /* Check on what type of symbol it is. */
4518 scalar_int_mode int_mode;
4519 if ((GET_CODE (imm) == SYMBOL_REF
4520 || GET_CODE (imm) == LABEL_REF
4521 || GET_CODE (imm) == CONST
4522 || GET_CODE (imm) == CONST_POLY_INT)
4523 && is_a <scalar_int_mode> (mode, &int_mode))
4525 rtx mem;
4526 poly_int64 offset;
4527 HOST_WIDE_INT const_offset;
4528 enum aarch64_symbol_type sty;
4530 /* If we have (const (plus symbol offset)), separate out the offset
4531 before we start classifying the symbol. */
4532 rtx base = strip_offset (imm, &offset);
4534 /* We must always add an offset involving VL separately, rather than
4535 folding it into the relocation. */
4536 if (!offset.is_constant (&const_offset))
4538 if (!TARGET_SVE)
4540 aarch64_report_sve_required ();
4541 return;
4543 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4544 emit_insn (gen_rtx_SET (dest, imm));
4545 else
4547 /* Do arithmetic on 32-bit values if the result is smaller
4548 than that. */
4549 if (partial_subreg_p (int_mode, SImode))
4551 /* It is invalid to do symbol calculations in modes
4552 narrower than SImode. */
4553 gcc_assert (base == const0_rtx);
4554 dest = gen_lowpart (SImode, dest);
4555 int_mode = SImode;
4557 if (base != const0_rtx)
4559 base = aarch64_force_temporary (int_mode, dest, base);
4560 aarch64_add_offset (int_mode, dest, base, offset,
4561 NULL_RTX, NULL_RTX, false);
4563 else
4564 aarch64_add_offset (int_mode, dest, base, offset,
4565 dest, NULL_RTX, false);
4567 return;
4570 sty = aarch64_classify_symbol (base, const_offset);
4571 switch (sty)
4573 case SYMBOL_FORCE_TO_MEM:
4574 if (const_offset != 0
4575 && targetm.cannot_force_const_mem (int_mode, imm))
4577 gcc_assert (can_create_pseudo_p ());
4578 base = aarch64_force_temporary (int_mode, dest, base);
4579 aarch64_add_offset (int_mode, dest, base, const_offset,
4580 NULL_RTX, NULL_RTX, false);
4581 return;
4584 mem = force_const_mem (ptr_mode, imm);
4585 gcc_assert (mem);
4587 /* If we aren't generating PC relative literals, then
4588 we need to expand the literal pool access carefully.
4589 This is something that needs to be done in a number
4590 of places, so could well live as a separate function. */
4591 if (!aarch64_pcrelative_literal_loads)
4593 gcc_assert (can_create_pseudo_p ());
4594 base = gen_reg_rtx (ptr_mode);
4595 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4596 if (ptr_mode != Pmode)
4597 base = convert_memory_address (Pmode, base);
4598 mem = gen_rtx_MEM (ptr_mode, base);
4601 if (int_mode != ptr_mode)
4602 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4604 emit_insn (gen_rtx_SET (dest, mem));
4606 return;
4608 case SYMBOL_SMALL_TLSGD:
4609 case SYMBOL_SMALL_TLSDESC:
4610 case SYMBOL_SMALL_TLSIE:
4611 case SYMBOL_SMALL_GOT_28K:
4612 case SYMBOL_SMALL_GOT_4G:
4613 case SYMBOL_TINY_GOT:
4614 case SYMBOL_TINY_TLSIE:
4615 if (const_offset != 0)
4617 gcc_assert(can_create_pseudo_p ());
4618 base = aarch64_force_temporary (int_mode, dest, base);
4619 aarch64_add_offset (int_mode, dest, base, const_offset,
4620 NULL_RTX, NULL_RTX, false);
4621 return;
4623 /* FALLTHRU */
4625 case SYMBOL_SMALL_ABSOLUTE:
4626 case SYMBOL_TINY_ABSOLUTE:
4627 case SYMBOL_TLSLE12:
4628 case SYMBOL_TLSLE24:
4629 case SYMBOL_TLSLE32:
4630 case SYMBOL_TLSLE48:
4631 aarch64_load_symref_appropriately (dest, imm, sty);
4632 return;
4634 default:
4635 gcc_unreachable ();
4639 if (!CONST_INT_P (imm))
4641 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4643 /* Only the low bit of each .H, .S and .D element is defined,
4644 so we can set the upper bits to whatever we like. If the
4645 predicate is all-true in MODE, prefer to set all the undefined
4646 bits as well, so that we can share a single .B predicate for
4647 all modes. */
4648 if (imm == CONSTM1_RTX (mode))
4649 imm = CONSTM1_RTX (VNx16BImode);
4651 /* All methods for constructing predicate modes wider than VNx16BI
4652 will set the upper bits of each element to zero. Expose this
4653 by moving such constants as a VNx16BI, so that all bits are
4654 significant and so that constants for different modes can be
4655 shared. The wider constant will still be available as a
4656 REG_EQUAL note. */
4657 rtx_vector_builder builder;
4658 if (aarch64_get_sve_pred_bits (builder, imm))
4660 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4661 if (dest != res)
4662 emit_move_insn (dest, gen_lowpart (mode, res));
4663 return;
4667 if (GET_CODE (imm) == HIGH
4668 || aarch64_simd_valid_immediate (imm, NULL))
4670 emit_insn (gen_rtx_SET (dest, imm));
4671 return;
4674 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4675 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4677 if (dest != res)
4678 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4679 return;
4682 rtx mem = force_const_mem (mode, imm);
4683 gcc_assert (mem);
4684 emit_move_insn (dest, mem);
4685 return;
4688 aarch64_internal_mov_immediate (dest, imm, true,
4689 as_a <scalar_int_mode> (mode));
4692 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4693 that is known to contain PTRUE. */
4695 void
4696 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4698 expand_operand ops[3];
4699 machine_mode mode = GET_MODE (dest);
4700 create_output_operand (&ops[0], dest, mode);
4701 create_input_operand (&ops[1], pred, GET_MODE(pred));
4702 create_input_operand (&ops[2], src, mode);
4703 temporary_volatile_ok v (true);
4704 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4707 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4708 operand is in memory. In this case we need to use the predicated LD1
4709 and ST1 instead of LDR and STR, both for correctness on big-endian
4710 targets and because LD1 and ST1 support a wider range of addressing modes.
4711 PRED_MODE is the mode of the predicate.
4713 See the comment at the head of aarch64-sve.md for details about the
4714 big-endian handling. */
4716 void
4717 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4719 machine_mode mode = GET_MODE (dest);
4720 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4721 if (!register_operand (src, mode)
4722 && !register_operand (dest, mode))
4724 rtx tmp = gen_reg_rtx (mode);
4725 if (MEM_P (src))
4726 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4727 else
4728 emit_move_insn (tmp, src);
4729 src = tmp;
4731 aarch64_emit_sve_pred_move (dest, ptrue, src);
4734 /* Called only on big-endian targets. See whether an SVE vector move
4735 from SRC to DEST is effectively a REV[BHW] instruction, because at
4736 least one operand is a subreg of an SVE vector that has wider or
4737 narrower elements. Return true and emit the instruction if so.
4739 For example:
4741 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4743 represents a VIEW_CONVERT between the following vectors, viewed
4744 in memory order:
4746 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4747 R1: { [0], [1], [2], [3], ... }
4749 The high part of lane X in R2 should therefore correspond to lane X*2
4750 of R1, but the register representations are:
4752 msb lsb
4753 R2: ...... [1].high [1].low [0].high [0].low
4754 R1: ...... [3] [2] [1] [0]
4756 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4757 We therefore need a reverse operation to swap the high and low values
4758 around.
4760 This is purely an optimization. Without it we would spill the
4761 subreg operand to the stack in one mode and reload it in the
4762 other mode, which has the same effect as the REV. */
4764 bool
4765 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4767 gcc_assert (BYTES_BIG_ENDIAN);
4768 if (GET_CODE (dest) == SUBREG)
4769 dest = SUBREG_REG (dest);
4770 if (GET_CODE (src) == SUBREG)
4771 src = SUBREG_REG (src);
4773 /* The optimization handles two single SVE REGs with different element
4774 sizes. */
4775 if (!REG_P (dest)
4776 || !REG_P (src)
4777 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4778 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4779 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4780 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4781 return false;
4783 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4784 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4785 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4786 UNSPEC_REV_SUBREG);
4787 emit_insn (gen_rtx_SET (dest, unspec));
4788 return true;
4791 /* Return a copy of X with mode MODE, without changing its other
4792 attributes. Unlike gen_lowpart, this doesn't care whether the
4793 mode change is valid. */
4796 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4798 if (GET_MODE (x) == mode)
4799 return x;
4801 x = shallow_copy_rtx (x);
4802 set_mode_and_regno (x, mode, REGNO (x));
4803 return x;
4806 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
4807 stored in wider integer containers. */
4809 static unsigned int
4810 aarch64_sve_rev_unspec (machine_mode mode)
4812 switch (GET_MODE_UNIT_SIZE (mode))
4814 case 1: return UNSPEC_REVB;
4815 case 2: return UNSPEC_REVH;
4816 case 4: return UNSPEC_REVW;
4818 gcc_unreachable ();
4821 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4822 operands. */
4824 void
4825 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4827 /* Decide which REV operation we need. The mode with wider elements
4828 determines the mode of the operands and the mode with the narrower
4829 elements determines the reverse width. */
4830 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
4831 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
4832 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4833 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4834 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4836 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4837 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
4839 /* Get the operands in the appropriate modes and emit the instruction. */
4840 ptrue = gen_lowpart (pred_mode, ptrue);
4841 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4842 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4843 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4844 dest, ptrue, src));
4847 static bool
4848 aarch64_function_ok_for_sibcall (tree, tree exp)
4850 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4851 return false;
4853 return true;
4856 /* Implement TARGET_PASS_BY_REFERENCE. */
4858 static bool
4859 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4860 const function_arg_info &arg)
4862 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4863 HOST_WIDE_INT size;
4864 machine_mode dummymode;
4865 int nregs;
4867 unsigned int num_zr, num_pr;
4868 if (arg.type && aarch64_sve::builtin_type_p (arg.type, &num_zr, &num_pr))
4870 if (pcum && !pcum->silent_p && !TARGET_SVE)
4871 /* We can't gracefully recover at this point, so make this a
4872 fatal error. */
4873 fatal_error (input_location, "arguments of type %qT require"
4874 " the SVE ISA extension", arg.type);
4876 /* Variadic SVE types are passed by reference. Normal non-variadic
4877 arguments are too if we've run out of registers. */
4878 return (!arg.named
4879 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4880 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4883 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4884 if (arg.mode == BLKmode && arg.type)
4885 size = int_size_in_bytes (arg.type);
4886 else
4887 /* No frontends can create types with variable-sized modes, so we
4888 shouldn't be asked to pass or return them. */
4889 size = GET_MODE_SIZE (arg.mode).to_constant ();
4891 /* Aggregates are passed by reference based on their size. */
4892 if (arg.aggregate_type_p ())
4893 size = int_size_in_bytes (arg.type);
4895 /* Variable sized arguments are always returned by reference. */
4896 if (size < 0)
4897 return true;
4899 /* Can this be a candidate to be passed in fp/simd register(s)? */
4900 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4901 &dummymode, &nregs,
4902 NULL))
4903 return false;
4905 /* Arguments which are variable sized or larger than 2 registers are
4906 passed by reference unless they are a homogenous floating point
4907 aggregate. */
4908 return size > 2 * UNITS_PER_WORD;
4911 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4912 static bool
4913 aarch64_return_in_msb (const_tree valtype)
4915 machine_mode dummy_mode;
4916 int dummy_int;
4918 /* Never happens in little-endian mode. */
4919 if (!BYTES_BIG_ENDIAN)
4920 return false;
4922 /* Only composite types smaller than or equal to 16 bytes can
4923 be potentially returned in registers. */
4924 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4925 || int_size_in_bytes (valtype) <= 0
4926 || int_size_in_bytes (valtype) > 16)
4927 return false;
4929 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4930 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4931 is always passed/returned in the least significant bits of fp/simd
4932 register(s). */
4933 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4934 &dummy_mode, &dummy_int, NULL))
4935 return false;
4937 return true;
4940 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4941 after promotion, and after partial SVE types have been replaced by
4942 their integer equivalents. */
4943 static rtx
4944 aarch64_function_value_1 (const_tree type, machine_mode mode)
4946 unsigned int num_zr, num_pr;
4947 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
4949 /* Don't raise an error here if we're called when SVE is disabled,
4950 since this is really just a query function. Other code must
4951 do that where appropriate. */
4952 mode = TYPE_MODE_RAW (type);
4953 gcc_assert (VECTOR_MODE_P (mode)
4954 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4956 if (num_zr > 0 && num_pr == 0)
4957 return gen_rtx_REG (mode, V0_REGNUM);
4959 if (num_zr == 0 && num_pr == 1)
4960 return gen_rtx_REG (mode, P0_REGNUM);
4962 gcc_unreachable ();
4965 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4966 returned in memory, not by value. */
4967 gcc_assert (!aarch64_sve_mode_p (mode));
4969 if (aarch64_return_in_msb (type))
4971 HOST_WIDE_INT size = int_size_in_bytes (type);
4973 if (size % UNITS_PER_WORD != 0)
4975 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4976 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4980 int count;
4981 machine_mode ag_mode;
4982 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4983 &ag_mode, &count, NULL))
4985 if (!aarch64_composite_type_p (type, mode))
4987 gcc_assert (count == 1 && mode == ag_mode);
4988 return gen_rtx_REG (mode, V0_REGNUM);
4990 else
4992 int i;
4993 rtx par;
4995 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4996 for (i = 0; i < count; i++)
4998 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4999 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5000 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5001 XVECEXP (par, 0, i) = tmp;
5003 return par;
5006 else
5007 return gen_rtx_REG (mode, R0_REGNUM);
5010 /* Implement TARGET_FUNCTION_VALUE.
5011 Define how to find the value returned by a function. */
5013 static rtx
5014 aarch64_function_value (const_tree type, const_tree func,
5015 bool outgoing ATTRIBUTE_UNUSED)
5017 machine_mode mode;
5018 int unsignedp;
5020 mode = TYPE_MODE (type);
5021 if (INTEGRAL_TYPE_P (type))
5022 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5024 /* Vector types can acquire a partial SVE mode using things like
5025 __attribute__((vector_size(N))), and this is potentially useful.
5026 However, the choice of mode doesn't affect the type's ABI identity,
5027 so we should treat the types as though they had the associated
5028 integer mode, just like they did before SVE was introduced.
5030 We know that the vector must be 128 bits or smaller, otherwise we'd
5031 have returned it in memory instead. */
5032 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5033 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5035 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5036 rtx reg = aarch64_function_value_1 (type, int_mode);
5037 /* Vector types are never returned in the MSB and are never split. */
5038 gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
5039 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5040 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
5043 return aarch64_function_value_1 (type, mode);
5046 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5047 Return true if REGNO is the number of a hard register in which the values
5048 of called function may come back. */
5050 static bool
5051 aarch64_function_value_regno_p (const unsigned int regno)
5053 /* Maximum of 16 bytes can be returned in the general registers. Examples
5054 of 16-byte return values are: 128-bit integers and 16-byte small
5055 structures (excluding homogeneous floating-point aggregates). */
5056 if (regno == R0_REGNUM || regno == R1_REGNUM)
5057 return true;
5059 /* Up to four fp/simd registers can return a function value, e.g. a
5060 homogeneous floating-point aggregate having four members. */
5061 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5062 return TARGET_FLOAT;
5064 return false;
5067 /* Implement TARGET_RETURN_IN_MEMORY.
5069 If the type T of the result of a function is such that
5070 void func (T arg)
5071 would require that arg be passed as a value in a register (or set of
5072 registers) according to the parameter passing rules, then the result
5073 is returned in the same registers as would be used for such an
5074 argument. */
5076 static bool
5077 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5079 HOST_WIDE_INT size;
5080 machine_mode ag_mode;
5081 int count;
5083 if (!AGGREGATE_TYPE_P (type)
5084 && TREE_CODE (type) != COMPLEX_TYPE
5085 && TREE_CODE (type) != VECTOR_TYPE)
5086 /* Simple scalar types always returned in registers. */
5087 return false;
5089 unsigned int num_zr, num_pr;
5090 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5092 /* All SVE types we support fit in registers. For example, it isn't
5093 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5094 predicates. */
5095 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
5096 return false;
5099 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5100 type,
5101 &ag_mode,
5102 &count,
5103 NULL))
5104 return false;
5106 /* Types larger than 2 registers returned in memory. */
5107 size = int_size_in_bytes (type);
5108 return (size < 0 || size > 2 * UNITS_PER_WORD);
5111 static bool
5112 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5113 const_tree type, int *nregs)
5115 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5116 return aarch64_vfp_is_call_or_return_candidate (mode,
5117 type,
5118 &pcum->aapcs_vfp_rmode,
5119 nregs,
5120 NULL);
5123 /* Given MODE and TYPE of a function argument, return the alignment in
5124 bits. The idea is to suppress any stronger alignment requested by
5125 the user and opt for the natural alignment (specified in AAPCS64 \S
5126 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5127 calculated in versions of GCC prior to GCC-9. This is a helper
5128 function for local use only. */
5130 static unsigned int
5131 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5132 bool *abi_break)
5134 *abi_break = false;
5135 if (!type)
5136 return GET_MODE_ALIGNMENT (mode);
5138 if (integer_zerop (TYPE_SIZE (type)))
5139 return 0;
5141 gcc_assert (TYPE_MODE (type) == mode);
5143 if (!AGGREGATE_TYPE_P (type))
5144 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5146 if (TREE_CODE (type) == ARRAY_TYPE)
5147 return TYPE_ALIGN (TREE_TYPE (type));
5149 unsigned int alignment = 0;
5150 unsigned int bitfield_alignment = 0;
5151 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5152 if (TREE_CODE (field) == FIELD_DECL)
5154 alignment = std::max (alignment, DECL_ALIGN (field));
5155 if (DECL_BIT_FIELD_TYPE (field))
5156 bitfield_alignment
5157 = std::max (bitfield_alignment,
5158 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5161 if (bitfield_alignment > alignment)
5163 *abi_break = true;
5164 return bitfield_alignment;
5167 return alignment;
5170 /* Layout a function argument according to the AAPCS64 rules. The rule
5171 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5172 mode that was originally given to us by the target hook, whereas the
5173 mode in ARG might be the result of replacing partial SVE modes with
5174 the equivalent integer mode. */
5176 static void
5177 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
5178 machine_mode orig_mode)
5180 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5181 tree type = arg.type;
5182 machine_mode mode = arg.mode;
5183 int ncrn, nvrn, nregs;
5184 bool allocate_ncrn, allocate_nvrn;
5185 HOST_WIDE_INT size;
5186 bool abi_break;
5188 /* We need to do this once per argument. */
5189 if (pcum->aapcs_arg_processed)
5190 return;
5192 /* Vector types can acquire a partial SVE mode using things like
5193 __attribute__((vector_size(N))), and this is potentially useful.
5194 However, the choice of mode doesn't affect the type's ABI identity,
5195 so we should treat the types as though they had the associated
5196 integer mode, just like they did before SVE was introduced.
5198 We know that the vector must be 128 bits or smaller, otherwise we'd
5199 have passed it by reference instead. */
5200 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5201 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5203 function_arg_info tmp_arg = arg;
5204 tmp_arg.mode = int_mode_for_mode (mode).require ();
5205 aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
5206 if (rtx reg = pcum->aapcs_reg)
5208 gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
5209 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5210 pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5212 return;
5215 pcum->aapcs_arg_processed = true;
5217 unsigned int num_zr, num_pr;
5218 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5220 /* The PCS says that it is invalid to pass an SVE value to an
5221 unprototyped function. There is no ABI-defined location we
5222 can return in this case, so we have no real choice but to raise
5223 an error immediately, even though this is only a query function. */
5224 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5226 gcc_assert (!pcum->silent_p);
5227 error ("SVE type %qT cannot be passed to an unprototyped function",
5228 arg.type);
5229 /* Avoid repeating the message, and avoid tripping the assert
5230 below. */
5231 pcum->pcs_variant = ARM_PCS_SVE;
5234 /* We would have converted the argument into pass-by-reference
5235 form if it didn't fit in registers. */
5236 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5237 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5238 gcc_assert (arg.named
5239 && pcum->pcs_variant == ARM_PCS_SVE
5240 && aarch64_sve_mode_p (mode)
5241 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5242 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5244 if (num_zr > 0 && num_pr == 0)
5245 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5246 else if (num_zr == 0 && num_pr == 1)
5247 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5248 else
5249 gcc_unreachable ();
5250 return;
5253 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5254 passed by reference, not by value. */
5255 gcc_assert (!aarch64_sve_mode_p (mode));
5257 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5258 if (type)
5259 size = int_size_in_bytes (type);
5260 else
5261 /* No frontends can create types with variable-sized modes, so we
5262 shouldn't be asked to pass or return them. */
5263 size = GET_MODE_SIZE (mode).to_constant ();
5264 size = ROUND_UP (size, UNITS_PER_WORD);
5266 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5267 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5268 mode,
5269 type,
5270 &nregs);
5272 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5273 The following code thus handles passing by SIMD/FP registers first. */
5275 nvrn = pcum->aapcs_nvrn;
5277 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
5278 and homogenous short-vector aggregates (HVA). */
5279 if (allocate_nvrn)
5281 if (!pcum->silent_p && !TARGET_FLOAT)
5282 aarch64_err_no_fpadvsimd (mode);
5284 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5286 pcum->aapcs_nextnvrn = nvrn + nregs;
5287 if (!aarch64_composite_type_p (type, mode))
5289 gcc_assert (nregs == 1);
5290 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5292 else
5294 rtx par;
5295 int i;
5296 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5297 for (i = 0; i < nregs; i++)
5299 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5300 V0_REGNUM + nvrn + i);
5301 rtx offset = gen_int_mode
5302 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5303 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5304 XVECEXP (par, 0, i) = tmp;
5306 pcum->aapcs_reg = par;
5308 return;
5310 else
5312 /* C.3 NSRN is set to 8. */
5313 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5314 goto on_stack;
5318 ncrn = pcum->aapcs_ncrn;
5319 nregs = size / UNITS_PER_WORD;
5321 /* C6 - C9. though the sign and zero extension semantics are
5322 handled elsewhere. This is the case where the argument fits
5323 entirely general registers. */
5324 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5326 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5328 /* C.8 if the argument has an alignment of 16 then the NGRN is
5329 rounded up to the next even number. */
5330 if (nregs == 2
5331 && ncrn % 2
5332 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5333 comparison is there because for > 16 * BITS_PER_UNIT
5334 alignment nregs should be > 2 and therefore it should be
5335 passed by reference rather than value. */
5336 && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5337 == 16 * BITS_PER_UNIT))
5339 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5340 inform (input_location, "parameter passing for argument of type "
5341 "%qT changed in GCC 9.1", type);
5342 ++ncrn;
5343 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5346 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5347 A reg is still generated for it, but the caller should be smart
5348 enough not to use it. */
5349 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5350 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5351 else
5353 rtx par;
5354 int i;
5356 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5357 for (i = 0; i < nregs; i++)
5359 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5360 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5361 GEN_INT (i * UNITS_PER_WORD));
5362 XVECEXP (par, 0, i) = tmp;
5364 pcum->aapcs_reg = par;
5367 pcum->aapcs_nextncrn = ncrn + nregs;
5368 return;
5371 /* C.11 */
5372 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5374 /* The argument is passed on stack; record the needed number of words for
5375 this argument and align the total size if necessary. */
5376 on_stack:
5377 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5379 if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5380 == 16 * BITS_PER_UNIT)
5382 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5383 if (pcum->aapcs_stack_size != new_size)
5385 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5386 inform (input_location, "parameter passing for argument of type "
5387 "%qT changed in GCC 9.1", type);
5388 pcum->aapcs_stack_size = new_size;
5391 return;
5394 /* Implement TARGET_FUNCTION_ARG. */
5396 static rtx
5397 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5399 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5400 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5401 || pcum->pcs_variant == ARM_PCS_SIMD
5402 || pcum->pcs_variant == ARM_PCS_SVE);
5404 if (arg.end_marker_p ())
5405 return gen_int_mode (pcum->pcs_variant, DImode);
5407 aarch64_layout_arg (pcum_v, arg, arg.mode);
5408 return pcum->aapcs_reg;
5411 void
5412 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5413 const_tree fntype,
5414 rtx libname ATTRIBUTE_UNUSED,
5415 const_tree fndecl ATTRIBUTE_UNUSED,
5416 unsigned n_named ATTRIBUTE_UNUSED,
5417 bool silent_p)
5419 pcum->aapcs_ncrn = 0;
5420 pcum->aapcs_nvrn = 0;
5421 pcum->aapcs_nprn = 0;
5422 pcum->aapcs_nextncrn = 0;
5423 pcum->aapcs_nextnvrn = 0;
5424 pcum->aapcs_nextnprn = 0;
5425 if (fntype)
5426 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5427 else
5428 pcum->pcs_variant = ARM_PCS_AAPCS64;
5429 pcum->aapcs_reg = NULL_RTX;
5430 pcum->aapcs_arg_processed = false;
5431 pcum->aapcs_stack_words = 0;
5432 pcum->aapcs_stack_size = 0;
5433 pcum->silent_p = silent_p;
5435 if (!silent_p
5436 && !TARGET_FLOAT
5437 && fndecl && TREE_PUBLIC (fndecl)
5438 && fntype && fntype != error_mark_node)
5440 const_tree type = TREE_TYPE (fntype);
5441 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5442 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5443 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5444 &mode, &nregs, NULL))
5445 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5448 if (!silent_p
5449 && !TARGET_SVE
5450 && pcum->pcs_variant == ARM_PCS_SVE)
5452 /* We can't gracefully recover at this point, so make this a
5453 fatal error. */
5454 if (fndecl)
5455 fatal_error (input_location, "%qE requires the SVE ISA extension",
5456 fndecl);
5457 else
5458 fatal_error (input_location, "calls to functions of type %qT require"
5459 " the SVE ISA extension", fntype);
5463 static void
5464 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5465 const function_arg_info &arg)
5467 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5468 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5469 || pcum->pcs_variant == ARM_PCS_SIMD
5470 || pcum->pcs_variant == ARM_PCS_SVE)
5472 aarch64_layout_arg (pcum_v, arg, arg.mode);
5473 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5474 != (pcum->aapcs_stack_words != 0));
5475 pcum->aapcs_arg_processed = false;
5476 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5477 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5478 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5479 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5480 pcum->aapcs_stack_words = 0;
5481 pcum->aapcs_reg = NULL_RTX;
5485 bool
5486 aarch64_function_arg_regno_p (unsigned regno)
5488 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5489 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5492 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5493 PARM_BOUNDARY bits of alignment, but will be given anything up
5494 to STACK_BOUNDARY bits if the type requires it. This makes sure
5495 that both before and after the layout of each argument, the Next
5496 Stacked Argument Address (NSAA) will have a minimum alignment of
5497 8 bytes. */
5499 static unsigned int
5500 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5502 bool abi_break;
5503 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5504 &abi_break);
5505 if (abi_break & warn_psabi)
5506 inform (input_location, "parameter passing for argument of type "
5507 "%qT changed in GCC 9.1", type);
5509 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5512 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5514 static fixed_size_mode
5515 aarch64_get_reg_raw_mode (int regno)
5517 if (TARGET_SVE && FP_REGNUM_P (regno))
5518 /* Don't use the SVE part of the register for __builtin_apply and
5519 __builtin_return. The SVE registers aren't used by the normal PCS,
5520 so using them there would be a waste of time. The PCS extensions
5521 for SVE types are fundamentally incompatible with the
5522 __builtin_return/__builtin_apply interface. */
5523 return as_a <fixed_size_mode> (V16QImode);
5524 return default_get_reg_raw_mode (regno);
5527 /* Implement TARGET_FUNCTION_ARG_PADDING.
5529 Small aggregate types are placed in the lowest memory address.
5531 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5533 static pad_direction
5534 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5536 /* On little-endian targets, the least significant byte of every stack
5537 argument is passed at the lowest byte address of the stack slot. */
5538 if (!BYTES_BIG_ENDIAN)
5539 return PAD_UPWARD;
5541 /* Otherwise, integral, floating-point and pointer types are padded downward:
5542 the least significant byte of a stack argument is passed at the highest
5543 byte address of the stack slot. */
5544 if (type
5545 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5546 || POINTER_TYPE_P (type))
5547 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5548 return PAD_DOWNWARD;
5550 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5551 return PAD_UPWARD;
5554 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5556 It specifies padding for the last (may also be the only)
5557 element of a block move between registers and memory. If
5558 assuming the block is in the memory, padding upward means that
5559 the last element is padded after its highest significant byte,
5560 while in downward padding, the last element is padded at the
5561 its least significant byte side.
5563 Small aggregates and small complex types are always padded
5564 upwards.
5566 We don't need to worry about homogeneous floating-point or
5567 short-vector aggregates; their move is not affected by the
5568 padding direction determined here. Regardless of endianness,
5569 each element of such an aggregate is put in the least
5570 significant bits of a fp/simd register.
5572 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5573 register has useful data, and return the opposite if the most
5574 significant byte does. */
5576 bool
5577 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5578 bool first ATTRIBUTE_UNUSED)
5581 /* Small composite types are always padded upward. */
5582 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5584 HOST_WIDE_INT size;
5585 if (type)
5586 size = int_size_in_bytes (type);
5587 else
5588 /* No frontends can create types with variable-sized modes, so we
5589 shouldn't be asked to pass or return them. */
5590 size = GET_MODE_SIZE (mode).to_constant ();
5591 if (size < 2 * UNITS_PER_WORD)
5592 return true;
5595 /* Otherwise, use the default padding. */
5596 return !BYTES_BIG_ENDIAN;
5599 static scalar_int_mode
5600 aarch64_libgcc_cmp_return_mode (void)
5602 return SImode;
5605 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5607 /* We use the 12-bit shifted immediate arithmetic instructions so values
5608 must be multiple of (1 << 12), i.e. 4096. */
5609 #define ARITH_FACTOR 4096
5611 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5612 #error Cannot use simple address calculation for stack probing
5613 #endif
5615 /* The pair of scratch registers used for stack probing. */
5616 #define PROBE_STACK_FIRST_REG R9_REGNUM
5617 #define PROBE_STACK_SECOND_REG R10_REGNUM
5619 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5620 inclusive. These are offsets from the current stack pointer. */
5622 static void
5623 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5625 HOST_WIDE_INT size;
5626 if (!poly_size.is_constant (&size))
5628 sorry ("stack probes for SVE frames");
5629 return;
5632 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5634 /* See the same assertion on PROBE_INTERVAL above. */
5635 gcc_assert ((first % ARITH_FACTOR) == 0);
5637 /* See if we have a constant small number of probes to generate. If so,
5638 that's the easy case. */
5639 if (size <= PROBE_INTERVAL)
5641 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5643 emit_set_insn (reg1,
5644 plus_constant (Pmode,
5645 stack_pointer_rtx, -(first + base)));
5646 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5649 /* The run-time loop is made up of 8 insns in the generic case while the
5650 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5651 else if (size <= 4 * PROBE_INTERVAL)
5653 HOST_WIDE_INT i, rem;
5655 emit_set_insn (reg1,
5656 plus_constant (Pmode,
5657 stack_pointer_rtx,
5658 -(first + PROBE_INTERVAL)));
5659 emit_stack_probe (reg1);
5661 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5662 it exceeds SIZE. If only two probes are needed, this will not
5663 generate any code. Then probe at FIRST + SIZE. */
5664 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5666 emit_set_insn (reg1,
5667 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5668 emit_stack_probe (reg1);
5671 rem = size - (i - PROBE_INTERVAL);
5672 if (rem > 256)
5674 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5676 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5677 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5679 else
5680 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5683 /* Otherwise, do the same as above, but in a loop. Note that we must be
5684 extra careful with variables wrapping around because we might be at
5685 the very top (or the very bottom) of the address space and we have
5686 to be able to handle this case properly; in particular, we use an
5687 equality test for the loop condition. */
5688 else
5690 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5692 /* Step 1: round SIZE to the previous multiple of the interval. */
5694 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5697 /* Step 2: compute initial and final value of the loop counter. */
5699 /* TEST_ADDR = SP + FIRST. */
5700 emit_set_insn (reg1,
5701 plus_constant (Pmode, stack_pointer_rtx, -first));
5703 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5704 HOST_WIDE_INT adjustment = - (first + rounded_size);
5705 if (! aarch64_uimm12_shift (adjustment))
5707 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5708 true, Pmode);
5709 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5711 else
5712 emit_set_insn (reg2,
5713 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5715 /* Step 3: the loop
5719 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5720 probe at TEST_ADDR
5722 while (TEST_ADDR != LAST_ADDR)
5724 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5725 until it is equal to ROUNDED_SIZE. */
5727 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5730 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5731 that SIZE is equal to ROUNDED_SIZE. */
5733 if (size != rounded_size)
5735 HOST_WIDE_INT rem = size - rounded_size;
5737 if (rem > 256)
5739 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5741 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5742 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5744 else
5745 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5749 /* Make sure nothing is scheduled before we are done. */
5750 emit_insn (gen_blockage ());
5753 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5754 absolute addresses. */
5756 const char *
5757 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5759 static int labelno = 0;
5760 char loop_lab[32];
5761 rtx xops[2];
5763 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5765 /* Loop. */
5766 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5768 HOST_WIDE_INT stack_clash_probe_interval
5769 = 1 << param_stack_clash_protection_guard_size;
5771 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5772 xops[0] = reg1;
5773 HOST_WIDE_INT interval;
5774 if (flag_stack_clash_protection)
5775 interval = stack_clash_probe_interval;
5776 else
5777 interval = PROBE_INTERVAL;
5779 gcc_assert (aarch64_uimm12_shift (interval));
5780 xops[1] = GEN_INT (interval);
5782 output_asm_insn ("sub\t%0, %0, %1", xops);
5784 /* If doing stack clash protection then we probe up by the ABI specified
5785 amount. We do this because we're dropping full pages at a time in the
5786 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5787 if (flag_stack_clash_protection)
5788 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5789 else
5790 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5792 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5793 by this amount for each iteration. */
5794 output_asm_insn ("str\txzr, [%0, %1]", xops);
5796 /* Test if TEST_ADDR == LAST_ADDR. */
5797 xops[1] = reg2;
5798 output_asm_insn ("cmp\t%0, %1", xops);
5800 /* Branch. */
5801 fputs ("\tb.ne\t", asm_out_file);
5802 assemble_name_raw (asm_out_file, loop_lab);
5803 fputc ('\n', asm_out_file);
5805 return "";
5808 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5809 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5810 of GUARD_SIZE. When a probe is emitted it is done at most
5811 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5812 at most MIN_PROBE_THRESHOLD. By the end of this function
5813 BASE = BASE - ADJUSTMENT. */
5815 const char *
5816 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5817 rtx min_probe_threshold, rtx guard_size)
5819 /* This function is not allowed to use any instruction generation function
5820 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5821 so instead emit the code you want using output_asm_insn. */
5822 gcc_assert (flag_stack_clash_protection);
5823 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5824 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5826 /* The minimum required allocation before the residual requires probing. */
5827 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5829 /* Clamp the value down to the nearest value that can be used with a cmp. */
5830 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5831 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5833 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5834 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5836 static int labelno = 0;
5837 char loop_start_lab[32];
5838 char loop_end_lab[32];
5839 rtx xops[2];
5841 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5842 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5844 /* Emit loop start label. */
5845 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5847 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5848 xops[0] = adjustment;
5849 xops[1] = probe_offset_value_rtx;
5850 output_asm_insn ("cmp\t%0, %1", xops);
5852 /* Branch to end if not enough adjustment to probe. */
5853 fputs ("\tb.lt\t", asm_out_file);
5854 assemble_name_raw (asm_out_file, loop_end_lab);
5855 fputc ('\n', asm_out_file);
5857 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5858 xops[0] = base;
5859 xops[1] = probe_offset_value_rtx;
5860 output_asm_insn ("sub\t%0, %0, %1", xops);
5862 /* Probe at BASE. */
5863 xops[1] = const0_rtx;
5864 output_asm_insn ("str\txzr, [%0, %1]", xops);
5866 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5867 xops[0] = adjustment;
5868 xops[1] = probe_offset_value_rtx;
5869 output_asm_insn ("sub\t%0, %0, %1", xops);
5871 /* Branch to start if still more bytes to allocate. */
5872 fputs ("\tb\t", asm_out_file);
5873 assemble_name_raw (asm_out_file, loop_start_lab);
5874 fputc ('\n', asm_out_file);
5876 /* No probe leave. */
5877 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5879 /* BASE = BASE - ADJUSTMENT. */
5880 xops[0] = base;
5881 xops[1] = adjustment;
5882 output_asm_insn ("sub\t%0, %0, %1", xops);
5883 return "";
5886 /* Determine whether a frame chain needs to be generated. */
5887 static bool
5888 aarch64_needs_frame_chain (void)
5890 /* Force a frame chain for EH returns so the return address is at FP+8. */
5891 if (frame_pointer_needed || crtl->calls_eh_return)
5892 return true;
5894 /* A leaf function cannot have calls or write LR. */
5895 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5897 /* Don't use a frame chain in leaf functions if leaf frame pointers
5898 are disabled. */
5899 if (flag_omit_leaf_frame_pointer && is_leaf)
5900 return false;
5902 return aarch64_use_frame_pointer;
5905 /* Mark the registers that need to be saved by the callee and calculate
5906 the size of the callee-saved registers area and frame record (both FP
5907 and LR may be omitted). */
5908 static void
5909 aarch64_layout_frame (void)
5911 poly_int64 offset = 0;
5912 int regno, last_fp_reg = INVALID_REGNUM;
5913 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5914 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5915 bool frame_related_fp_reg_p = false;
5916 aarch64_frame &frame = cfun->machine->frame;
5918 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5920 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5921 the mid-end is doing. */
5922 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5924 #define SLOT_NOT_REQUIRED (-2)
5925 #define SLOT_REQUIRED (-1)
5927 frame.wb_candidate1 = INVALID_REGNUM;
5928 frame.wb_candidate2 = INVALID_REGNUM;
5929 frame.spare_pred_reg = INVALID_REGNUM;
5931 /* First mark all the registers that really need to be saved... */
5932 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5933 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5935 /* ... that includes the eh data registers (if needed)... */
5936 if (crtl->calls_eh_return)
5937 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5938 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5940 /* ... and any callee saved register that dataflow says is live. */
5941 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5942 if (df_regs_ever_live_p (regno)
5943 && !fixed_regs[regno]
5944 && (regno == R30_REGNUM
5945 || !crtl->abi->clobbers_full_reg_p (regno)))
5946 frame.reg_offset[regno] = SLOT_REQUIRED;
5948 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5949 if (df_regs_ever_live_p (regno)
5950 && !fixed_regs[regno]
5951 && !crtl->abi->clobbers_full_reg_p (regno))
5953 frame.reg_offset[regno] = SLOT_REQUIRED;
5954 last_fp_reg = regno;
5955 if (aarch64_emit_cfi_for_reg_p (regno))
5956 frame_related_fp_reg_p = true;
5959 /* Big-endian SVE frames need a spare predicate register in order
5960 to save Z8-Z15. Decide which register they should use. Prefer
5961 an unused argument register if possible, so that we don't force P4
5962 to be saved unnecessarily. */
5963 if (frame_related_fp_reg_p
5964 && crtl->abi->id () == ARM_PCS_SVE
5965 && BYTES_BIG_ENDIAN)
5967 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5968 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5969 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5970 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5971 break;
5972 gcc_assert (regno <= P7_REGNUM);
5973 frame.spare_pred_reg = regno;
5974 df_set_regs_ever_live (regno, true);
5977 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5978 if (df_regs_ever_live_p (regno)
5979 && !fixed_regs[regno]
5980 && !crtl->abi->clobbers_full_reg_p (regno))
5981 frame.reg_offset[regno] = SLOT_REQUIRED;
5983 /* With stack-clash, LR must be saved in non-leaf functions. */
5984 gcc_assert (crtl->is_leaf
5985 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
5987 /* Now assign stack slots for the registers. Start with the predicate
5988 registers, since predicate LDR and STR have a relatively small
5989 offset range. These saves happen below the hard frame pointer. */
5990 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5991 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5993 frame.reg_offset[regno] = offset;
5994 offset += BYTES_PER_SVE_PRED;
5997 /* We save a maximum of 8 predicate registers, and since vector
5998 registers are 8 times the size of a predicate register, all the
5999 saved predicates fit within a single vector. Doing this also
6000 rounds the offset to a 128-bit boundary. */
6001 if (maybe_ne (offset, 0))
6003 gcc_assert (known_le (offset, vector_save_size));
6004 offset = vector_save_size;
6007 /* If we need to save any SVE vector registers, add them next. */
6008 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6009 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6010 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6012 frame.reg_offset[regno] = offset;
6013 offset += vector_save_size;
6016 /* OFFSET is now the offset of the hard frame pointer from the bottom
6017 of the callee save area. */
6018 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6019 frame.below_hard_fp_saved_regs_size = offset;
6020 if (frame.emit_frame_chain)
6022 /* FP and LR are placed in the linkage record. */
6023 frame.reg_offset[R29_REGNUM] = offset;
6024 frame.wb_candidate1 = R29_REGNUM;
6025 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6026 frame.wb_candidate2 = R30_REGNUM;
6027 offset += 2 * UNITS_PER_WORD;
6030 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6031 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6033 frame.reg_offset[regno] = offset;
6034 if (frame.wb_candidate1 == INVALID_REGNUM)
6035 frame.wb_candidate1 = regno;
6036 else if (frame.wb_candidate2 == INVALID_REGNUM)
6037 frame.wb_candidate2 = regno;
6038 offset += UNITS_PER_WORD;
6041 poly_int64 max_int_offset = offset;
6042 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6043 bool has_align_gap = maybe_ne (offset, max_int_offset);
6045 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6046 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6048 /* If there is an alignment gap between integer and fp callee-saves,
6049 allocate the last fp register to it if possible. */
6050 if (regno == last_fp_reg
6051 && has_align_gap
6052 && known_eq (vector_save_size, 8)
6053 && multiple_p (offset, 16))
6055 frame.reg_offset[regno] = max_int_offset;
6056 break;
6059 frame.reg_offset[regno] = offset;
6060 if (frame.wb_candidate1 == INVALID_REGNUM)
6061 frame.wb_candidate1 = regno;
6062 else if (frame.wb_candidate2 == INVALID_REGNUM
6063 && frame.wb_candidate1 >= V0_REGNUM)
6064 frame.wb_candidate2 = regno;
6065 offset += vector_save_size;
6068 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6070 frame.saved_regs_size = offset;
6072 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6074 poly_int64 above_outgoing_args
6075 = aligned_upper_bound (varargs_and_saved_regs_size
6076 + get_frame_size (),
6077 STACK_BOUNDARY / BITS_PER_UNIT);
6079 frame.hard_fp_offset
6080 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6082 /* Both these values are already aligned. */
6083 gcc_assert (multiple_p (crtl->outgoing_args_size,
6084 STACK_BOUNDARY / BITS_PER_UNIT));
6085 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6087 frame.locals_offset = frame.saved_varargs_size;
6089 frame.initial_adjust = 0;
6090 frame.final_adjust = 0;
6091 frame.callee_adjust = 0;
6092 frame.sve_callee_adjust = 0;
6093 frame.callee_offset = 0;
6095 HOST_WIDE_INT max_push_offset = 0;
6096 if (frame.wb_candidate2 != INVALID_REGNUM)
6097 max_push_offset = 512;
6098 else if (frame.wb_candidate1 != INVALID_REGNUM)
6099 max_push_offset = 256;
6101 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6102 HOST_WIDE_INT const_saved_regs_size;
6103 if (frame.frame_size.is_constant (&const_size)
6104 && const_size < max_push_offset
6105 && known_eq (frame.hard_fp_offset, const_size))
6107 /* Simple, small frame with no outgoing arguments:
6109 stp reg1, reg2, [sp, -frame_size]!
6110 stp reg3, reg4, [sp, 16] */
6111 frame.callee_adjust = const_size;
6113 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6114 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6115 && const_outgoing_args_size + const_saved_regs_size < 512
6116 /* We could handle this case even with outgoing args, provided
6117 that the number of args left us with valid offsets for all
6118 predicate and vector save slots. It's such a rare case that
6119 it hardly seems worth the effort though. */
6120 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6121 && !(cfun->calls_alloca
6122 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6123 && const_fp_offset < max_push_offset))
6125 /* Frame with small outgoing arguments:
6127 sub sp, sp, frame_size
6128 stp reg1, reg2, [sp, outgoing_args_size]
6129 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6130 frame.initial_adjust = frame.frame_size;
6131 frame.callee_offset = const_outgoing_args_size;
6133 else if (saves_below_hard_fp_p
6134 && known_eq (frame.saved_regs_size,
6135 frame.below_hard_fp_saved_regs_size))
6137 /* Frame in which all saves are SVE saves:
6139 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6140 save SVE registers relative to SP
6141 sub sp, sp, outgoing_args_size */
6142 frame.initial_adjust = (frame.hard_fp_offset
6143 + frame.below_hard_fp_saved_regs_size);
6144 frame.final_adjust = crtl->outgoing_args_size;
6146 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6147 && const_fp_offset < max_push_offset)
6149 /* Frame with large outgoing arguments or SVE saves, but with
6150 a small local area:
6152 stp reg1, reg2, [sp, -hard_fp_offset]!
6153 stp reg3, reg4, [sp, 16]
6154 [sub sp, sp, below_hard_fp_saved_regs_size]
6155 [save SVE registers relative to SP]
6156 sub sp, sp, outgoing_args_size */
6157 frame.callee_adjust = const_fp_offset;
6158 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6159 frame.final_adjust = crtl->outgoing_args_size;
6161 else
6163 /* Frame with large local area and outgoing arguments or SVE saves,
6164 using frame pointer:
6166 sub sp, sp, hard_fp_offset
6167 stp x29, x30, [sp, 0]
6168 add x29, sp, 0
6169 stp reg3, reg4, [sp, 16]
6170 [sub sp, sp, below_hard_fp_saved_regs_size]
6171 [save SVE registers relative to SP]
6172 sub sp, sp, outgoing_args_size */
6173 frame.initial_adjust = frame.hard_fp_offset;
6174 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6175 frame.final_adjust = crtl->outgoing_args_size;
6178 /* Make sure the individual adjustments add up to the full frame size. */
6179 gcc_assert (known_eq (frame.initial_adjust
6180 + frame.callee_adjust
6181 + frame.sve_callee_adjust
6182 + frame.final_adjust, frame.frame_size));
6184 frame.laid_out = true;
6187 /* Return true if the register REGNO is saved on entry to
6188 the current function. */
6190 static bool
6191 aarch64_register_saved_on_entry (int regno)
6193 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6196 /* Return the next register up from REGNO up to LIMIT for the callee
6197 to save. */
6199 static unsigned
6200 aarch64_next_callee_save (unsigned regno, unsigned limit)
6202 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6203 regno ++;
6204 return regno;
6207 /* Push the register number REGNO of mode MODE to the stack with write-back
6208 adjusting the stack by ADJUSTMENT. */
6210 static void
6211 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6212 HOST_WIDE_INT adjustment)
6214 rtx base_rtx = stack_pointer_rtx;
6215 rtx insn, reg, mem;
6217 reg = gen_rtx_REG (mode, regno);
6218 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6219 plus_constant (Pmode, base_rtx, -adjustment));
6220 mem = gen_frame_mem (mode, mem);
6222 insn = emit_move_insn (mem, reg);
6223 RTX_FRAME_RELATED_P (insn) = 1;
6226 /* Generate and return an instruction to store the pair of registers
6227 REG and REG2 of mode MODE to location BASE with write-back adjusting
6228 the stack location BASE by ADJUSTMENT. */
6230 static rtx
6231 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6232 HOST_WIDE_INT adjustment)
6234 switch (mode)
6236 case E_DImode:
6237 return gen_storewb_pairdi_di (base, base, reg, reg2,
6238 GEN_INT (-adjustment),
6239 GEN_INT (UNITS_PER_WORD - adjustment));
6240 case E_DFmode:
6241 return gen_storewb_pairdf_di (base, base, reg, reg2,
6242 GEN_INT (-adjustment),
6243 GEN_INT (UNITS_PER_WORD - adjustment));
6244 case E_TFmode:
6245 return gen_storewb_pairtf_di (base, base, reg, reg2,
6246 GEN_INT (-adjustment),
6247 GEN_INT (UNITS_PER_VREG - adjustment));
6248 default:
6249 gcc_unreachable ();
6253 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6254 stack pointer by ADJUSTMENT. */
6256 static void
6257 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6259 rtx_insn *insn;
6260 machine_mode mode = aarch64_reg_save_mode (regno1);
6262 if (regno2 == INVALID_REGNUM)
6263 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6265 rtx reg1 = gen_rtx_REG (mode, regno1);
6266 rtx reg2 = gen_rtx_REG (mode, regno2);
6268 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6269 reg2, adjustment));
6270 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6271 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6272 RTX_FRAME_RELATED_P (insn) = 1;
6275 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
6276 adjusting it by ADJUSTMENT afterwards. */
6278 static rtx
6279 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6280 HOST_WIDE_INT adjustment)
6282 switch (mode)
6284 case E_DImode:
6285 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6286 GEN_INT (UNITS_PER_WORD));
6287 case E_DFmode:
6288 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6289 GEN_INT (UNITS_PER_WORD));
6290 case E_TFmode:
6291 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6292 GEN_INT (UNITS_PER_VREG));
6293 default:
6294 gcc_unreachable ();
6298 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6299 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6300 into CFI_OPS. */
6302 static void
6303 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6304 rtx *cfi_ops)
6306 machine_mode mode = aarch64_reg_save_mode (regno1);
6307 rtx reg1 = gen_rtx_REG (mode, regno1);
6309 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6311 if (regno2 == INVALID_REGNUM)
6313 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6314 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6315 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6317 else
6319 rtx reg2 = gen_rtx_REG (mode, regno2);
6320 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6321 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6322 reg2, adjustment));
6326 /* Generate and return a store pair instruction of mode MODE to store
6327 register REG1 to MEM1 and register REG2 to MEM2. */
6329 static rtx
6330 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6331 rtx reg2)
6333 switch (mode)
6335 case E_DImode:
6336 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6338 case E_DFmode:
6339 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6341 case E_TFmode:
6342 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6344 default:
6345 gcc_unreachable ();
6349 /* Generate and regurn a load pair isntruction of mode MODE to load register
6350 REG1 from MEM1 and register REG2 from MEM2. */
6352 static rtx
6353 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6354 rtx mem2)
6356 switch (mode)
6358 case E_DImode:
6359 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6361 case E_DFmode:
6362 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6364 case E_TFmode:
6365 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6367 default:
6368 gcc_unreachable ();
6372 /* Return TRUE if return address signing should be enabled for the current
6373 function, otherwise return FALSE. */
6375 bool
6376 aarch64_return_address_signing_enabled (void)
6378 /* This function should only be called after frame laid out. */
6379 gcc_assert (cfun->machine->frame.laid_out);
6381 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6382 if its LR is pushed onto stack. */
6383 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6384 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6385 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6388 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6389 bool
6390 aarch64_bti_enabled (void)
6392 return (aarch64_enable_bti == 1);
6395 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6396 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6397 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6399 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6400 or LD1D address
6402 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6403 if the variable isn't already nonnull
6405 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6406 Handle this case using a temporary base register that is suitable for
6407 all offsets in that range. Use ANCHOR_REG as this base register if it
6408 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6410 static inline void
6411 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6412 rtx &anchor_reg, poly_int64 &offset,
6413 rtx &ptrue)
6415 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6417 /* This is the maximum valid offset of the anchor from the base.
6418 Lower values would be valid too. */
6419 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6420 if (!anchor_reg)
6422 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6423 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6424 gen_int_mode (anchor_offset, Pmode)));
6426 base_rtx = anchor_reg;
6427 offset -= anchor_offset;
6429 if (!ptrue)
6431 int pred_reg = cfun->machine->frame.spare_pred_reg;
6432 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6433 CONSTM1_RTX (VNx16BImode));
6434 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6438 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6439 is saved at BASE + OFFSET. */
6441 static void
6442 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6443 rtx base, poly_int64 offset)
6445 rtx mem = gen_frame_mem (GET_MODE (reg),
6446 plus_constant (Pmode, base, offset));
6447 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6450 /* Emit code to save the callee-saved registers from register number START
6451 to LIMIT to the stack at the location starting at offset START_OFFSET,
6452 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6453 is true if the hard frame pointer has been set up. */
6455 static void
6456 aarch64_save_callee_saves (poly_int64 start_offset,
6457 unsigned start, unsigned limit, bool skip_wb,
6458 bool hard_fp_valid_p)
6460 rtx_insn *insn;
6461 unsigned regno;
6462 unsigned regno2;
6463 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6465 for (regno = aarch64_next_callee_save (start, limit);
6466 regno <= limit;
6467 regno = aarch64_next_callee_save (regno + 1, limit))
6469 rtx reg, mem;
6470 poly_int64 offset;
6471 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6473 if (skip_wb
6474 && (regno == cfun->machine->frame.wb_candidate1
6475 || regno == cfun->machine->frame.wb_candidate2))
6476 continue;
6478 if (cfun->machine->reg_is_wrapped_separately[regno])
6479 continue;
6481 machine_mode mode = aarch64_reg_save_mode (regno);
6482 reg = gen_rtx_REG (mode, regno);
6483 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6484 rtx base_rtx = stack_pointer_rtx;
6485 poly_int64 sp_offset = offset;
6487 HOST_WIDE_INT const_offset;
6488 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6489 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6490 offset, ptrue);
6491 else if (GP_REGNUM_P (regno)
6492 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6494 gcc_assert (known_eq (start_offset, 0));
6495 poly_int64 fp_offset
6496 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6497 if (hard_fp_valid_p)
6498 base_rtx = hard_frame_pointer_rtx;
6499 else
6501 if (!anchor_reg)
6503 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6504 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6505 gen_int_mode (fp_offset, Pmode)));
6507 base_rtx = anchor_reg;
6509 offset -= fp_offset;
6511 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6512 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6514 if (!aarch64_sve_mode_p (mode)
6515 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6516 && !cfun->machine->reg_is_wrapped_separately[regno2]
6517 && known_eq (GET_MODE_SIZE (mode),
6518 cfun->machine->frame.reg_offset[regno2]
6519 - cfun->machine->frame.reg_offset[regno]))
6521 rtx reg2 = gen_rtx_REG (mode, regno2);
6522 rtx mem2;
6524 offset += GET_MODE_SIZE (mode);
6525 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6526 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6527 reg2));
6529 /* The first part of a frame-related parallel insn is
6530 always assumed to be relevant to the frame
6531 calculations; subsequent parts, are only
6532 frame-related if explicitly marked. */
6533 if (aarch64_emit_cfi_for_reg_p (regno2))
6535 if (need_cfa_note_p)
6536 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6537 sp_offset + GET_MODE_SIZE (mode));
6538 else
6539 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6542 regno = regno2;
6544 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6546 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6547 need_cfa_note_p = true;
6549 else if (aarch64_sve_mode_p (mode))
6550 insn = emit_insn (gen_rtx_SET (mem, reg));
6551 else
6552 insn = emit_move_insn (mem, reg);
6554 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6555 if (frame_related_p && need_cfa_note_p)
6556 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6560 /* Emit code to restore the callee registers from register number START
6561 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6562 skipping any write-back candidates if SKIP_WB is true. Write the
6563 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6565 static void
6566 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6567 unsigned limit, bool skip_wb, rtx *cfi_ops)
6569 unsigned regno;
6570 unsigned regno2;
6571 poly_int64 offset;
6572 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6574 for (regno = aarch64_next_callee_save (start, limit);
6575 regno <= limit;
6576 regno = aarch64_next_callee_save (regno + 1, limit))
6578 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6579 if (cfun->machine->reg_is_wrapped_separately[regno])
6580 continue;
6582 rtx reg, mem;
6584 if (skip_wb
6585 && (regno == cfun->machine->frame.wb_candidate1
6586 || regno == cfun->machine->frame.wb_candidate2))
6587 continue;
6589 machine_mode mode = aarch64_reg_save_mode (regno);
6590 reg = gen_rtx_REG (mode, regno);
6591 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6592 rtx base_rtx = stack_pointer_rtx;
6593 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6594 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6595 offset, ptrue);
6596 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6598 if (!aarch64_sve_mode_p (mode)
6599 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6600 && !cfun->machine->reg_is_wrapped_separately[regno2]
6601 && known_eq (GET_MODE_SIZE (mode),
6602 cfun->machine->frame.reg_offset[regno2]
6603 - cfun->machine->frame.reg_offset[regno]))
6605 rtx reg2 = gen_rtx_REG (mode, regno2);
6606 rtx mem2;
6608 offset += GET_MODE_SIZE (mode);
6609 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6610 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6612 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6613 regno = regno2;
6615 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6616 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6617 else if (aarch64_sve_mode_p (mode))
6618 emit_insn (gen_rtx_SET (reg, mem));
6619 else
6620 emit_move_insn (reg, mem);
6621 if (frame_related_p)
6622 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6626 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6627 of MODE. */
6629 static inline bool
6630 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6632 HOST_WIDE_INT multiple;
6633 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6634 && IN_RANGE (multiple, -8, 7));
6637 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
6638 of MODE. */
6640 static inline bool
6641 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6643 HOST_WIDE_INT multiple;
6644 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6645 && IN_RANGE (multiple, 0, 63));
6648 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6649 of MODE. */
6651 bool
6652 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6654 HOST_WIDE_INT multiple;
6655 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6656 && IN_RANGE (multiple, -64, 63));
6659 /* Return true if OFFSET is a signed 9-bit value. */
6661 bool
6662 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6663 poly_int64 offset)
6665 HOST_WIDE_INT const_offset;
6666 return (offset.is_constant (&const_offset)
6667 && IN_RANGE (const_offset, -256, 255));
6670 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6671 of MODE. */
6673 static inline bool
6674 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6676 HOST_WIDE_INT multiple;
6677 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6678 && IN_RANGE (multiple, -256, 255));
6681 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6682 of MODE. */
6684 static inline bool
6685 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6687 HOST_WIDE_INT multiple;
6688 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6689 && IN_RANGE (multiple, 0, 4095));
6692 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6694 static sbitmap
6695 aarch64_get_separate_components (void)
6697 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6698 bitmap_clear (components);
6700 /* The registers we need saved to the frame. */
6701 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6702 if (aarch64_register_saved_on_entry (regno))
6704 /* Punt on saves and restores that use ST1D and LD1D. We could
6705 try to be smarter, but it would involve making sure that the
6706 spare predicate register itself is safe to use at the save
6707 and restore points. Also, when a frame pointer is being used,
6708 the slots are often out of reach of ST1D and LD1D anyway. */
6709 machine_mode mode = aarch64_reg_save_mode (regno);
6710 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6711 continue;
6713 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6715 /* If the register is saved in the first SVE save slot, we use
6716 it as a stack probe for -fstack-clash-protection. */
6717 if (flag_stack_clash_protection
6718 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6719 && known_eq (offset, 0))
6720 continue;
6722 /* Get the offset relative to the register we'll use. */
6723 if (frame_pointer_needed)
6724 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6725 else
6726 offset += crtl->outgoing_args_size;
6728 /* Check that we can access the stack slot of the register with one
6729 direct load with no adjustments needed. */
6730 if (aarch64_sve_mode_p (mode)
6731 ? offset_9bit_signed_scaled_p (mode, offset)
6732 : offset_12bit_unsigned_scaled_p (mode, offset))
6733 bitmap_set_bit (components, regno);
6736 /* Don't mess with the hard frame pointer. */
6737 if (frame_pointer_needed)
6738 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6740 /* If the spare predicate register used by big-endian SVE code
6741 is call-preserved, it must be saved in the main prologue
6742 before any saves that use it. */
6743 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6744 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6746 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6747 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6748 /* If registers have been chosen to be stored/restored with
6749 writeback don't interfere with them to avoid having to output explicit
6750 stack adjustment instructions. */
6751 if (reg2 != INVALID_REGNUM)
6752 bitmap_clear_bit (components, reg2);
6753 if (reg1 != INVALID_REGNUM)
6754 bitmap_clear_bit (components, reg1);
6756 bitmap_clear_bit (components, LR_REGNUM);
6757 bitmap_clear_bit (components, SP_REGNUM);
6759 return components;
6762 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6764 static sbitmap
6765 aarch64_components_for_bb (basic_block bb)
6767 bitmap in = DF_LIVE_IN (bb);
6768 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6769 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6771 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6772 bitmap_clear (components);
6774 /* Clobbered registers don't generate values in any meaningful sense,
6775 since nothing after the clobber can rely on their value. And we can't
6776 say that partially-clobbered registers are unconditionally killed,
6777 because whether they're killed or not depends on the mode of the
6778 value they're holding. Thus partially call-clobbered registers
6779 appear in neither the kill set nor the gen set.
6781 Check manually for any calls that clobber more of a register than the
6782 current function can. */
6783 function_abi_aggregator callee_abis;
6784 rtx_insn *insn;
6785 FOR_BB_INSNS (bb, insn)
6786 if (CALL_P (insn))
6787 callee_abis.note_callee_abi (insn_callee_abi (insn));
6788 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6790 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6791 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6792 if (!fixed_regs[regno]
6793 && !crtl->abi->clobbers_full_reg_p (regno)
6794 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6795 || bitmap_bit_p (in, regno)
6796 || bitmap_bit_p (gen, regno)
6797 || bitmap_bit_p (kill, regno)))
6799 bitmap_set_bit (components, regno);
6801 /* If there is a callee-save at an adjacent offset, add it too
6802 to increase the use of LDP/STP. */
6803 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6804 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6806 if (regno2 <= LAST_SAVED_REGNUM)
6808 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6809 if (regno < regno2
6810 ? known_eq (offset + 8, offset2)
6811 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6812 bitmap_set_bit (components, regno2);
6816 return components;
6819 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6820 Nothing to do for aarch64. */
6822 static void
6823 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6827 /* Return the next set bit in BMP from START onwards. Return the total number
6828 of bits in BMP if no set bit is found at or after START. */
6830 static unsigned int
6831 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6833 unsigned int nbits = SBITMAP_SIZE (bmp);
6834 if (start == nbits)
6835 return start;
6837 gcc_assert (start < nbits);
6838 for (unsigned int i = start; i < nbits; i++)
6839 if (bitmap_bit_p (bmp, i))
6840 return i;
6842 return nbits;
6845 /* Do the work for aarch64_emit_prologue_components and
6846 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6847 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6848 for these components or the epilogue sequence. That is, it determines
6849 whether we should emit stores or loads and what kind of CFA notes to attach
6850 to the insns. Otherwise the logic for the two sequences is very
6851 similar. */
6853 static void
6854 aarch64_process_components (sbitmap components, bool prologue_p)
6856 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6857 ? HARD_FRAME_POINTER_REGNUM
6858 : STACK_POINTER_REGNUM);
6860 unsigned last_regno = SBITMAP_SIZE (components);
6861 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6862 rtx_insn *insn = NULL;
6864 while (regno != last_regno)
6866 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6867 machine_mode mode = aarch64_reg_save_mode (regno);
6869 rtx reg = gen_rtx_REG (mode, regno);
6870 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6871 if (frame_pointer_needed)
6872 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6873 else
6874 offset += crtl->outgoing_args_size;
6876 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6877 rtx mem = gen_frame_mem (mode, addr);
6879 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6880 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6881 /* No more registers to handle after REGNO.
6882 Emit a single save/restore and exit. */
6883 if (regno2 == last_regno)
6885 insn = emit_insn (set);
6886 if (frame_related_p)
6888 RTX_FRAME_RELATED_P (insn) = 1;
6889 if (prologue_p)
6890 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6891 else
6892 add_reg_note (insn, REG_CFA_RESTORE, reg);
6894 break;
6897 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6898 /* The next register is not of the same class or its offset is not
6899 mergeable with the current one into a pair. */
6900 if (aarch64_sve_mode_p (mode)
6901 || !satisfies_constraint_Ump (mem)
6902 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6903 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6904 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6905 GET_MODE_SIZE (mode)))
6907 insn = emit_insn (set);
6908 if (frame_related_p)
6910 RTX_FRAME_RELATED_P (insn) = 1;
6911 if (prologue_p)
6912 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6913 else
6914 add_reg_note (insn, REG_CFA_RESTORE, reg);
6917 regno = regno2;
6918 continue;
6921 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6923 /* REGNO2 can be saved/restored in a pair with REGNO. */
6924 rtx reg2 = gen_rtx_REG (mode, regno2);
6925 if (frame_pointer_needed)
6926 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6927 else
6928 offset2 += crtl->outgoing_args_size;
6929 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6930 rtx mem2 = gen_frame_mem (mode, addr2);
6931 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6932 : gen_rtx_SET (reg2, mem2);
6934 if (prologue_p)
6935 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6936 else
6937 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6939 if (frame_related_p || frame_related2_p)
6941 RTX_FRAME_RELATED_P (insn) = 1;
6942 if (prologue_p)
6944 if (frame_related_p)
6945 add_reg_note (insn, REG_CFA_OFFSET, set);
6946 if (frame_related2_p)
6947 add_reg_note (insn, REG_CFA_OFFSET, set2);
6949 else
6951 if (frame_related_p)
6952 add_reg_note (insn, REG_CFA_RESTORE, reg);
6953 if (frame_related2_p)
6954 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6958 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6962 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6964 static void
6965 aarch64_emit_prologue_components (sbitmap components)
6967 aarch64_process_components (components, true);
6970 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6972 static void
6973 aarch64_emit_epilogue_components (sbitmap components)
6975 aarch64_process_components (components, false);
6978 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6980 static void
6981 aarch64_set_handled_components (sbitmap components)
6983 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6984 if (bitmap_bit_p (components, regno))
6985 cfun->machine->reg_is_wrapped_separately[regno] = true;
6988 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
6989 determining the probe offset for alloca. */
6991 static HOST_WIDE_INT
6992 aarch64_stack_clash_protection_alloca_probe_range (void)
6994 return STACK_CLASH_CALLER_GUARD;
6998 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6999 registers. If POLY_SIZE is not large enough to require a probe this function
7000 will only adjust the stack. When allocating the stack space
7001 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7002 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7003 arguments. If we are then we ensure that any allocation larger than the ABI
7004 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7005 maintained.
7007 We emit barriers after each stack adjustment to prevent optimizations from
7008 breaking the invariant that we never drop the stack more than a page. This
7009 invariant is needed to make it easier to correctly handle asynchronous
7010 events, e.g. if we were to allow the stack to be dropped by more than a page
7011 and then have multiple probes up and we take a signal somewhere in between
7012 then the signal handler doesn't know the state of the stack and can make no
7013 assumptions about which pages have been probed. */
7015 static void
7016 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7017 poly_int64 poly_size,
7018 bool frame_related_p,
7019 bool final_adjustment_p)
7021 HOST_WIDE_INT guard_size
7022 = 1 << param_stack_clash_protection_guard_size;
7023 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7024 HOST_WIDE_INT min_probe_threshold
7025 = (final_adjustment_p
7026 ? guard_used_by_caller
7027 : guard_size - guard_used_by_caller);
7028 /* When doing the final adjustment for the outgoing arguments, take into
7029 account any unprobed space there is above the current SP. There are
7030 two cases:
7032 - When saving SVE registers below the hard frame pointer, we force
7033 the lowest save to take place in the prologue before doing the final
7034 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7035 This acts as a probe at SP, so there is no unprobed space.
7037 - When there are no SVE register saves, we use the store of the link
7038 register as a probe. We can't assume that LR was saved at position 0
7039 though, so treat any space below it as unprobed. */
7040 if (final_adjustment_p
7041 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7043 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7044 if (known_ge (lr_offset, 0))
7045 min_probe_threshold -= lr_offset.to_constant ();
7046 else
7047 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7050 poly_int64 frame_size = cfun->machine->frame.frame_size;
7052 /* We should always have a positive probe threshold. */
7053 gcc_assert (min_probe_threshold > 0);
7055 if (flag_stack_clash_protection && !final_adjustment_p)
7057 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7058 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7059 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7061 if (known_eq (frame_size, 0))
7063 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7065 else if (known_lt (initial_adjust + sve_callee_adjust,
7066 guard_size - guard_used_by_caller)
7067 && known_lt (final_adjust, guard_used_by_caller))
7069 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7073 /* If SIZE is not large enough to require probing, just adjust the stack and
7074 exit. */
7075 if (known_lt (poly_size, min_probe_threshold)
7076 || !flag_stack_clash_protection)
7078 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7079 return;
7082 HOST_WIDE_INT size;
7083 /* Handle the SVE non-constant case first. */
7084 if (!poly_size.is_constant (&size))
7086 if (dump_file)
7088 fprintf (dump_file, "Stack clash SVE prologue: ");
7089 print_dec (poly_size, dump_file);
7090 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7093 /* First calculate the amount of bytes we're actually spilling. */
7094 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7095 poly_size, temp1, temp2, false, true);
7097 rtx_insn *insn = get_last_insn ();
7099 if (frame_related_p)
7101 /* This is done to provide unwinding information for the stack
7102 adjustments we're about to do, however to prevent the optimizers
7103 from removing the R11 move and leaving the CFA note (which would be
7104 very wrong) we tie the old and new stack pointer together.
7105 The tie will expand to nothing but the optimizers will not touch
7106 the instruction. */
7107 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7108 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7109 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7111 /* We want the CFA independent of the stack pointer for the
7112 duration of the loop. */
7113 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7114 RTX_FRAME_RELATED_P (insn) = 1;
7117 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7118 rtx guard_const = gen_int_mode (guard_size, Pmode);
7120 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7121 stack_pointer_rtx, temp1,
7122 probe_const, guard_const));
7124 /* Now reset the CFA register if needed. */
7125 if (frame_related_p)
7127 add_reg_note (insn, REG_CFA_DEF_CFA,
7128 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7129 gen_int_mode (poly_size, Pmode)));
7130 RTX_FRAME_RELATED_P (insn) = 1;
7133 return;
7136 if (dump_file)
7137 fprintf (dump_file,
7138 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7139 " bytes, probing will be required.\n", size);
7141 /* Round size to the nearest multiple of guard_size, and calculate the
7142 residual as the difference between the original size and the rounded
7143 size. */
7144 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7145 HOST_WIDE_INT residual = size - rounded_size;
7147 /* We can handle a small number of allocations/probes inline. Otherwise
7148 punt to a loop. */
7149 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7151 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7153 aarch64_sub_sp (NULL, temp2, guard_size, true);
7154 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7155 guard_used_by_caller));
7156 emit_insn (gen_blockage ());
7158 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7160 else
7162 /* Compute the ending address. */
7163 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7164 temp1, NULL, false, true);
7165 rtx_insn *insn = get_last_insn ();
7167 /* For the initial allocation, we don't have a frame pointer
7168 set up, so we always need CFI notes. If we're doing the
7169 final allocation, then we may have a frame pointer, in which
7170 case it is the CFA, otherwise we need CFI notes.
7172 We can determine which allocation we are doing by looking at
7173 the value of FRAME_RELATED_P since the final allocations are not
7174 frame related. */
7175 if (frame_related_p)
7177 /* We want the CFA independent of the stack pointer for the
7178 duration of the loop. */
7179 add_reg_note (insn, REG_CFA_DEF_CFA,
7180 plus_constant (Pmode, temp1, rounded_size));
7181 RTX_FRAME_RELATED_P (insn) = 1;
7184 /* This allocates and probes the stack. Note that this re-uses some of
7185 the existing Ada stack protection code. However we are guaranteed not
7186 to enter the non loop or residual branches of that code.
7188 The non-loop part won't be entered because if our allocation amount
7189 doesn't require a loop, the case above would handle it.
7191 The residual amount won't be entered because TEMP1 is a mutliple of
7192 the allocation size. The residual will always be 0. As such, the only
7193 part we are actually using from that code is the loop setup. The
7194 actual probing is done in aarch64_output_probe_stack_range. */
7195 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7196 stack_pointer_rtx, temp1));
7198 /* Now reset the CFA register if needed. */
7199 if (frame_related_p)
7201 add_reg_note (insn, REG_CFA_DEF_CFA,
7202 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7203 RTX_FRAME_RELATED_P (insn) = 1;
7206 emit_insn (gen_blockage ());
7207 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7210 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7211 be probed. This maintains the requirement that each page is probed at
7212 least once. For initial probing we probe only if the allocation is
7213 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7214 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7215 GUARD_SIZE. This works that for any allocation that is large enough to
7216 trigger a probe here, we'll have at least one, and if they're not large
7217 enough for this code to emit anything for them, The page would have been
7218 probed by the saving of FP/LR either by this function or any callees. If
7219 we don't have any callees then we won't have more stack adjustments and so
7220 are still safe. */
7221 if (residual)
7223 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7224 /* If we're doing final adjustments, and we've done any full page
7225 allocations then any residual needs to be probed. */
7226 if (final_adjustment_p && rounded_size != 0)
7227 min_probe_threshold = 0;
7228 /* If doing a small final adjustment, we always probe at offset 0.
7229 This is done to avoid issues when LR is not at position 0 or when
7230 the final adjustment is smaller than the probing offset. */
7231 else if (final_adjustment_p && rounded_size == 0)
7232 residual_probe_offset = 0;
7234 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7235 if (residual >= min_probe_threshold)
7237 if (dump_file)
7238 fprintf (dump_file,
7239 "Stack clash AArch64 prologue residuals: "
7240 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7241 "\n", residual);
7243 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7244 residual_probe_offset));
7245 emit_insn (gen_blockage ());
7250 /* Return 1 if the register is used by the epilogue. We need to say the
7251 return register is used, but only after epilogue generation is complete.
7252 Note that in the case of sibcalls, the values "used by the epilogue" are
7253 considered live at the start of the called function.
7255 For SIMD functions we need to return 1 for FP registers that are saved and
7256 restored by a function but are not zero in call_used_regs. If we do not do
7257 this optimizations may remove the restore of the register. */
7260 aarch64_epilogue_uses (int regno)
7262 if (epilogue_completed)
7264 if (regno == LR_REGNUM)
7265 return 1;
7267 return 0;
7270 /* AArch64 stack frames generated by this compiler look like:
7272 +-------------------------------+
7274 | incoming stack arguments |
7276 +-------------------------------+
7277 | | <-- incoming stack pointer (aligned)
7278 | callee-allocated save area |
7279 | for register varargs |
7281 +-------------------------------+
7282 | local variables | <-- frame_pointer_rtx
7284 +-------------------------------+
7285 | padding | \
7286 +-------------------------------+ |
7287 | callee-saved registers | | frame.saved_regs_size
7288 +-------------------------------+ |
7289 | LR' | |
7290 +-------------------------------+ |
7291 | FP' | |
7292 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7293 | SVE vector registers | | \
7294 +-------------------------------+ | | below_hard_fp_saved_regs_size
7295 | SVE predicate registers | / /
7296 +-------------------------------+
7297 | dynamic allocation |
7298 +-------------------------------+
7299 | padding |
7300 +-------------------------------+
7301 | outgoing stack arguments | <-- arg_pointer
7303 +-------------------------------+
7304 | | <-- stack_pointer_rtx (aligned)
7306 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7307 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7308 unchanged.
7310 By default for stack-clash we assume the guard is at least 64KB, but this
7311 value is configurable to either 4KB or 64KB. We also force the guard size to
7312 be the same as the probing interval and both values are kept in sync.
7314 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7315 on the guard size) of stack space without probing.
7317 When probing is needed, we emit a probe at the start of the prologue
7318 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7320 We have to track how much space has been allocated and the only stores
7321 to the stack we track as implicit probes are the FP/LR stores.
7323 For outgoing arguments we probe if the size is larger than 1KB, such that
7324 the ABI specified buffer is maintained for the next callee.
7326 The following registers are reserved during frame layout and should not be
7327 used for any other purpose:
7329 - r11: Used by stack clash protection when SVE is enabled, and also
7330 as an anchor register when saving and restoring registers
7331 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7332 - r14 and r15: Used for speculation tracking.
7333 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7334 - r30(LR), r29(FP): Used by standard frame layout.
7336 These registers must be avoided in frame layout related code unless the
7337 explicit intention is to interact with one of the features listed above. */
7339 /* Generate the prologue instructions for entry into a function.
7340 Establish the stack frame by decreasing the stack pointer with a
7341 properly calculated size and, if necessary, create a frame record
7342 filled with the values of LR and previous frame pointer. The
7343 current FP is also set up if it is in use. */
7345 void
7346 aarch64_expand_prologue (void)
7348 poly_int64 frame_size = cfun->machine->frame.frame_size;
7349 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7350 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7351 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7352 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7353 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7354 poly_int64 below_hard_fp_saved_regs_size
7355 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7356 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7357 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7358 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7359 rtx_insn *insn;
7361 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7363 /* Fold the SVE allocation into the initial allocation.
7364 We don't do this in aarch64_layout_arg to avoid pessimizing
7365 the epilogue code. */
7366 initial_adjust += sve_callee_adjust;
7367 sve_callee_adjust = 0;
7370 /* Sign return address for functions. */
7371 if (aarch64_return_address_signing_enabled ())
7373 switch (aarch64_ra_sign_key)
7375 case AARCH64_KEY_A:
7376 insn = emit_insn (gen_paciasp ());
7377 break;
7378 case AARCH64_KEY_B:
7379 insn = emit_insn (gen_pacibsp ());
7380 break;
7381 default:
7382 gcc_unreachable ();
7384 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7385 RTX_FRAME_RELATED_P (insn) = 1;
7388 if (flag_stack_usage_info)
7389 current_function_static_stack_size = constant_lower_bound (frame_size);
7391 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7393 if (crtl->is_leaf && !cfun->calls_alloca)
7395 if (maybe_gt (frame_size, PROBE_INTERVAL)
7396 && maybe_gt (frame_size, get_stack_check_protect ()))
7397 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7398 (frame_size
7399 - get_stack_check_protect ()));
7401 else if (maybe_gt (frame_size, 0))
7402 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7405 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7406 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7408 /* In theory we should never have both an initial adjustment
7409 and a callee save adjustment. Verify that is the case since the
7410 code below does not handle it for -fstack-clash-protection. */
7411 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7413 /* Will only probe if the initial adjustment is larger than the guard
7414 less the amount of the guard reserved for use by the caller's
7415 outgoing args. */
7416 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7417 true, false);
7419 if (callee_adjust != 0)
7420 aarch64_push_regs (reg1, reg2, callee_adjust);
7422 /* The offset of the frame chain record (if any) from the current SP. */
7423 poly_int64 chain_offset = (initial_adjust + callee_adjust
7424 - cfun->machine->frame.hard_fp_offset);
7425 gcc_assert (known_ge (chain_offset, 0));
7427 /* The offset of the bottom of the save area from the current SP. */
7428 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7430 if (emit_frame_chain)
7432 if (callee_adjust == 0)
7434 reg1 = R29_REGNUM;
7435 reg2 = R30_REGNUM;
7436 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7437 false, false);
7439 else
7440 gcc_assert (known_eq (chain_offset, 0));
7441 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7442 stack_pointer_rtx, chain_offset,
7443 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7444 if (frame_pointer_needed && !frame_size.is_constant ())
7446 /* Variable-sized frames need to describe the save slot
7447 address using DW_CFA_expression rather than DW_CFA_offset.
7448 This means that, without taking further action, the
7449 locations of the registers that we've already saved would
7450 remain based on the stack pointer even after we redefine
7451 the CFA based on the frame pointer. We therefore need new
7452 DW_CFA_expressions to re-express the save slots with addresses
7453 based on the frame pointer. */
7454 rtx_insn *insn = get_last_insn ();
7455 gcc_assert (RTX_FRAME_RELATED_P (insn));
7457 /* Add an explicit CFA definition if this was previously
7458 implicit. */
7459 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7461 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7462 callee_offset);
7463 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7464 gen_rtx_SET (hard_frame_pointer_rtx, src));
7467 /* Change the save slot expressions for the registers that
7468 we've already saved. */
7469 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7470 hard_frame_pointer_rtx, UNITS_PER_WORD);
7471 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7472 hard_frame_pointer_rtx, 0);
7474 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7477 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7478 callee_adjust != 0 || emit_frame_chain,
7479 emit_frame_chain);
7480 if (maybe_ne (sve_callee_adjust, 0))
7482 gcc_assert (!flag_stack_clash_protection
7483 || known_eq (initial_adjust, 0));
7484 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7485 sve_callee_adjust,
7486 !frame_pointer_needed, false);
7487 saved_regs_offset += sve_callee_adjust;
7489 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7490 false, emit_frame_chain);
7491 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7492 callee_adjust != 0 || emit_frame_chain,
7493 emit_frame_chain);
7495 /* We may need to probe the final adjustment if it is larger than the guard
7496 that is assumed by the called. */
7497 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7498 !frame_pointer_needed, true);
7501 /* Return TRUE if we can use a simple_return insn.
7503 This function checks whether the callee saved stack is empty, which
7504 means no restore actions are need. The pro_and_epilogue will use
7505 this to check whether shrink-wrapping opt is feasible. */
7507 bool
7508 aarch64_use_return_insn_p (void)
7510 if (!reload_completed)
7511 return false;
7513 if (crtl->profile)
7514 return false;
7516 return known_eq (cfun->machine->frame.frame_size, 0);
7519 /* Generate the epilogue instructions for returning from a function.
7520 This is almost exactly the reverse of the prolog sequence, except
7521 that we need to insert barriers to avoid scheduling loads that read
7522 from a deallocated stack, and we optimize the unwind records by
7523 emitting them all together if possible. */
7524 void
7525 aarch64_expand_epilogue (bool for_sibcall)
7527 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7528 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7529 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7530 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7531 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7532 poly_int64 below_hard_fp_saved_regs_size
7533 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7534 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7535 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7536 rtx cfi_ops = NULL;
7537 rtx_insn *insn;
7538 /* A stack clash protection prologue may not have left EP0_REGNUM or
7539 EP1_REGNUM in a usable state. The same is true for allocations
7540 with an SVE component, since we then need both temporary registers
7541 for each allocation. For stack clash we are in a usable state if
7542 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7543 HOST_WIDE_INT guard_size
7544 = 1 << param_stack_clash_protection_guard_size;
7545 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7547 /* We can re-use the registers when:
7549 (a) the deallocation amount is the same as the corresponding
7550 allocation amount (which is false if we combine the initial
7551 and SVE callee save allocations in the prologue); and
7553 (b) the allocation amount doesn't need a probe (which is false
7554 if the amount is guard_size - guard_used_by_caller or greater).
7556 In such situations the register should remain live with the correct
7557 value. */
7558 bool can_inherit_p = (initial_adjust.is_constant ()
7559 && final_adjust.is_constant ()
7560 && (!flag_stack_clash_protection
7561 || (known_lt (initial_adjust,
7562 guard_size - guard_used_by_caller)
7563 && known_eq (sve_callee_adjust, 0))));
7565 /* We need to add memory barrier to prevent read from deallocated stack. */
7566 bool need_barrier_p
7567 = maybe_ne (get_frame_size ()
7568 + cfun->machine->frame.saved_varargs_size, 0);
7570 /* Emit a barrier to prevent loads from a deallocated stack. */
7571 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7572 || cfun->calls_alloca
7573 || crtl->calls_eh_return)
7575 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7576 need_barrier_p = false;
7579 /* Restore the stack pointer from the frame pointer if it may not
7580 be the same as the stack pointer. */
7581 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7582 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7583 if (frame_pointer_needed
7584 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7585 /* If writeback is used when restoring callee-saves, the CFA
7586 is restored on the instruction doing the writeback. */
7587 aarch64_add_offset (Pmode, stack_pointer_rtx,
7588 hard_frame_pointer_rtx,
7589 -callee_offset - below_hard_fp_saved_regs_size,
7590 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7591 else
7592 /* The case where we need to re-use the register here is very rare, so
7593 avoid the complicated condition and just always emit a move if the
7594 immediate doesn't fit. */
7595 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7597 /* Restore the vector registers before the predicate registers,
7598 so that we can use P4 as a temporary for big-endian SVE frames. */
7599 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7600 callee_adjust != 0, &cfi_ops);
7601 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7602 false, &cfi_ops);
7603 if (maybe_ne (sve_callee_adjust, 0))
7604 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7605 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7606 R0_REGNUM, R30_REGNUM,
7607 callee_adjust != 0, &cfi_ops);
7609 if (need_barrier_p)
7610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7612 if (callee_adjust != 0)
7613 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7615 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7617 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7618 insn = get_last_insn ();
7619 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7620 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7621 RTX_FRAME_RELATED_P (insn) = 1;
7622 cfi_ops = NULL;
7625 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
7626 add restriction on emit_move optimization to leaf functions. */
7627 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7628 (!can_inherit_p || !crtl->is_leaf
7629 || df_regs_ever_live_p (EP0_REGNUM)));
7631 if (cfi_ops)
7633 /* Emit delayed restores and reset the CFA to be SP. */
7634 insn = get_last_insn ();
7635 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7636 REG_NOTES (insn) = cfi_ops;
7637 RTX_FRAME_RELATED_P (insn) = 1;
7640 /* We prefer to emit the combined return/authenticate instruction RETAA,
7641 however there are three cases in which we must instead emit an explicit
7642 authentication instruction.
7644 1) Sibcalls don't return in a normal way, so if we're about to call one
7645 we must authenticate.
7647 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7648 generating code for !TARGET_ARMV8_3 we can't use it and must
7649 explicitly authenticate.
7651 3) On an eh_return path we make extra stack adjustments to update the
7652 canonical frame address to be the exception handler's CFA. We want
7653 to authenticate using the CFA of the function which calls eh_return.
7655 if (aarch64_return_address_signing_enabled ()
7656 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7658 switch (aarch64_ra_sign_key)
7660 case AARCH64_KEY_A:
7661 insn = emit_insn (gen_autiasp ());
7662 break;
7663 case AARCH64_KEY_B:
7664 insn = emit_insn (gen_autibsp ());
7665 break;
7666 default:
7667 gcc_unreachable ();
7669 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7670 RTX_FRAME_RELATED_P (insn) = 1;
7673 /* Stack adjustment for exception handler. */
7674 if (crtl->calls_eh_return && !for_sibcall)
7676 /* We need to unwind the stack by the offset computed by
7677 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7678 to be SP; letting the CFA move during this adjustment
7679 is just as correct as retaining the CFA from the body
7680 of the function. Therefore, do nothing special. */
7681 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7684 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7685 if (!for_sibcall)
7686 emit_jump_insn (ret_rtx);
7689 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7690 normally or return to a previous frame after unwinding.
7692 An EH return uses a single shared return sequence. The epilogue is
7693 exactly like a normal epilogue except that it has an extra input
7694 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7695 that must be applied after the frame has been destroyed. An extra label
7696 is inserted before the epilogue which initializes this register to zero,
7697 and this is the entry point for a normal return.
7699 An actual EH return updates the return address, initializes the stack
7700 adjustment and jumps directly into the epilogue (bypassing the zeroing
7701 of the adjustment). Since the return address is typically saved on the
7702 stack when a function makes a call, the saved LR must be updated outside
7703 the epilogue.
7705 This poses problems as the store is generated well before the epilogue,
7706 so the offset of LR is not known yet. Also optimizations will remove the
7707 store as it appears dead, even after the epilogue is generated (as the
7708 base or offset for loading LR is different in many cases).
7710 To avoid these problems this implementation forces the frame pointer
7711 in eh_return functions so that the location of LR is fixed and known early.
7712 It also marks the store volatile, so no optimization is permitted to
7713 remove the store. */
7715 aarch64_eh_return_handler_rtx (void)
7717 rtx tmp = gen_frame_mem (Pmode,
7718 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7720 /* Mark the store volatile, so no optimization is permitted to remove it. */
7721 MEM_VOLATILE_P (tmp) = true;
7722 return tmp;
7725 /* Output code to add DELTA to the first argument, and then jump
7726 to FUNCTION. Used for C++ multiple inheritance. */
7727 static void
7728 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7729 HOST_WIDE_INT delta,
7730 HOST_WIDE_INT vcall_offset,
7731 tree function)
7733 /* The this pointer is always in x0. Note that this differs from
7734 Arm where the this pointer maybe bumped to r1 if r0 is required
7735 to return a pointer to an aggregate. On AArch64 a result value
7736 pointer will be in x8. */
7737 int this_regno = R0_REGNUM;
7738 rtx this_rtx, temp0, temp1, addr, funexp;
7739 rtx_insn *insn;
7740 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7742 if (aarch64_bti_enabled ())
7743 emit_insn (gen_bti_c());
7745 reload_completed = 1;
7746 emit_note (NOTE_INSN_PROLOGUE_END);
7748 this_rtx = gen_rtx_REG (Pmode, this_regno);
7749 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7750 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7752 if (vcall_offset == 0)
7753 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7754 else
7756 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7758 addr = this_rtx;
7759 if (delta != 0)
7761 if (delta >= -256 && delta < 256)
7762 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7763 plus_constant (Pmode, this_rtx, delta));
7764 else
7765 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7766 temp1, temp0, false);
7769 if (Pmode == ptr_mode)
7770 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7771 else
7772 aarch64_emit_move (temp0,
7773 gen_rtx_ZERO_EXTEND (Pmode,
7774 gen_rtx_MEM (ptr_mode, addr)));
7776 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7777 addr = plus_constant (Pmode, temp0, vcall_offset);
7778 else
7780 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7781 Pmode);
7782 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7785 if (Pmode == ptr_mode)
7786 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
7787 else
7788 aarch64_emit_move (temp1,
7789 gen_rtx_SIGN_EXTEND (Pmode,
7790 gen_rtx_MEM (ptr_mode, addr)));
7792 emit_insn (gen_add2_insn (this_rtx, temp1));
7795 /* Generate a tail call to the target function. */
7796 if (!TREE_USED (function))
7798 assemble_external (function);
7799 TREE_USED (function) = 1;
7801 funexp = XEXP (DECL_RTL (function), 0);
7802 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7803 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7804 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7805 SIBLING_CALL_P (insn) = 1;
7807 insn = get_insns ();
7808 shorten_branches (insn);
7810 assemble_start_function (thunk, fnname);
7811 final_start_function (insn, file, 1);
7812 final (insn, file, 1);
7813 final_end_function ();
7814 assemble_end_function (thunk, fnname);
7816 /* Stop pretending to be a post-reload pass. */
7817 reload_completed = 0;
7820 static bool
7821 aarch64_tls_referenced_p (rtx x)
7823 if (!TARGET_HAVE_TLS)
7824 return false;
7825 subrtx_iterator::array_type array;
7826 FOR_EACH_SUBRTX (iter, array, x, ALL)
7828 const_rtx x = *iter;
7829 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7830 return true;
7831 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7832 TLS offsets, not real symbol references. */
7833 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7834 iter.skip_subrtxes ();
7836 return false;
7840 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7841 a left shift of 0 or 12 bits. */
7842 bool
7843 aarch64_uimm12_shift (HOST_WIDE_INT val)
7845 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7846 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
7850 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7851 that can be created with a left shift of 0 or 12. */
7852 static HOST_WIDE_INT
7853 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7855 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7856 handle correctly. */
7857 gcc_assert ((val & 0xffffff) == val);
7859 if (((val & 0xfff) << 0) == val)
7860 return val;
7862 return val & (0xfff << 12);
7865 /* Return true if val is an immediate that can be loaded into a
7866 register by a MOVZ instruction. */
7867 static bool
7868 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7870 if (GET_MODE_SIZE (mode) > 4)
7872 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7873 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7874 return 1;
7876 else
7878 /* Ignore sign extension. */
7879 val &= (HOST_WIDE_INT) 0xffffffff;
7881 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7882 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7885 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7886 64-bit (DImode) integer. */
7888 static unsigned HOST_WIDE_INT
7889 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7891 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7892 while (size < 64)
7894 val &= (HOST_WIDE_INT_1U << size) - 1;
7895 val |= val << size;
7896 size *= 2;
7898 return val;
7901 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7903 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7905 0x0000000100000001ull,
7906 0x0001000100010001ull,
7907 0x0101010101010101ull,
7908 0x1111111111111111ull,
7909 0x5555555555555555ull,
7913 /* Return true if val is a valid bitmask immediate. */
7915 bool
7916 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7918 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7919 int bits;
7921 /* Check for a single sequence of one bits and return quickly if so.
7922 The special cases of all ones and all zeroes returns false. */
7923 val = aarch64_replicate_bitmask_imm (val_in, mode);
7924 tmp = val + (val & -val);
7926 if (tmp == (tmp & -tmp))
7927 return (val + 1) > 1;
7929 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7930 if (mode == SImode)
7931 val = (val << 32) | (val & 0xffffffff);
7933 /* Invert if the immediate doesn't start with a zero bit - this means we
7934 only need to search for sequences of one bits. */
7935 if (val & 1)
7936 val = ~val;
7938 /* Find the first set bit and set tmp to val with the first sequence of one
7939 bits removed. Return success if there is a single sequence of ones. */
7940 first_one = val & -val;
7941 tmp = val & (val + first_one);
7943 if (tmp == 0)
7944 return true;
7946 /* Find the next set bit and compute the difference in bit position. */
7947 next_one = tmp & -tmp;
7948 bits = clz_hwi (first_one) - clz_hwi (next_one);
7949 mask = val ^ tmp;
7951 /* Check the bit position difference is a power of 2, and that the first
7952 sequence of one bits fits within 'bits' bits. */
7953 if ((mask >> bits) != 0 || bits != (bits & -bits))
7954 return false;
7956 /* Check the sequence of one bits is repeated 64/bits times. */
7957 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7960 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7961 Assumed precondition: VAL_IN Is not zero. */
7963 unsigned HOST_WIDE_INT
7964 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7966 int lowest_bit_set = ctz_hwi (val_in);
7967 int highest_bit_set = floor_log2 (val_in);
7968 gcc_assert (val_in != 0);
7970 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7971 (HOST_WIDE_INT_1U << lowest_bit_set));
7974 /* Create constant where bits outside of lowest bit set to highest bit set
7975 are set to 1. */
7977 unsigned HOST_WIDE_INT
7978 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7980 return val_in | ~aarch64_and_split_imm1 (val_in);
7983 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7985 bool
7986 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7988 scalar_int_mode int_mode;
7989 if (!is_a <scalar_int_mode> (mode, &int_mode))
7990 return false;
7992 if (aarch64_bitmask_imm (val_in, int_mode))
7993 return false;
7995 if (aarch64_move_imm (val_in, int_mode))
7996 return false;
7998 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8000 return aarch64_bitmask_imm (imm2, int_mode);
8003 /* Return true if val is an immediate that can be loaded into a
8004 register in a single instruction. */
8005 bool
8006 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8008 scalar_int_mode int_mode;
8009 if (!is_a <scalar_int_mode> (mode, &int_mode))
8010 return false;
8012 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8013 return 1;
8014 return aarch64_bitmask_imm (val, int_mode);
8017 static bool
8018 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8020 rtx base, offset;
8022 if (GET_CODE (x) == HIGH)
8023 return true;
8025 /* There's no way to calculate VL-based values using relocations. */
8026 subrtx_iterator::array_type array;
8027 FOR_EACH_SUBRTX (iter, array, x, ALL)
8028 if (GET_CODE (*iter) == CONST_POLY_INT)
8029 return true;
8031 split_const (x, &base, &offset);
8032 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8034 if (aarch64_classify_symbol (base, INTVAL (offset))
8035 != SYMBOL_FORCE_TO_MEM)
8036 return true;
8037 else
8038 /* Avoid generating a 64-bit relocation in ILP32; leave
8039 to aarch64_expand_mov_immediate to handle it properly. */
8040 return mode != ptr_mode;
8043 return aarch64_tls_referenced_p (x);
8046 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8047 The expansion for a table switch is quite expensive due to the number
8048 of instructions, the table lookup and hard to predict indirect jump.
8049 When optimizing for speed, and -O3 enabled, use the per-core tuning if
8050 set, otherwise use tables for > 16 cases as a tradeoff between size and
8051 performance. When optimizing for size, use the default setting. */
8053 static unsigned int
8054 aarch64_case_values_threshold (void)
8056 /* Use the specified limit for the number of cases before using jump
8057 tables at higher optimization levels. */
8058 if (optimize > 2
8059 && selected_cpu->tune->max_case_values != 0)
8060 return selected_cpu->tune->max_case_values;
8061 else
8062 return optimize_size ? default_case_values_threshold () : 17;
8065 /* Return true if register REGNO is a valid index register.
8066 STRICT_P is true if REG_OK_STRICT is in effect. */
8068 bool
8069 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8071 if (!HARD_REGISTER_NUM_P (regno))
8073 if (!strict_p)
8074 return true;
8076 if (!reg_renumber)
8077 return false;
8079 regno = reg_renumber[regno];
8081 return GP_REGNUM_P (regno);
8084 /* Return true if register REGNO is a valid base register for mode MODE.
8085 STRICT_P is true if REG_OK_STRICT is in effect. */
8087 bool
8088 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8090 if (!HARD_REGISTER_NUM_P (regno))
8092 if (!strict_p)
8093 return true;
8095 if (!reg_renumber)
8096 return false;
8098 regno = reg_renumber[regno];
8101 /* The fake registers will be eliminated to either the stack or
8102 hard frame pointer, both of which are usually valid base registers.
8103 Reload deals with the cases where the eliminated form isn't valid. */
8104 return (GP_REGNUM_P (regno)
8105 || regno == SP_REGNUM
8106 || regno == FRAME_POINTER_REGNUM
8107 || regno == ARG_POINTER_REGNUM);
8110 /* Return true if X is a valid base register for mode MODE.
8111 STRICT_P is true if REG_OK_STRICT is in effect. */
8113 static bool
8114 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8116 if (!strict_p
8117 && GET_CODE (x) == SUBREG
8118 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8119 x = SUBREG_REG (x);
8121 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8124 /* Return true if address offset is a valid index. If it is, fill in INFO
8125 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8127 static bool
8128 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8129 machine_mode mode, bool strict_p)
8131 enum aarch64_address_type type;
8132 rtx index;
8133 int shift;
8135 /* (reg:P) */
8136 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8137 && GET_MODE (x) == Pmode)
8139 type = ADDRESS_REG_REG;
8140 index = x;
8141 shift = 0;
8143 /* (sign_extend:DI (reg:SI)) */
8144 else if ((GET_CODE (x) == SIGN_EXTEND
8145 || GET_CODE (x) == ZERO_EXTEND)
8146 && GET_MODE (x) == DImode
8147 && GET_MODE (XEXP (x, 0)) == SImode)
8149 type = (GET_CODE (x) == SIGN_EXTEND)
8150 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8151 index = XEXP (x, 0);
8152 shift = 0;
8154 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8155 else if (GET_CODE (x) == MULT
8156 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8157 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8158 && GET_MODE (XEXP (x, 0)) == DImode
8159 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8160 && CONST_INT_P (XEXP (x, 1)))
8162 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8163 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8164 index = XEXP (XEXP (x, 0), 0);
8165 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8167 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8168 else if (GET_CODE (x) == ASHIFT
8169 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8170 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8171 && GET_MODE (XEXP (x, 0)) == DImode
8172 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8173 && CONST_INT_P (XEXP (x, 1)))
8175 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8176 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8177 index = XEXP (XEXP (x, 0), 0);
8178 shift = INTVAL (XEXP (x, 1));
8180 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8181 else if ((GET_CODE (x) == SIGN_EXTRACT
8182 || GET_CODE (x) == ZERO_EXTRACT)
8183 && GET_MODE (x) == DImode
8184 && GET_CODE (XEXP (x, 0)) == MULT
8185 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8186 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8188 type = (GET_CODE (x) == SIGN_EXTRACT)
8189 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8190 index = XEXP (XEXP (x, 0), 0);
8191 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8192 if (INTVAL (XEXP (x, 1)) != 32 + shift
8193 || INTVAL (XEXP (x, 2)) != 0)
8194 shift = -1;
8196 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8197 (const_int 0xffffffff<<shift)) */
8198 else if (GET_CODE (x) == AND
8199 && GET_MODE (x) == DImode
8200 && GET_CODE (XEXP (x, 0)) == MULT
8201 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8202 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8203 && CONST_INT_P (XEXP (x, 1)))
8205 type = ADDRESS_REG_UXTW;
8206 index = XEXP (XEXP (x, 0), 0);
8207 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8208 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8209 shift = -1;
8211 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8212 else if ((GET_CODE (x) == SIGN_EXTRACT
8213 || GET_CODE (x) == ZERO_EXTRACT)
8214 && GET_MODE (x) == DImode
8215 && GET_CODE (XEXP (x, 0)) == ASHIFT
8216 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8217 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8219 type = (GET_CODE (x) == SIGN_EXTRACT)
8220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8221 index = XEXP (XEXP (x, 0), 0);
8222 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8223 if (INTVAL (XEXP (x, 1)) != 32 + shift
8224 || INTVAL (XEXP (x, 2)) != 0)
8225 shift = -1;
8227 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8228 (const_int 0xffffffff<<shift)) */
8229 else if (GET_CODE (x) == AND
8230 && GET_MODE (x) == DImode
8231 && GET_CODE (XEXP (x, 0)) == ASHIFT
8232 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8233 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8234 && CONST_INT_P (XEXP (x, 1)))
8236 type = ADDRESS_REG_UXTW;
8237 index = XEXP (XEXP (x, 0), 0);
8238 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8239 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8240 shift = -1;
8242 /* (mult:P (reg:P) (const_int scale)) */
8243 else if (GET_CODE (x) == MULT
8244 && GET_MODE (x) == Pmode
8245 && GET_MODE (XEXP (x, 0)) == Pmode
8246 && CONST_INT_P (XEXP (x, 1)))
8248 type = ADDRESS_REG_REG;
8249 index = XEXP (x, 0);
8250 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8252 /* (ashift:P (reg:P) (const_int shift)) */
8253 else if (GET_CODE (x) == ASHIFT
8254 && GET_MODE (x) == Pmode
8255 && GET_MODE (XEXP (x, 0)) == Pmode
8256 && CONST_INT_P (XEXP (x, 1)))
8258 type = ADDRESS_REG_REG;
8259 index = XEXP (x, 0);
8260 shift = INTVAL (XEXP (x, 1));
8262 else
8263 return false;
8265 if (!strict_p
8266 && GET_CODE (index) == SUBREG
8267 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8268 index = SUBREG_REG (index);
8270 if (aarch64_sve_data_mode_p (mode))
8272 if (type != ADDRESS_REG_REG
8273 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8274 return false;
8276 else
8278 if (shift != 0
8279 && !(IN_RANGE (shift, 1, 3)
8280 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8281 return false;
8284 if (REG_P (index)
8285 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8287 info->type = type;
8288 info->offset = index;
8289 info->shift = shift;
8290 return true;
8293 return false;
8296 /* Return true if MODE is one of the modes for which we
8297 support LDP/STP operations. */
8299 static bool
8300 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8302 return mode == SImode || mode == DImode
8303 || mode == SFmode || mode == DFmode
8304 || (aarch64_vector_mode_supported_p (mode)
8305 && (known_eq (GET_MODE_SIZE (mode), 8)
8306 || (known_eq (GET_MODE_SIZE (mode), 16)
8307 && (aarch64_tune_params.extra_tuning_flags
8308 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8311 /* Return true if REGNO is a virtual pointer register, or an eliminable
8312 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8313 include stack_pointer or hard_frame_pointer. */
8314 static bool
8315 virt_or_elim_regno_p (unsigned regno)
8317 return ((regno >= FIRST_VIRTUAL_REGISTER
8318 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8319 || regno == FRAME_POINTER_REGNUM
8320 || regno == ARG_POINTER_REGNUM);
8323 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8324 If it is, fill in INFO appropriately. STRICT_P is true if
8325 REG_OK_STRICT is in effect. */
8327 bool
8328 aarch64_classify_address (struct aarch64_address_info *info,
8329 rtx x, machine_mode mode, bool strict_p,
8330 aarch64_addr_query_type type)
8332 enum rtx_code code = GET_CODE (x);
8333 rtx op0, op1;
8334 poly_int64 offset;
8336 HOST_WIDE_INT const_size;
8338 /* Whether a vector mode is partial doesn't affect address legitimacy.
8339 Partial vectors like VNx8QImode allow the same indexed addressing
8340 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8341 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8342 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8343 vec_flags &= ~VEC_PARTIAL;
8345 /* On BE, we use load/store pair for all large int mode load/stores.
8346 TI/TFmode may also use a load/store pair. */
8347 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8348 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8349 || type == ADDR_QUERY_LDP_STP_N
8350 || mode == TImode
8351 || mode == TFmode
8352 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8354 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
8355 corresponds to the actual size of the memory being loaded/stored and the
8356 mode of the corresponding addressing mode is half of that. */
8357 if (type == ADDR_QUERY_LDP_STP_N
8358 && known_eq (GET_MODE_SIZE (mode), 16))
8359 mode = DFmode;
8361 bool allow_reg_index_p = (!load_store_pair_p
8362 && (known_lt (GET_MODE_SIZE (mode), 16)
8363 || vec_flags == VEC_ADVSIMD
8364 || vec_flags & VEC_SVE_DATA));
8366 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8367 [Rn, #offset, MUL VL]. */
8368 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8369 && (code != REG && code != PLUS))
8370 return false;
8372 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8373 REG addressing. */
8374 if (advsimd_struct_p
8375 && !BYTES_BIG_ENDIAN
8376 && (code != POST_INC && code != REG))
8377 return false;
8379 gcc_checking_assert (GET_MODE (x) == VOIDmode
8380 || SCALAR_INT_MODE_P (GET_MODE (x)));
8382 switch (code)
8384 case REG:
8385 case SUBREG:
8386 info->type = ADDRESS_REG_IMM;
8387 info->base = x;
8388 info->offset = const0_rtx;
8389 info->const_offset = 0;
8390 return aarch64_base_register_rtx_p (x, strict_p);
8392 case PLUS:
8393 op0 = XEXP (x, 0);
8394 op1 = XEXP (x, 1);
8396 if (! strict_p
8397 && REG_P (op0)
8398 && virt_or_elim_regno_p (REGNO (op0))
8399 && poly_int_rtx_p (op1, &offset))
8401 info->type = ADDRESS_REG_IMM;
8402 info->base = op0;
8403 info->offset = op1;
8404 info->const_offset = offset;
8406 return true;
8409 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8410 && aarch64_base_register_rtx_p (op0, strict_p)
8411 && poly_int_rtx_p (op1, &offset))
8413 info->type = ADDRESS_REG_IMM;
8414 info->base = op0;
8415 info->offset = op1;
8416 info->const_offset = offset;
8418 /* TImode and TFmode values are allowed in both pairs of X
8419 registers and individual Q registers. The available
8420 address modes are:
8421 X,X: 7-bit signed scaled offset
8422 Q: 9-bit signed offset
8423 We conservatively require an offset representable in either mode.
8424 When performing the check for pairs of X registers i.e. LDP/STP
8425 pass down DImode since that is the natural size of the LDP/STP
8426 instruction memory accesses. */
8427 if (mode == TImode || mode == TFmode)
8428 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8429 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8430 || offset_12bit_unsigned_scaled_p (mode, offset)));
8432 /* A 7bit offset check because OImode will emit a ldp/stp
8433 instruction (only big endian will get here).
8434 For ldp/stp instructions, the offset is scaled for the size of a
8435 single element of the pair. */
8436 if (mode == OImode)
8437 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8439 /* Three 9/12 bit offsets checks because CImode will emit three
8440 ldr/str instructions (only big endian will get here). */
8441 if (mode == CImode)
8442 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8443 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8444 offset + 32)
8445 || offset_12bit_unsigned_scaled_p (V16QImode,
8446 offset + 32)));
8448 /* Two 7bit offsets checks because XImode will emit two ldp/stp
8449 instructions (only big endian will get here). */
8450 if (mode == XImode)
8451 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8452 && aarch64_offset_7bit_signed_scaled_p (TImode,
8453 offset + 32));
8455 /* Make "m" use the LD1 offset range for SVE data modes, so
8456 that pre-RTL optimizers like ivopts will work to that
8457 instead of the wider LDR/STR range. */
8458 if (vec_flags == VEC_SVE_DATA)
8459 return (type == ADDR_QUERY_M
8460 ? offset_4bit_signed_scaled_p (mode, offset)
8461 : offset_9bit_signed_scaled_p (mode, offset));
8463 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8465 poly_int64 end_offset = (offset
8466 + GET_MODE_SIZE (mode)
8467 - BYTES_PER_SVE_VECTOR);
8468 return (type == ADDR_QUERY_M
8469 ? offset_4bit_signed_scaled_p (mode, offset)
8470 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8471 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8472 end_offset)));
8475 if (vec_flags == VEC_SVE_PRED)
8476 return offset_9bit_signed_scaled_p (mode, offset);
8478 if (load_store_pair_p)
8479 return ((known_eq (GET_MODE_SIZE (mode), 4)
8480 || known_eq (GET_MODE_SIZE (mode), 8)
8481 || known_eq (GET_MODE_SIZE (mode), 16))
8482 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8483 else
8484 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8485 || offset_12bit_unsigned_scaled_p (mode, offset));
8488 if (allow_reg_index_p)
8490 /* Look for base + (scaled/extended) index register. */
8491 if (aarch64_base_register_rtx_p (op0, strict_p)
8492 && aarch64_classify_index (info, op1, mode, strict_p))
8494 info->base = op0;
8495 return true;
8497 if (aarch64_base_register_rtx_p (op1, strict_p)
8498 && aarch64_classify_index (info, op0, mode, strict_p))
8500 info->base = op1;
8501 return true;
8505 return false;
8507 case POST_INC:
8508 case POST_DEC:
8509 case PRE_INC:
8510 case PRE_DEC:
8511 info->type = ADDRESS_REG_WB;
8512 info->base = XEXP (x, 0);
8513 info->offset = NULL_RTX;
8514 return aarch64_base_register_rtx_p (info->base, strict_p);
8516 case POST_MODIFY:
8517 case PRE_MODIFY:
8518 info->type = ADDRESS_REG_WB;
8519 info->base = XEXP (x, 0);
8520 if (GET_CODE (XEXP (x, 1)) == PLUS
8521 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8522 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8523 && aarch64_base_register_rtx_p (info->base, strict_p))
8525 info->offset = XEXP (XEXP (x, 1), 1);
8526 info->const_offset = offset;
8528 /* TImode and TFmode values are allowed in both pairs of X
8529 registers and individual Q registers. The available
8530 address modes are:
8531 X,X: 7-bit signed scaled offset
8532 Q: 9-bit signed offset
8533 We conservatively require an offset representable in either mode.
8535 if (mode == TImode || mode == TFmode)
8536 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8537 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8539 if (load_store_pair_p)
8540 return ((known_eq (GET_MODE_SIZE (mode), 4)
8541 || known_eq (GET_MODE_SIZE (mode), 8)
8542 || known_eq (GET_MODE_SIZE (mode), 16))
8543 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8544 else
8545 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8547 return false;
8549 case CONST:
8550 case SYMBOL_REF:
8551 case LABEL_REF:
8552 /* load literal: pc-relative constant pool entry. Only supported
8553 for SI mode or larger. */
8554 info->type = ADDRESS_SYMBOLIC;
8556 if (!load_store_pair_p
8557 && GET_MODE_SIZE (mode).is_constant (&const_size)
8558 && const_size >= 4)
8560 rtx sym, addend;
8562 split_const (x, &sym, &addend);
8563 return ((GET_CODE (sym) == LABEL_REF
8564 || (GET_CODE (sym) == SYMBOL_REF
8565 && CONSTANT_POOL_ADDRESS_P (sym)
8566 && aarch64_pcrelative_literal_loads)));
8568 return false;
8570 case LO_SUM:
8571 info->type = ADDRESS_LO_SUM;
8572 info->base = XEXP (x, 0);
8573 info->offset = XEXP (x, 1);
8574 if (allow_reg_index_p
8575 && aarch64_base_register_rtx_p (info->base, strict_p))
8577 rtx sym, offs;
8578 split_const (info->offset, &sym, &offs);
8579 if (GET_CODE (sym) == SYMBOL_REF
8580 && (aarch64_classify_symbol (sym, INTVAL (offs))
8581 == SYMBOL_SMALL_ABSOLUTE))
8583 /* The symbol and offset must be aligned to the access size. */
8584 unsigned int align;
8586 if (CONSTANT_POOL_ADDRESS_P (sym))
8587 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8588 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8590 tree exp = SYMBOL_REF_DECL (sym);
8591 align = TYPE_ALIGN (TREE_TYPE (exp));
8592 align = aarch64_constant_alignment (exp, align);
8594 else if (SYMBOL_REF_DECL (sym))
8595 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8596 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8597 && SYMBOL_REF_BLOCK (sym) != NULL)
8598 align = SYMBOL_REF_BLOCK (sym)->alignment;
8599 else
8600 align = BITS_PER_UNIT;
8602 poly_int64 ref_size = GET_MODE_SIZE (mode);
8603 if (known_eq (ref_size, 0))
8604 ref_size = GET_MODE_SIZE (DImode);
8606 return (multiple_p (INTVAL (offs), ref_size)
8607 && multiple_p (align / BITS_PER_UNIT, ref_size));
8610 return false;
8612 default:
8613 return false;
8617 /* Return true if the address X is valid for a PRFM instruction.
8618 STRICT_P is true if we should do strict checking with
8619 aarch64_classify_address. */
8621 bool
8622 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8624 struct aarch64_address_info addr;
8626 /* PRFM accepts the same addresses as DImode... */
8627 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8628 if (!res)
8629 return false;
8631 /* ... except writeback forms. */
8632 return addr.type != ADDRESS_REG_WB;
8635 bool
8636 aarch64_symbolic_address_p (rtx x)
8638 rtx offset;
8640 split_const (x, &x, &offset);
8641 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8644 /* Classify the base of symbolic expression X. */
8646 enum aarch64_symbol_type
8647 aarch64_classify_symbolic_expression (rtx x)
8649 rtx offset;
8651 split_const (x, &x, &offset);
8652 return aarch64_classify_symbol (x, INTVAL (offset));
8656 /* Return TRUE if X is a legitimate address for accessing memory in
8657 mode MODE. */
8658 static bool
8659 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8661 struct aarch64_address_info addr;
8663 return aarch64_classify_address (&addr, x, mode, strict_p);
8666 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8667 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8668 bool
8669 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8670 aarch64_addr_query_type type)
8672 struct aarch64_address_info addr;
8674 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8677 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8679 static bool
8680 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8681 poly_int64 orig_offset,
8682 machine_mode mode)
8684 HOST_WIDE_INT size;
8685 if (GET_MODE_SIZE (mode).is_constant (&size))
8687 HOST_WIDE_INT const_offset, second_offset;
8689 /* A general SVE offset is A * VQ + B. Remove the A component from
8690 coefficient 0 in order to get the constant B. */
8691 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8693 /* Split an out-of-range address displacement into a base and
8694 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8695 range otherwise to increase opportunities for sharing the base
8696 address of different sizes. Unaligned accesses use the signed
8697 9-bit range, TImode/TFmode use the intersection of signed
8698 scaled 7-bit and signed 9-bit offset. */
8699 if (mode == TImode || mode == TFmode)
8700 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8701 else if ((const_offset & (size - 1)) != 0)
8702 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8703 else
8704 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8706 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8707 return false;
8709 /* Split the offset into second_offset and the rest. */
8710 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8711 *offset2 = gen_int_mode (second_offset, Pmode);
8712 return true;
8714 else
8716 /* Get the mode we should use as the basis of the range. For structure
8717 modes this is the mode of one vector. */
8718 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8719 machine_mode step_mode
8720 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8722 /* Get the "mul vl" multiplier we'd like to use. */
8723 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8724 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8725 if (vec_flags & VEC_SVE_DATA)
8726 /* LDR supports a 9-bit range, but the move patterns for
8727 structure modes require all vectors to be in range of the
8728 same base. The simplest way of accomodating that while still
8729 promoting reuse of anchor points between different modes is
8730 to use an 8-bit range unconditionally. */
8731 vnum = ((vnum + 128) & 255) - 128;
8732 else
8733 /* Predicates are only handled singly, so we might as well use
8734 the full range. */
8735 vnum = ((vnum + 256) & 511) - 256;
8736 if (vnum == 0)
8737 return false;
8739 /* Convert the "mul vl" multiplier into a byte offset. */
8740 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8741 if (known_eq (second_offset, orig_offset))
8742 return false;
8744 /* Split the offset into second_offset and the rest. */
8745 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8746 *offset2 = gen_int_mode (second_offset, Pmode);
8747 return true;
8751 /* Return the binary representation of floating point constant VALUE in INTVAL.
8752 If the value cannot be converted, return false without setting INTVAL.
8753 The conversion is done in the given MODE. */
8754 bool
8755 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8758 /* We make a general exception for 0. */
8759 if (aarch64_float_const_zero_rtx_p (value))
8761 *intval = 0;
8762 return true;
8765 scalar_float_mode mode;
8766 if (GET_CODE (value) != CONST_DOUBLE
8767 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8768 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8769 /* Only support up to DF mode. */
8770 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8771 return false;
8773 unsigned HOST_WIDE_INT ival = 0;
8775 long res[2];
8776 real_to_target (res,
8777 CONST_DOUBLE_REAL_VALUE (value),
8778 REAL_MODE_FORMAT (mode));
8780 if (mode == DFmode)
8782 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8783 ival = zext_hwi (res[order], 32);
8784 ival |= (zext_hwi (res[1 - order], 32) << 32);
8786 else
8787 ival = zext_hwi (res[0], 32);
8789 *intval = ival;
8790 return true;
8793 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8794 single MOV(+MOVK) followed by an FMOV. */
8795 bool
8796 aarch64_float_const_rtx_p (rtx x)
8798 machine_mode mode = GET_MODE (x);
8799 if (mode == VOIDmode)
8800 return false;
8802 /* Determine whether it's cheaper to write float constants as
8803 mov/movk pairs over ldr/adrp pairs. */
8804 unsigned HOST_WIDE_INT ival;
8806 if (GET_CODE (x) == CONST_DOUBLE
8807 && SCALAR_FLOAT_MODE_P (mode)
8808 && aarch64_reinterpret_float_as_int (x, &ival))
8810 scalar_int_mode imode = (mode == HFmode
8811 ? SImode
8812 : int_mode_for_mode (mode).require ());
8813 int num_instr = aarch64_internal_mov_immediate
8814 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8815 return num_instr < 3;
8818 return false;
8821 /* Return TRUE if rtx X is immediate constant 0.0 */
8822 bool
8823 aarch64_float_const_zero_rtx_p (rtx x)
8825 if (GET_MODE (x) == VOIDmode)
8826 return false;
8828 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8829 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8830 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8833 /* Return TRUE if rtx X is immediate constant that fits in a single
8834 MOVI immediate operation. */
8835 bool
8836 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8838 if (!TARGET_SIMD)
8839 return false;
8841 machine_mode vmode;
8842 scalar_int_mode imode;
8843 unsigned HOST_WIDE_INT ival;
8845 if (GET_CODE (x) == CONST_DOUBLE
8846 && SCALAR_FLOAT_MODE_P (mode))
8848 if (!aarch64_reinterpret_float_as_int (x, &ival))
8849 return false;
8851 /* We make a general exception for 0. */
8852 if (aarch64_float_const_zero_rtx_p (x))
8853 return true;
8855 imode = int_mode_for_mode (mode).require ();
8857 else if (GET_CODE (x) == CONST_INT
8858 && is_a <scalar_int_mode> (mode, &imode))
8859 ival = INTVAL (x);
8860 else
8861 return false;
8863 /* use a 64 bit mode for everything except for DI/DF mode, where we use
8864 a 128 bit vector mode. */
8865 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8867 vmode = aarch64_simd_container_mode (imode, width);
8868 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8870 return aarch64_simd_valid_immediate (v_op, NULL);
8874 /* Return the fixed registers used for condition codes. */
8876 static bool
8877 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8879 *p1 = CC_REGNUM;
8880 *p2 = INVALID_REGNUM;
8881 return true;
8884 /* This function is used by the call expanders of the machine description.
8885 RESULT is the register in which the result is returned. It's NULL for
8886 "call" and "sibcall".
8887 MEM is the location of the function call.
8888 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8889 SIBCALL indicates whether this function call is normal call or sibling call.
8890 It will generate different pattern accordingly. */
8892 void
8893 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8895 rtx call, callee, tmp;
8896 rtvec vec;
8897 machine_mode mode;
8899 gcc_assert (MEM_P (mem));
8900 callee = XEXP (mem, 0);
8901 mode = GET_MODE (callee);
8902 gcc_assert (mode == Pmode);
8904 /* Decide if we should generate indirect calls by loading the
8905 address of the callee into a register before performing
8906 the branch-and-link. */
8907 if (SYMBOL_REF_P (callee)
8908 ? (aarch64_is_long_call_p (callee)
8909 || aarch64_is_noplt_call_p (callee))
8910 : !REG_P (callee))
8911 XEXP (mem, 0) = force_reg (mode, callee);
8913 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8915 if (result != NULL_RTX)
8916 call = gen_rtx_SET (result, call);
8918 if (sibcall)
8919 tmp = ret_rtx;
8920 else
8921 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8923 gcc_assert (CONST_INT_P (callee_abi));
8924 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8925 UNSPEC_CALLEE_ABI);
8927 vec = gen_rtvec (3, call, callee_abi, tmp);
8928 call = gen_rtx_PARALLEL (VOIDmode, vec);
8930 aarch64_emit_call_insn (call);
8933 /* Emit call insn with PAT and do aarch64-specific handling. */
8935 void
8936 aarch64_emit_call_insn (rtx pat)
8938 rtx insn = emit_call_insn (pat);
8940 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8941 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8942 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8945 machine_mode
8946 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8948 machine_mode mode_x = GET_MODE (x);
8949 rtx_code code_x = GET_CODE (x);
8951 /* All floating point compares return CCFP if it is an equality
8952 comparison, and CCFPE otherwise. */
8953 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8955 switch (code)
8957 case EQ:
8958 case NE:
8959 case UNORDERED:
8960 case ORDERED:
8961 case UNLT:
8962 case UNLE:
8963 case UNGT:
8964 case UNGE:
8965 case UNEQ:
8966 return CCFPmode;
8968 case LT:
8969 case LE:
8970 case GT:
8971 case GE:
8972 case LTGT:
8973 return CCFPEmode;
8975 default:
8976 gcc_unreachable ();
8980 /* Equality comparisons of short modes against zero can be performed
8981 using the TST instruction with the appropriate bitmask. */
8982 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8983 && (code == EQ || code == NE)
8984 && (mode_x == HImode || mode_x == QImode))
8985 return CC_NZmode;
8987 /* Similarly, comparisons of zero_extends from shorter modes can
8988 be performed using an ANDS with an immediate mask. */
8989 if (y == const0_rtx && code_x == ZERO_EXTEND
8990 && (mode_x == SImode || mode_x == DImode)
8991 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8992 && (code == EQ || code == NE))
8993 return CC_NZmode;
8995 if ((mode_x == SImode || mode_x == DImode)
8996 && y == const0_rtx
8997 && (code == EQ || code == NE || code == LT || code == GE)
8998 && (code_x == PLUS || code_x == MINUS || code_x == AND
8999 || code_x == NEG
9000 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9001 && CONST_INT_P (XEXP (x, 2)))))
9002 return CC_NZmode;
9004 /* A compare with a shifted operand. Because of canonicalization,
9005 the comparison will have to be swapped when we emit the assembly
9006 code. */
9007 if ((mode_x == SImode || mode_x == DImode)
9008 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9009 && (code_x == ASHIFT || code_x == ASHIFTRT
9010 || code_x == LSHIFTRT
9011 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9012 return CC_SWPmode;
9014 /* Similarly for a negated operand, but we can only do this for
9015 equalities. */
9016 if ((mode_x == SImode || mode_x == DImode)
9017 && (REG_P (y) || GET_CODE (y) == SUBREG)
9018 && (code == EQ || code == NE)
9019 && code_x == NEG)
9020 return CC_Zmode;
9022 /* A test for unsigned overflow from an addition. */
9023 if ((mode_x == DImode || mode_x == TImode)
9024 && (code == LTU || code == GEU)
9025 && code_x == PLUS
9026 && rtx_equal_p (XEXP (x, 0), y))
9027 return CC_Cmode;
9029 /* A test for unsigned overflow from an add with carry. */
9030 if ((mode_x == DImode || mode_x == TImode)
9031 && (code == LTU || code == GEU)
9032 && code_x == PLUS
9033 && CONST_SCALAR_INT_P (y)
9034 && (rtx_mode_t (y, mode_x)
9035 == (wi::shwi (1, mode_x)
9036 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9037 return CC_ADCmode;
9039 /* A test for signed overflow. */
9040 if ((mode_x == DImode || mode_x == TImode)
9041 && code == NE
9042 && code_x == PLUS
9043 && GET_CODE (y) == SIGN_EXTEND)
9044 return CC_Vmode;
9046 /* For everything else, return CCmode. */
9047 return CCmode;
9050 static int
9051 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9054 aarch64_get_condition_code (rtx x)
9056 machine_mode mode = GET_MODE (XEXP (x, 0));
9057 enum rtx_code comp_code = GET_CODE (x);
9059 if (GET_MODE_CLASS (mode) != MODE_CC)
9060 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9061 return aarch64_get_condition_code_1 (mode, comp_code);
9064 static int
9065 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9067 switch (mode)
9069 case E_CCFPmode:
9070 case E_CCFPEmode:
9071 switch (comp_code)
9073 case GE: return AARCH64_GE;
9074 case GT: return AARCH64_GT;
9075 case LE: return AARCH64_LS;
9076 case LT: return AARCH64_MI;
9077 case NE: return AARCH64_NE;
9078 case EQ: return AARCH64_EQ;
9079 case ORDERED: return AARCH64_VC;
9080 case UNORDERED: return AARCH64_VS;
9081 case UNLT: return AARCH64_LT;
9082 case UNLE: return AARCH64_LE;
9083 case UNGT: return AARCH64_HI;
9084 case UNGE: return AARCH64_PL;
9085 default: return -1;
9087 break;
9089 case E_CCmode:
9090 switch (comp_code)
9092 case NE: return AARCH64_NE;
9093 case EQ: return AARCH64_EQ;
9094 case GE: return AARCH64_GE;
9095 case GT: return AARCH64_GT;
9096 case LE: return AARCH64_LE;
9097 case LT: return AARCH64_LT;
9098 case GEU: return AARCH64_CS;
9099 case GTU: return AARCH64_HI;
9100 case LEU: return AARCH64_LS;
9101 case LTU: return AARCH64_CC;
9102 default: return -1;
9104 break;
9106 case E_CC_SWPmode:
9107 switch (comp_code)
9109 case NE: return AARCH64_NE;
9110 case EQ: return AARCH64_EQ;
9111 case GE: return AARCH64_LE;
9112 case GT: return AARCH64_LT;
9113 case LE: return AARCH64_GE;
9114 case LT: return AARCH64_GT;
9115 case GEU: return AARCH64_LS;
9116 case GTU: return AARCH64_CC;
9117 case LEU: return AARCH64_CS;
9118 case LTU: return AARCH64_HI;
9119 default: return -1;
9121 break;
9123 case E_CC_NZCmode:
9124 switch (comp_code)
9126 case NE: return AARCH64_NE; /* = any */
9127 case EQ: return AARCH64_EQ; /* = none */
9128 case GE: return AARCH64_PL; /* = nfrst */
9129 case LT: return AARCH64_MI; /* = first */
9130 case GEU: return AARCH64_CS; /* = nlast */
9131 case GTU: return AARCH64_HI; /* = pmore */
9132 case LEU: return AARCH64_LS; /* = plast */
9133 case LTU: return AARCH64_CC; /* = last */
9134 default: return -1;
9136 break;
9138 case E_CC_NZmode:
9139 switch (comp_code)
9141 case NE: return AARCH64_NE;
9142 case EQ: return AARCH64_EQ;
9143 case GE: return AARCH64_PL;
9144 case LT: return AARCH64_MI;
9145 default: return -1;
9147 break;
9149 case E_CC_Zmode:
9150 switch (comp_code)
9152 case NE: return AARCH64_NE;
9153 case EQ: return AARCH64_EQ;
9154 default: return -1;
9156 break;
9158 case E_CC_Cmode:
9159 switch (comp_code)
9161 case LTU: return AARCH64_CS;
9162 case GEU: return AARCH64_CC;
9163 default: return -1;
9165 break;
9167 case E_CC_ADCmode:
9168 switch (comp_code)
9170 case GEU: return AARCH64_CS;
9171 case LTU: return AARCH64_CC;
9172 default: return -1;
9174 break;
9176 case E_CC_Vmode:
9177 switch (comp_code)
9179 case NE: return AARCH64_VS;
9180 case EQ: return AARCH64_VC;
9181 default: return -1;
9183 break;
9185 default:
9186 return -1;
9189 return -1;
9192 bool
9193 aarch64_const_vec_all_same_in_range_p (rtx x,
9194 HOST_WIDE_INT minval,
9195 HOST_WIDE_INT maxval)
9197 rtx elt;
9198 return (const_vec_duplicate_p (x, &elt)
9199 && CONST_INT_P (elt)
9200 && IN_RANGE (INTVAL (elt), minval, maxval));
9203 bool
9204 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9206 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9209 /* Return true if VEC is a constant in which every element is in the range
9210 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9212 static bool
9213 aarch64_const_vec_all_in_range_p (rtx vec,
9214 HOST_WIDE_INT minval,
9215 HOST_WIDE_INT maxval)
9217 if (GET_CODE (vec) != CONST_VECTOR
9218 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9219 return false;
9221 int nunits;
9222 if (!CONST_VECTOR_STEPPED_P (vec))
9223 nunits = const_vector_encoded_nelts (vec);
9224 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9225 return false;
9227 for (int i = 0; i < nunits; i++)
9229 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9230 if (!CONST_INT_P (vec_elem)
9231 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9232 return false;
9234 return true;
9237 /* N Z C V. */
9238 #define AARCH64_CC_V 1
9239 #define AARCH64_CC_C (1 << 1)
9240 #define AARCH64_CC_Z (1 << 2)
9241 #define AARCH64_CC_N (1 << 3)
9243 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9244 static const int aarch64_nzcv_codes[] =
9246 0, /* EQ, Z == 1. */
9247 AARCH64_CC_Z, /* NE, Z == 0. */
9248 0, /* CS, C == 1. */
9249 AARCH64_CC_C, /* CC, C == 0. */
9250 0, /* MI, N == 1. */
9251 AARCH64_CC_N, /* PL, N == 0. */
9252 0, /* VS, V == 1. */
9253 AARCH64_CC_V, /* VC, V == 0. */
9254 0, /* HI, C ==1 && Z == 0. */
9255 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9256 AARCH64_CC_V, /* GE, N == V. */
9257 0, /* LT, N != V. */
9258 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9259 0, /* LE, !(Z == 0 && N == V). */
9260 0, /* AL, Any. */
9261 0 /* NV, Any. */
9264 /* Print floating-point vector immediate operand X to F, negating it
9265 first if NEGATE is true. Return true on success, false if it isn't
9266 a constant we can handle. */
9268 static bool
9269 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9271 rtx elt;
9273 if (!const_vec_duplicate_p (x, &elt))
9274 return false;
9276 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9277 if (negate)
9278 r = real_value_negate (&r);
9280 /* Handle the SVE single-bit immediates specially, since they have a
9281 fixed form in the assembly syntax. */
9282 if (real_equal (&r, &dconst0))
9283 asm_fprintf (f, "0.0");
9284 else if (real_equal (&r, &dconst2))
9285 asm_fprintf (f, "2.0");
9286 else if (real_equal (&r, &dconst1))
9287 asm_fprintf (f, "1.0");
9288 else if (real_equal (&r, &dconsthalf))
9289 asm_fprintf (f, "0.5");
9290 else
9292 const int buf_size = 20;
9293 char float_buf[buf_size] = {'\0'};
9294 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9295 1, GET_MODE (elt));
9296 asm_fprintf (f, "%s", float_buf);
9299 return true;
9302 /* Return the equivalent letter for size. */
9303 static char
9304 sizetochar (int size)
9306 switch (size)
9308 case 64: return 'd';
9309 case 32: return 's';
9310 case 16: return 'h';
9311 case 8 : return 'b';
9312 default: gcc_unreachable ();
9316 /* Print operand X to file F in a target specific manner according to CODE.
9317 The acceptable formatting commands given by CODE are:
9318 'c': An integer or symbol address without a preceding #
9319 sign.
9320 'C': Take the duplicated element in a vector constant
9321 and print it in hex.
9322 'D': Take the duplicated element in a vector constant
9323 and print it as an unsigned integer, in decimal.
9324 'e': Print the sign/zero-extend size as a character 8->b,
9325 16->h, 32->w. Can also be used for masks:
9326 0xff->b, 0xffff->h, 0xffffffff->w.
9327 'I': If the operand is a duplicated vector constant,
9328 replace it with the duplicated scalar. If the
9329 operand is then a floating-point constant, replace
9330 it with the integer bit representation. Print the
9331 transformed constant as a signed decimal number.
9332 'p': Prints N such that 2^N == X (X must be power of 2 and
9333 const int).
9334 'P': Print the number of non-zero bits in X (a const_int).
9335 'H': Print the higher numbered register of a pair (TImode)
9336 of regs.
9337 'm': Print a condition (eq, ne, etc).
9338 'M': Same as 'm', but invert condition.
9339 'N': Take the duplicated element in a vector constant
9340 and print the negative of it in decimal.
9341 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9342 'S/T/U/V': Print a FP/SIMD register name for a register list.
9343 The register printed is the FP/SIMD register name
9344 of X + 0/1/2/3 for S/T/U/V.
9345 'R': Print a scalar Integer/FP/SIMD register name + 1.
9346 'X': Print bottom 16 bits of integer constant in hex.
9347 'w/x': Print a general register name or the zero register
9348 (32-bit or 64-bit).
9349 '0': Print a normal operand, if it's a general register,
9350 then we assume DImode.
9351 'k': Print NZCV for conditional compare instructions.
9352 'A': Output address constant representing the first
9353 argument of X, specifying a relocation offset
9354 if appropriate.
9355 'L': Output constant address specified by X
9356 with a relocation offset if appropriate.
9357 'G': Prints address of X, specifying a PC relative
9358 relocation mode if appropriate.
9359 'y': Output address of LDP or STP - this is used for
9360 some LDP/STPs which don't use a PARALLEL in their
9361 pattern (so the mode needs to be adjusted).
9362 'z': Output address of a typical LDP or STP. */
9364 static void
9365 aarch64_print_operand (FILE *f, rtx x, int code)
9367 rtx elt;
9368 switch (code)
9370 case 'c':
9371 switch (GET_CODE (x))
9373 case CONST_INT:
9374 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9375 break;
9377 case SYMBOL_REF:
9378 output_addr_const (f, x);
9379 break;
9381 case CONST:
9382 if (GET_CODE (XEXP (x, 0)) == PLUS
9383 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9385 output_addr_const (f, x);
9386 break;
9388 /* Fall through. */
9390 default:
9391 output_operand_lossage ("unsupported operand for code '%c'", code);
9393 break;
9395 case 'e':
9397 x = unwrap_const_vec_duplicate (x);
9398 if (!CONST_INT_P (x))
9400 output_operand_lossage ("invalid operand for '%%%c'", code);
9401 return;
9404 HOST_WIDE_INT val = INTVAL (x);
9405 if ((val & ~7) == 8 || val == 0xff)
9406 fputc ('b', f);
9407 else if ((val & ~7) == 16 || val == 0xffff)
9408 fputc ('h', f);
9409 else if ((val & ~7) == 32 || val == 0xffffffff)
9410 fputc ('w', f);
9411 else
9413 output_operand_lossage ("invalid operand for '%%%c'", code);
9414 return;
9417 break;
9419 case 'p':
9421 int n;
9423 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9425 output_operand_lossage ("invalid operand for '%%%c'", code);
9426 return;
9429 asm_fprintf (f, "%d", n);
9431 break;
9433 case 'P':
9434 if (!CONST_INT_P (x))
9436 output_operand_lossage ("invalid operand for '%%%c'", code);
9437 return;
9440 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9441 break;
9443 case 'H':
9444 if (x == const0_rtx)
9446 asm_fprintf (f, "xzr");
9447 break;
9450 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9452 output_operand_lossage ("invalid operand for '%%%c'", code);
9453 return;
9456 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9457 break;
9459 case 'I':
9461 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9462 if (CONST_INT_P (x))
9463 asm_fprintf (f, "%wd", INTVAL (x));
9464 else
9466 output_operand_lossage ("invalid operand for '%%%c'", code);
9467 return;
9469 break;
9472 case 'M':
9473 case 'm':
9475 int cond_code;
9476 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9477 if (x == const_true_rtx)
9479 if (code == 'M')
9480 fputs ("nv", f);
9481 return;
9484 if (!COMPARISON_P (x))
9486 output_operand_lossage ("invalid operand for '%%%c'", code);
9487 return;
9490 cond_code = aarch64_get_condition_code (x);
9491 gcc_assert (cond_code >= 0);
9492 if (code == 'M')
9493 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9494 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9495 fputs (aarch64_sve_condition_codes[cond_code], f);
9496 else
9497 fputs (aarch64_condition_codes[cond_code], f);
9499 break;
9501 case 'N':
9502 if (!const_vec_duplicate_p (x, &elt))
9504 output_operand_lossage ("invalid vector constant");
9505 return;
9508 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9509 asm_fprintf (f, "%wd", -INTVAL (elt));
9510 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9511 && aarch64_print_vector_float_operand (f, x, true))
9513 else
9515 output_operand_lossage ("invalid vector constant");
9516 return;
9518 break;
9520 case 'b':
9521 case 'h':
9522 case 's':
9523 case 'd':
9524 case 'q':
9525 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9527 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9528 return;
9530 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9531 break;
9533 case 'S':
9534 case 'T':
9535 case 'U':
9536 case 'V':
9537 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9539 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9540 return;
9542 asm_fprintf (f, "%c%d",
9543 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9544 REGNO (x) - V0_REGNUM + (code - 'S'));
9545 break;
9547 case 'R':
9548 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9549 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9550 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9551 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9552 else
9553 output_operand_lossage ("incompatible register operand for '%%%c'",
9554 code);
9555 break;
9557 case 'X':
9558 if (!CONST_INT_P (x))
9560 output_operand_lossage ("invalid operand for '%%%c'", code);
9561 return;
9563 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9564 break;
9566 case 'C':
9568 /* Print a replicated constant in hex. */
9569 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9571 output_operand_lossage ("invalid operand for '%%%c'", code);
9572 return;
9574 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9575 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9577 break;
9579 case 'D':
9581 /* Print a replicated constant in decimal, treating it as
9582 unsigned. */
9583 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9585 output_operand_lossage ("invalid operand for '%%%c'", code);
9586 return;
9588 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9589 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9591 break;
9593 case 'w':
9594 case 'x':
9595 if (x == const0_rtx
9596 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9598 asm_fprintf (f, "%czr", code);
9599 break;
9602 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9604 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9605 break;
9608 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9610 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9611 break;
9614 /* Fall through */
9616 case 0:
9617 if (x == NULL)
9619 output_operand_lossage ("missing operand");
9620 return;
9623 switch (GET_CODE (x))
9625 case REG:
9626 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9628 if (REG_NREGS (x) == 1)
9629 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9630 else
9632 char suffix
9633 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9634 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9635 REGNO (x) - V0_REGNUM, suffix,
9636 END_REGNO (x) - V0_REGNUM - 1, suffix);
9639 else
9640 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9641 break;
9643 case MEM:
9644 output_address (GET_MODE (x), XEXP (x, 0));
9645 break;
9647 case LABEL_REF:
9648 case SYMBOL_REF:
9649 output_addr_const (asm_out_file, x);
9650 break;
9652 case CONST_INT:
9653 asm_fprintf (f, "%wd", INTVAL (x));
9654 break;
9656 case CONST:
9657 if (!VECTOR_MODE_P (GET_MODE (x)))
9659 output_addr_const (asm_out_file, x);
9660 break;
9662 /* fall through */
9664 case CONST_VECTOR:
9665 if (!const_vec_duplicate_p (x, &elt))
9667 output_operand_lossage ("invalid vector constant");
9668 return;
9671 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9672 asm_fprintf (f, "%wd", INTVAL (elt));
9673 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9674 && aarch64_print_vector_float_operand (f, x, false))
9676 else
9678 output_operand_lossage ("invalid vector constant");
9679 return;
9681 break;
9683 case CONST_DOUBLE:
9684 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9685 be getting CONST_DOUBLEs holding integers. */
9686 gcc_assert (GET_MODE (x) != VOIDmode);
9687 if (aarch64_float_const_zero_rtx_p (x))
9689 fputc ('0', f);
9690 break;
9692 else if (aarch64_float_const_representable_p (x))
9694 #define buf_size 20
9695 char float_buf[buf_size] = {'\0'};
9696 real_to_decimal_for_mode (float_buf,
9697 CONST_DOUBLE_REAL_VALUE (x),
9698 buf_size, buf_size,
9699 1, GET_MODE (x));
9700 asm_fprintf (asm_out_file, "%s", float_buf);
9701 break;
9702 #undef buf_size
9704 output_operand_lossage ("invalid constant");
9705 return;
9706 default:
9707 output_operand_lossage ("invalid operand");
9708 return;
9710 break;
9712 case 'A':
9713 if (GET_CODE (x) == HIGH)
9714 x = XEXP (x, 0);
9716 switch (aarch64_classify_symbolic_expression (x))
9718 case SYMBOL_SMALL_GOT_4G:
9719 asm_fprintf (asm_out_file, ":got:");
9720 break;
9722 case SYMBOL_SMALL_TLSGD:
9723 asm_fprintf (asm_out_file, ":tlsgd:");
9724 break;
9726 case SYMBOL_SMALL_TLSDESC:
9727 asm_fprintf (asm_out_file, ":tlsdesc:");
9728 break;
9730 case SYMBOL_SMALL_TLSIE:
9731 asm_fprintf (asm_out_file, ":gottprel:");
9732 break;
9734 case SYMBOL_TLSLE24:
9735 asm_fprintf (asm_out_file, ":tprel:");
9736 break;
9738 case SYMBOL_TINY_GOT:
9739 gcc_unreachable ();
9740 break;
9742 default:
9743 break;
9745 output_addr_const (asm_out_file, x);
9746 break;
9748 case 'L':
9749 switch (aarch64_classify_symbolic_expression (x))
9751 case SYMBOL_SMALL_GOT_4G:
9752 asm_fprintf (asm_out_file, ":lo12:");
9753 break;
9755 case SYMBOL_SMALL_TLSGD:
9756 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9757 break;
9759 case SYMBOL_SMALL_TLSDESC:
9760 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9761 break;
9763 case SYMBOL_SMALL_TLSIE:
9764 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9765 break;
9767 case SYMBOL_TLSLE12:
9768 asm_fprintf (asm_out_file, ":tprel_lo12:");
9769 break;
9771 case SYMBOL_TLSLE24:
9772 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9773 break;
9775 case SYMBOL_TINY_GOT:
9776 asm_fprintf (asm_out_file, ":got:");
9777 break;
9779 case SYMBOL_TINY_TLSIE:
9780 asm_fprintf (asm_out_file, ":gottprel:");
9781 break;
9783 default:
9784 break;
9786 output_addr_const (asm_out_file, x);
9787 break;
9789 case 'G':
9790 switch (aarch64_classify_symbolic_expression (x))
9792 case SYMBOL_TLSLE24:
9793 asm_fprintf (asm_out_file, ":tprel_hi12:");
9794 break;
9795 default:
9796 break;
9798 output_addr_const (asm_out_file, x);
9799 break;
9801 case 'k':
9803 HOST_WIDE_INT cond_code;
9805 if (!CONST_INT_P (x))
9807 output_operand_lossage ("invalid operand for '%%%c'", code);
9808 return;
9811 cond_code = INTVAL (x);
9812 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9813 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9815 break;
9817 case 'y':
9818 case 'z':
9820 machine_mode mode = GET_MODE (x);
9822 if (GET_CODE (x) != MEM
9823 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9825 output_operand_lossage ("invalid operand for '%%%c'", code);
9826 return;
9829 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9830 code == 'y'
9831 ? ADDR_QUERY_LDP_STP_N
9832 : ADDR_QUERY_LDP_STP))
9833 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9835 break;
9837 default:
9838 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9839 return;
9843 /* Print address 'x' of a memory access with mode 'mode'.
9844 'op' is the context required by aarch64_classify_address. It can either be
9845 MEM for a normal memory access or PARALLEL for LDP/STP. */
9846 static bool
9847 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9848 aarch64_addr_query_type type)
9850 struct aarch64_address_info addr;
9851 unsigned int size, vec_flags;
9853 /* Check all addresses are Pmode - including ILP32. */
9854 if (GET_MODE (x) != Pmode
9855 && (!CONST_INT_P (x)
9856 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9858 output_operand_lossage ("invalid address mode");
9859 return false;
9862 if (aarch64_classify_address (&addr, x, mode, true, type))
9863 switch (addr.type)
9865 case ADDRESS_REG_IMM:
9866 if (known_eq (addr.const_offset, 0))
9868 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9869 return true;
9872 vec_flags = aarch64_classify_vector_mode (mode);
9873 if (vec_flags & VEC_ANY_SVE)
9875 HOST_WIDE_INT vnum
9876 = exact_div (addr.const_offset,
9877 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9878 asm_fprintf (f, "[%s, #%wd, mul vl]",
9879 reg_names[REGNO (addr.base)], vnum);
9880 return true;
9883 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9884 INTVAL (addr.offset));
9885 return true;
9887 case ADDRESS_REG_REG:
9888 if (addr.shift == 0)
9889 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9890 reg_names [REGNO (addr.offset)]);
9891 else
9892 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9893 reg_names [REGNO (addr.offset)], addr.shift);
9894 return true;
9896 case ADDRESS_REG_UXTW:
9897 if (addr.shift == 0)
9898 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9899 REGNO (addr.offset) - R0_REGNUM);
9900 else
9901 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9902 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9903 return true;
9905 case ADDRESS_REG_SXTW:
9906 if (addr.shift == 0)
9907 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9908 REGNO (addr.offset) - R0_REGNUM);
9909 else
9910 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9911 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9912 return true;
9914 case ADDRESS_REG_WB:
9915 /* Writeback is only supported for fixed-width modes. */
9916 size = GET_MODE_SIZE (mode).to_constant ();
9917 switch (GET_CODE (x))
9919 case PRE_INC:
9920 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9921 return true;
9922 case POST_INC:
9923 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9924 return true;
9925 case PRE_DEC:
9926 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9927 return true;
9928 case POST_DEC:
9929 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9930 return true;
9931 case PRE_MODIFY:
9932 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9933 INTVAL (addr.offset));
9934 return true;
9935 case POST_MODIFY:
9936 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9937 INTVAL (addr.offset));
9938 return true;
9939 default:
9940 break;
9942 break;
9944 case ADDRESS_LO_SUM:
9945 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9946 output_addr_const (f, addr.offset);
9947 asm_fprintf (f, "]");
9948 return true;
9950 case ADDRESS_SYMBOLIC:
9951 output_addr_const (f, x);
9952 return true;
9955 return false;
9958 /* Print address 'x' of a memory access with mode 'mode'. */
9959 static void
9960 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9962 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9963 output_addr_const (f, x);
9966 bool
9967 aarch64_label_mentioned_p (rtx x)
9969 const char *fmt;
9970 int i;
9972 if (GET_CODE (x) == LABEL_REF)
9973 return true;
9975 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9976 referencing instruction, but they are constant offsets, not
9977 symbols. */
9978 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9979 return false;
9981 fmt = GET_RTX_FORMAT (GET_CODE (x));
9982 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9984 if (fmt[i] == 'E')
9986 int j;
9988 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9989 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9990 return 1;
9992 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9993 return 1;
9996 return 0;
9999 /* Implement REGNO_REG_CLASS. */
10001 enum reg_class
10002 aarch64_regno_regclass (unsigned regno)
10004 if (GP_REGNUM_P (regno))
10005 return GENERAL_REGS;
10007 if (regno == SP_REGNUM)
10008 return STACK_REG;
10010 if (regno == FRAME_POINTER_REGNUM
10011 || regno == ARG_POINTER_REGNUM)
10012 return POINTER_REGS;
10014 if (FP_REGNUM_P (regno))
10015 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10016 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10018 if (PR_REGNUM_P (regno))
10019 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10021 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10022 return FFR_REGS;
10024 return NO_REGS;
10027 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10028 If OFFSET is out of range, return an offset of an anchor point
10029 that is in range. Return 0 otherwise. */
10031 static HOST_WIDE_INT
10032 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10033 machine_mode mode)
10035 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10036 if (size > 16)
10037 return (offset + 0x400) & ~0x7f0;
10039 /* For offsets that aren't a multiple of the access size, the limit is
10040 -256...255. */
10041 if (offset & (size - 1))
10043 /* BLKmode typically uses LDP of X-registers. */
10044 if (mode == BLKmode)
10045 return (offset + 512) & ~0x3ff;
10046 return (offset + 0x100) & ~0x1ff;
10049 /* Small negative offsets are supported. */
10050 if (IN_RANGE (offset, -256, 0))
10051 return 0;
10053 if (mode == TImode || mode == TFmode)
10054 return (offset + 0x100) & ~0x1ff;
10056 /* Use 12-bit offset by access size. */
10057 return offset & (~0xfff * size);
10060 static rtx
10061 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10063 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10064 where mask is selected by alignment and size of the offset.
10065 We try to pick as large a range for the offset as possible to
10066 maximize the chance of a CSE. However, for aligned addresses
10067 we limit the range to 4k so that structures with different sized
10068 elements are likely to use the same base. We need to be careful
10069 not to split a CONST for some forms of address expression, otherwise
10070 it will generate sub-optimal code. */
10072 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10074 rtx base = XEXP (x, 0);
10075 rtx offset_rtx = XEXP (x, 1);
10076 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10078 if (GET_CODE (base) == PLUS)
10080 rtx op0 = XEXP (base, 0);
10081 rtx op1 = XEXP (base, 1);
10083 /* Force any scaling into a temp for CSE. */
10084 op0 = force_reg (Pmode, op0);
10085 op1 = force_reg (Pmode, op1);
10087 /* Let the pointer register be in op0. */
10088 if (REG_POINTER (op1))
10089 std::swap (op0, op1);
10091 /* If the pointer is virtual or frame related, then we know that
10092 virtual register instantiation or register elimination is going
10093 to apply a second constant. We want the two constants folded
10094 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10095 if (virt_or_elim_regno_p (REGNO (op0)))
10097 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10098 NULL_RTX, true, OPTAB_DIRECT);
10099 return gen_rtx_PLUS (Pmode, base, op1);
10102 /* Otherwise, in order to encourage CSE (and thence loop strength
10103 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10104 base = expand_binop (Pmode, add_optab, op0, op1,
10105 NULL_RTX, true, OPTAB_DIRECT);
10106 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10109 HOST_WIDE_INT size;
10110 if (GET_MODE_SIZE (mode).is_constant (&size))
10112 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10113 mode);
10114 if (base_offset != 0)
10116 base = plus_constant (Pmode, base, base_offset);
10117 base = force_operand (base, NULL_RTX);
10118 return plus_constant (Pmode, base, offset - base_offset);
10123 return x;
10126 static reg_class_t
10127 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10128 reg_class_t rclass,
10129 machine_mode mode,
10130 secondary_reload_info *sri)
10132 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10133 LDR and STR. See the comment at the head of aarch64-sve.md for
10134 more details about the big-endian handling. */
10135 if (reg_class_subset_p (rclass, FP_REGS)
10136 && !((REG_P (x) && HARD_REGISTER_P (x))
10137 || aarch64_simd_valid_immediate (x, NULL))
10138 && mode != VNx16QImode)
10140 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10141 if ((vec_flags & VEC_SVE_DATA)
10142 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10144 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10145 return NO_REGS;
10149 /* If we have to disable direct literal pool loads and stores because the
10150 function is too big, then we need a scratch register. */
10151 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10152 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10153 || targetm.vector_mode_supported_p (GET_MODE (x)))
10154 && !aarch64_pcrelative_literal_loads)
10156 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10157 return NO_REGS;
10160 /* Without the TARGET_SIMD instructions we cannot move a Q register
10161 to a Q register directly. We need a scratch. */
10162 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10163 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10164 && reg_class_subset_p (rclass, FP_REGS))
10166 sri->icode = code_for_aarch64_reload_mov (mode);
10167 return NO_REGS;
10170 /* A TFmode or TImode memory access should be handled via an FP_REGS
10171 because AArch64 has richer addressing modes for LDR/STR instructions
10172 than LDP/STP instructions. */
10173 if (TARGET_FLOAT && rclass == GENERAL_REGS
10174 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10175 return FP_REGS;
10177 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10178 return GENERAL_REGS;
10180 return NO_REGS;
10183 static bool
10184 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10186 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10188 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10189 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10190 if (frame_pointer_needed)
10191 return to == HARD_FRAME_POINTER_REGNUM;
10192 return true;
10195 poly_int64
10196 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10198 if (to == HARD_FRAME_POINTER_REGNUM)
10200 if (from == ARG_POINTER_REGNUM)
10201 return cfun->machine->frame.hard_fp_offset;
10203 if (from == FRAME_POINTER_REGNUM)
10204 return cfun->machine->frame.hard_fp_offset
10205 - cfun->machine->frame.locals_offset;
10208 if (to == STACK_POINTER_REGNUM)
10210 if (from == FRAME_POINTER_REGNUM)
10211 return cfun->machine->frame.frame_size
10212 - cfun->machine->frame.locals_offset;
10215 return cfun->machine->frame.frame_size;
10218 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10219 previous frame. */
10222 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10224 if (count != 0)
10225 return const0_rtx;
10226 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10230 static void
10231 aarch64_asm_trampoline_template (FILE *f)
10233 int offset1 = 16;
10234 int offset2 = 20;
10236 if (aarch64_bti_enabled ())
10238 asm_fprintf (f, "\thint\t34 // bti c\n");
10239 offset1 -= 4;
10240 offset2 -= 4;
10243 if (TARGET_ILP32)
10245 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10246 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10247 offset1);
10249 else
10251 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10252 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10253 offset2);
10255 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10257 /* The trampoline needs an extra padding instruction. In case if BTI is
10258 enabled the padding instruction is replaced by the BTI instruction at
10259 the beginning. */
10260 if (!aarch64_bti_enabled ())
10261 assemble_aligned_integer (4, const0_rtx);
10263 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10264 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10267 static void
10268 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10270 rtx fnaddr, mem, a_tramp;
10271 const int tramp_code_sz = 16;
10273 /* Don't need to copy the trailing D-words, we fill those in below. */
10274 emit_block_move (m_tramp, assemble_trampoline_template (),
10275 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10276 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10277 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10278 if (GET_MODE (fnaddr) != ptr_mode)
10279 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10280 emit_move_insn (mem, fnaddr);
10282 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10283 emit_move_insn (mem, chain_value);
10285 /* XXX We should really define a "clear_cache" pattern and use
10286 gen_clear_cache(). */
10287 a_tramp = XEXP (m_tramp, 0);
10288 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10289 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10290 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10291 ptr_mode);
10294 static unsigned char
10295 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10297 /* ??? Logically we should only need to provide a value when
10298 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10299 can hold MODE, but at the moment we need to handle all modes.
10300 Just ignore any runtime parts for registers that can't store them. */
10301 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10302 unsigned int nregs, vec_flags;
10303 switch (regclass)
10305 case TAILCALL_ADDR_REGS:
10306 case POINTER_REGS:
10307 case GENERAL_REGS:
10308 case ALL_REGS:
10309 case POINTER_AND_FP_REGS:
10310 case FP_REGS:
10311 case FP_LO_REGS:
10312 case FP_LO8_REGS:
10313 vec_flags = aarch64_classify_vector_mode (mode);
10314 if ((vec_flags & VEC_SVE_DATA)
10315 && constant_multiple_p (GET_MODE_SIZE (mode),
10316 aarch64_vl_bytes (mode, vec_flags), &nregs))
10317 return nregs;
10318 return (vec_flags & VEC_ADVSIMD
10319 ? CEIL (lowest_size, UNITS_PER_VREG)
10320 : CEIL (lowest_size, UNITS_PER_WORD));
10321 case STACK_REG:
10322 case PR_REGS:
10323 case PR_LO_REGS:
10324 case PR_HI_REGS:
10325 case FFR_REGS:
10326 case PR_AND_FFR_REGS:
10327 return 1;
10329 case NO_REGS:
10330 return 0;
10332 default:
10333 break;
10335 gcc_unreachable ();
10338 static reg_class_t
10339 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10341 if (regclass == POINTER_REGS)
10342 return GENERAL_REGS;
10344 if (regclass == STACK_REG)
10346 if (REG_P(x)
10347 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10348 return regclass;
10350 return NO_REGS;
10353 /* Register eliminiation can result in a request for
10354 SP+constant->FP_REGS. We cannot support such operations which
10355 use SP as source and an FP_REG as destination, so reject out
10356 right now. */
10357 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10359 rtx lhs = XEXP (x, 0);
10361 /* Look through a possible SUBREG introduced by ILP32. */
10362 if (GET_CODE (lhs) == SUBREG)
10363 lhs = SUBREG_REG (lhs);
10365 gcc_assert (REG_P (lhs));
10366 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10367 POINTER_REGS));
10368 return NO_REGS;
10371 return regclass;
10374 void
10375 aarch64_asm_output_labelref (FILE* f, const char *name)
10377 asm_fprintf (f, "%U%s", name);
10380 static void
10381 aarch64_elf_asm_constructor (rtx symbol, int priority)
10383 if (priority == DEFAULT_INIT_PRIORITY)
10384 default_ctor_section_asm_out_constructor (symbol, priority);
10385 else
10387 section *s;
10388 /* While priority is known to be in range [0, 65535], so 18 bytes
10389 would be enough, the compiler might not know that. To avoid
10390 -Wformat-truncation false positive, use a larger size. */
10391 char buf[23];
10392 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10393 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10394 switch_to_section (s);
10395 assemble_align (POINTER_SIZE);
10396 assemble_aligned_integer (POINTER_BYTES, symbol);
10400 static void
10401 aarch64_elf_asm_destructor (rtx symbol, int priority)
10403 if (priority == DEFAULT_INIT_PRIORITY)
10404 default_dtor_section_asm_out_destructor (symbol, priority);
10405 else
10407 section *s;
10408 /* While priority is known to be in range [0, 65535], so 18 bytes
10409 would be enough, the compiler might not know that. To avoid
10410 -Wformat-truncation false positive, use a larger size. */
10411 char buf[23];
10412 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10413 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10414 switch_to_section (s);
10415 assemble_align (POINTER_SIZE);
10416 assemble_aligned_integer (POINTER_BYTES, symbol);
10420 const char*
10421 aarch64_output_casesi (rtx *operands)
10423 char buf[100];
10424 char label[100];
10425 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10426 int index;
10427 static const char *const patterns[4][2] =
10430 "ldrb\t%w3, [%0,%w1,uxtw]",
10431 "add\t%3, %4, %w3, sxtb #2"
10434 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10435 "add\t%3, %4, %w3, sxth #2"
10438 "ldr\t%w3, [%0,%w1,uxtw #2]",
10439 "add\t%3, %4, %w3, sxtw #2"
10441 /* We assume that DImode is only generated when not optimizing and
10442 that we don't really need 64-bit address offsets. That would
10443 imply an object file with 8GB of code in a single function! */
10445 "ldr\t%w3, [%0,%w1,uxtw #2]",
10446 "add\t%3, %4, %w3, sxtw #2"
10450 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10452 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10453 index = exact_log2 (GET_MODE_SIZE (mode));
10455 gcc_assert (index >= 0 && index <= 3);
10457 /* Need to implement table size reduction, by chaning the code below. */
10458 output_asm_insn (patterns[index][0], operands);
10459 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10460 snprintf (buf, sizeof (buf),
10461 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10462 output_asm_insn (buf, operands);
10463 output_asm_insn (patterns[index][1], operands);
10464 output_asm_insn ("br\t%3", operands);
10465 assemble_label (asm_out_file, label);
10466 return "";
10470 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10471 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10472 operator. */
10475 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10477 if (shift >= 0 && shift <= 3)
10479 int size;
10480 for (size = 8; size <= 32; size *= 2)
10482 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10483 if (mask == bits << shift)
10484 return size;
10487 return 0;
10490 /* Constant pools are per function only when PC relative
10491 literal loads are true or we are in the large memory
10492 model. */
10494 static inline bool
10495 aarch64_can_use_per_function_literal_pools_p (void)
10497 return (aarch64_pcrelative_literal_loads
10498 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10501 static bool
10502 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10504 /* We can't use blocks for constants when we're using a per-function
10505 constant pool. */
10506 return !aarch64_can_use_per_function_literal_pools_p ();
10509 /* Select appropriate section for constants depending
10510 on where we place literal pools. */
10512 static section *
10513 aarch64_select_rtx_section (machine_mode mode,
10514 rtx x,
10515 unsigned HOST_WIDE_INT align)
10517 if (aarch64_can_use_per_function_literal_pools_p ())
10518 return function_section (current_function_decl);
10520 return default_elf_select_rtx_section (mode, x, align);
10523 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10524 void
10525 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10526 HOST_WIDE_INT offset)
10528 /* When using per-function literal pools, we must ensure that any code
10529 section is aligned to the minimal instruction length, lest we get
10530 errors from the assembler re "unaligned instructions". */
10531 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10532 ASM_OUTPUT_ALIGN (f, 2);
10535 /* Costs. */
10537 /* Helper function for rtx cost calculation. Strip a shift expression
10538 from X. Returns the inner operand if successful, or the original
10539 expression on failure. */
10540 static rtx
10541 aarch64_strip_shift (rtx x)
10543 rtx op = x;
10545 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10546 we can convert both to ROR during final output. */
10547 if ((GET_CODE (op) == ASHIFT
10548 || GET_CODE (op) == ASHIFTRT
10549 || GET_CODE (op) == LSHIFTRT
10550 || GET_CODE (op) == ROTATERT
10551 || GET_CODE (op) == ROTATE)
10552 && CONST_INT_P (XEXP (op, 1)))
10553 return XEXP (op, 0);
10555 if (GET_CODE (op) == MULT
10556 && CONST_INT_P (XEXP (op, 1))
10557 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10558 return XEXP (op, 0);
10560 return x;
10563 /* Helper function for rtx cost calculation. Strip an extend
10564 expression from X. Returns the inner operand if successful, or the
10565 original expression on failure. We deal with a number of possible
10566 canonicalization variations here. If STRIP_SHIFT is true, then
10567 we can strip off a shift also. */
10568 static rtx
10569 aarch64_strip_extend (rtx x, bool strip_shift)
10571 scalar_int_mode mode;
10572 rtx op = x;
10574 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10575 return op;
10577 /* Zero and sign extraction of a widened value. */
10578 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10579 && XEXP (op, 2) == const0_rtx
10580 && GET_CODE (XEXP (op, 0)) == MULT
10581 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10582 XEXP (op, 1)))
10583 return XEXP (XEXP (op, 0), 0);
10585 /* It can also be represented (for zero-extend) as an AND with an
10586 immediate. */
10587 if (GET_CODE (op) == AND
10588 && GET_CODE (XEXP (op, 0)) == MULT
10589 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10590 && CONST_INT_P (XEXP (op, 1))
10591 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10592 INTVAL (XEXP (op, 1))) != 0)
10593 return XEXP (XEXP (op, 0), 0);
10595 /* Now handle extended register, as this may also have an optional
10596 left shift by 1..4. */
10597 if (strip_shift
10598 && GET_CODE (op) == ASHIFT
10599 && CONST_INT_P (XEXP (op, 1))
10600 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10601 op = XEXP (op, 0);
10603 if (GET_CODE (op) == ZERO_EXTEND
10604 || GET_CODE (op) == SIGN_EXTEND)
10605 op = XEXP (op, 0);
10607 if (op != x)
10608 return op;
10610 return x;
10613 /* Return true iff CODE is a shift supported in combination
10614 with arithmetic instructions. */
10616 static bool
10617 aarch64_shift_p (enum rtx_code code)
10619 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10623 /* Return true iff X is a cheap shift without a sign extend. */
10625 static bool
10626 aarch64_cheap_mult_shift_p (rtx x)
10628 rtx op0, op1;
10630 op0 = XEXP (x, 0);
10631 op1 = XEXP (x, 1);
10633 if (!(aarch64_tune_params.extra_tuning_flags
10634 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10635 return false;
10637 if (GET_CODE (op0) == SIGN_EXTEND)
10638 return false;
10640 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10641 && UINTVAL (op1) <= 4)
10642 return true;
10644 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10645 return false;
10647 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10649 if (l2 > 0 && l2 <= 4)
10650 return true;
10652 return false;
10655 /* Helper function for rtx cost calculation. Calculate the cost of
10656 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10657 Return the calculated cost of the expression, recursing manually in to
10658 operands where needed. */
10660 static int
10661 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10663 rtx op0, op1;
10664 const struct cpu_cost_table *extra_cost
10665 = aarch64_tune_params.insn_extra_cost;
10666 int cost = 0;
10667 bool compound_p = (outer == PLUS || outer == MINUS);
10668 machine_mode mode = GET_MODE (x);
10670 gcc_checking_assert (code == MULT);
10672 op0 = XEXP (x, 0);
10673 op1 = XEXP (x, 1);
10675 if (VECTOR_MODE_P (mode))
10676 mode = GET_MODE_INNER (mode);
10678 /* Integer multiply/fma. */
10679 if (GET_MODE_CLASS (mode) == MODE_INT)
10681 /* The multiply will be canonicalized as a shift, cost it as such. */
10682 if (aarch64_shift_p (GET_CODE (x))
10683 || (CONST_INT_P (op1)
10684 && exact_log2 (INTVAL (op1)) > 0))
10686 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10687 || GET_CODE (op0) == SIGN_EXTEND;
10688 if (speed)
10690 if (compound_p)
10692 /* If the shift is considered cheap,
10693 then don't add any cost. */
10694 if (aarch64_cheap_mult_shift_p (x))
10696 else if (REG_P (op1))
10697 /* ARITH + shift-by-register. */
10698 cost += extra_cost->alu.arith_shift_reg;
10699 else if (is_extend)
10700 /* ARITH + extended register. We don't have a cost field
10701 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10702 cost += extra_cost->alu.extend_arith;
10703 else
10704 /* ARITH + shift-by-immediate. */
10705 cost += extra_cost->alu.arith_shift;
10707 else
10708 /* LSL (immediate). */
10709 cost += extra_cost->alu.shift;
10712 /* Strip extends as we will have costed them in the case above. */
10713 if (is_extend)
10714 op0 = aarch64_strip_extend (op0, true);
10716 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10718 return cost;
10721 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
10722 compound and let the below cases handle it. After all, MNEG is a
10723 special-case alias of MSUB. */
10724 if (GET_CODE (op0) == NEG)
10726 op0 = XEXP (op0, 0);
10727 compound_p = true;
10730 /* Integer multiplies or FMAs have zero/sign extending variants. */
10731 if ((GET_CODE (op0) == ZERO_EXTEND
10732 && GET_CODE (op1) == ZERO_EXTEND)
10733 || (GET_CODE (op0) == SIGN_EXTEND
10734 && GET_CODE (op1) == SIGN_EXTEND))
10736 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10737 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10739 if (speed)
10741 if (compound_p)
10742 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10743 cost += extra_cost->mult[0].extend_add;
10744 else
10745 /* MUL/SMULL/UMULL. */
10746 cost += extra_cost->mult[0].extend;
10749 return cost;
10752 /* This is either an integer multiply or a MADD. In both cases
10753 we want to recurse and cost the operands. */
10754 cost += rtx_cost (op0, mode, MULT, 0, speed);
10755 cost += rtx_cost (op1, mode, MULT, 1, speed);
10757 if (speed)
10759 if (compound_p)
10760 /* MADD/MSUB. */
10761 cost += extra_cost->mult[mode == DImode].add;
10762 else
10763 /* MUL. */
10764 cost += extra_cost->mult[mode == DImode].simple;
10767 return cost;
10769 else
10771 if (speed)
10773 /* Floating-point FMA/FMUL can also support negations of the
10774 operands, unless the rounding mode is upward or downward in
10775 which case FNMUL is different than FMUL with operand negation. */
10776 bool neg0 = GET_CODE (op0) == NEG;
10777 bool neg1 = GET_CODE (op1) == NEG;
10778 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10780 if (neg0)
10781 op0 = XEXP (op0, 0);
10782 if (neg1)
10783 op1 = XEXP (op1, 0);
10786 if (compound_p)
10787 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10788 cost += extra_cost->fp[mode == DFmode].fma;
10789 else
10790 /* FMUL/FNMUL. */
10791 cost += extra_cost->fp[mode == DFmode].mult;
10794 cost += rtx_cost (op0, mode, MULT, 0, speed);
10795 cost += rtx_cost (op1, mode, MULT, 1, speed);
10796 return cost;
10800 static int
10801 aarch64_address_cost (rtx x,
10802 machine_mode mode,
10803 addr_space_t as ATTRIBUTE_UNUSED,
10804 bool speed)
10806 enum rtx_code c = GET_CODE (x);
10807 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10808 struct aarch64_address_info info;
10809 int cost = 0;
10810 info.shift = 0;
10812 if (!aarch64_classify_address (&info, x, mode, false))
10814 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10816 /* This is a CONST or SYMBOL ref which will be split
10817 in a different way depending on the code model in use.
10818 Cost it through the generic infrastructure. */
10819 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10820 /* Divide through by the cost of one instruction to
10821 bring it to the same units as the address costs. */
10822 cost_symbol_ref /= COSTS_N_INSNS (1);
10823 /* The cost is then the cost of preparing the address,
10824 followed by an immediate (possibly 0) offset. */
10825 return cost_symbol_ref + addr_cost->imm_offset;
10827 else
10829 /* This is most likely a jump table from a case
10830 statement. */
10831 return addr_cost->register_offset;
10835 switch (info.type)
10837 case ADDRESS_LO_SUM:
10838 case ADDRESS_SYMBOLIC:
10839 case ADDRESS_REG_IMM:
10840 cost += addr_cost->imm_offset;
10841 break;
10843 case ADDRESS_REG_WB:
10844 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10845 cost += addr_cost->pre_modify;
10846 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10847 cost += addr_cost->post_modify;
10848 else
10849 gcc_unreachable ();
10851 break;
10853 case ADDRESS_REG_REG:
10854 cost += addr_cost->register_offset;
10855 break;
10857 case ADDRESS_REG_SXTW:
10858 cost += addr_cost->register_sextend;
10859 break;
10861 case ADDRESS_REG_UXTW:
10862 cost += addr_cost->register_zextend;
10863 break;
10865 default:
10866 gcc_unreachable ();
10870 if (info.shift > 0)
10872 /* For the sake of calculating the cost of the shifted register
10873 component, we can treat same sized modes in the same way. */
10874 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10875 cost += addr_cost->addr_scale_costs.hi;
10876 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10877 cost += addr_cost->addr_scale_costs.si;
10878 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10879 cost += addr_cost->addr_scale_costs.di;
10880 else
10881 /* We can't tell, or this is a 128-bit vector. */
10882 cost += addr_cost->addr_scale_costs.ti;
10885 return cost;
10888 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10889 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10890 to be taken. */
10893 aarch64_branch_cost (bool speed_p, bool predictable_p)
10895 /* When optimizing for speed, use the cost of unpredictable branches. */
10896 const struct cpu_branch_cost *branch_costs =
10897 aarch64_tune_params.branch_costs;
10899 if (!speed_p || predictable_p)
10900 return branch_costs->predictable;
10901 else
10902 return branch_costs->unpredictable;
10905 /* Return true if the RTX X in mode MODE is a zero or sign extract
10906 usable in an ADD or SUB (extended register) instruction. */
10907 static bool
10908 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10910 /* Catch add with a sign extract.
10911 This is add_<optab><mode>_multp2. */
10912 if (GET_CODE (x) == SIGN_EXTRACT
10913 || GET_CODE (x) == ZERO_EXTRACT)
10915 rtx op0 = XEXP (x, 0);
10916 rtx op1 = XEXP (x, 1);
10917 rtx op2 = XEXP (x, 2);
10919 if (GET_CODE (op0) == MULT
10920 && CONST_INT_P (op1)
10921 && op2 == const0_rtx
10922 && CONST_INT_P (XEXP (op0, 1))
10923 && aarch64_is_extend_from_extract (mode,
10924 XEXP (op0, 1),
10925 op1))
10927 return true;
10930 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10931 No shift. */
10932 else if (GET_CODE (x) == SIGN_EXTEND
10933 || GET_CODE (x) == ZERO_EXTEND)
10934 return REG_P (XEXP (x, 0));
10936 return false;
10939 static bool
10940 aarch64_frint_unspec_p (unsigned int u)
10942 switch (u)
10944 case UNSPEC_FRINTZ:
10945 case UNSPEC_FRINTP:
10946 case UNSPEC_FRINTM:
10947 case UNSPEC_FRINTA:
10948 case UNSPEC_FRINTN:
10949 case UNSPEC_FRINTX:
10950 case UNSPEC_FRINTI:
10951 return true;
10953 default:
10954 return false;
10958 /* Return true iff X is an rtx that will match an extr instruction
10959 i.e. as described in the *extr<mode>5_insn family of patterns.
10960 OP0 and OP1 will be set to the operands of the shifts involved
10961 on success and will be NULL_RTX otherwise. */
10963 static bool
10964 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10966 rtx op0, op1;
10967 scalar_int_mode mode;
10968 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10969 return false;
10971 *res_op0 = NULL_RTX;
10972 *res_op1 = NULL_RTX;
10974 if (GET_CODE (x) != IOR)
10975 return false;
10977 op0 = XEXP (x, 0);
10978 op1 = XEXP (x, 1);
10980 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10981 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10983 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10984 if (GET_CODE (op1) == ASHIFT)
10985 std::swap (op0, op1);
10987 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10988 return false;
10990 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10991 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10993 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10994 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10996 *res_op0 = XEXP (op0, 0);
10997 *res_op1 = XEXP (op1, 0);
10998 return true;
11002 return false;
11005 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11006 storing it in *COST. Result is true if the total cost of the operation
11007 has now been calculated. */
11008 static bool
11009 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11011 rtx inner;
11012 rtx comparator;
11013 enum rtx_code cmpcode;
11015 if (COMPARISON_P (op0))
11017 inner = XEXP (op0, 0);
11018 comparator = XEXP (op0, 1);
11019 cmpcode = GET_CODE (op0);
11021 else
11023 inner = op0;
11024 comparator = const0_rtx;
11025 cmpcode = NE;
11028 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11030 /* Conditional branch. */
11031 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11032 return true;
11033 else
11035 if (cmpcode == NE || cmpcode == EQ)
11037 if (comparator == const0_rtx)
11039 /* TBZ/TBNZ/CBZ/CBNZ. */
11040 if (GET_CODE (inner) == ZERO_EXTRACT)
11041 /* TBZ/TBNZ. */
11042 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11043 ZERO_EXTRACT, 0, speed);
11044 else
11045 /* CBZ/CBNZ. */
11046 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11048 return true;
11051 else if (cmpcode == LT || cmpcode == GE)
11053 /* TBZ/TBNZ. */
11054 if (comparator == const0_rtx)
11055 return true;
11059 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11061 /* CCMP. */
11062 if (GET_CODE (op1) == COMPARE)
11064 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11065 if (XEXP (op1, 1) == const0_rtx)
11066 *cost += 1;
11067 if (speed)
11069 machine_mode mode = GET_MODE (XEXP (op1, 0));
11070 const struct cpu_cost_table *extra_cost
11071 = aarch64_tune_params.insn_extra_cost;
11073 if (GET_MODE_CLASS (mode) == MODE_INT)
11074 *cost += extra_cost->alu.arith;
11075 else
11076 *cost += extra_cost->fp[mode == DFmode].compare;
11078 return true;
11081 /* It's a conditional operation based on the status flags,
11082 so it must be some flavor of CSEL. */
11084 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11085 if (GET_CODE (op1) == NEG
11086 || GET_CODE (op1) == NOT
11087 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11088 op1 = XEXP (op1, 0);
11089 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11091 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11092 op1 = XEXP (op1, 0);
11093 op2 = XEXP (op2, 0);
11096 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11097 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11098 return true;
11101 /* We don't know what this is, cost all operands. */
11102 return false;
11105 /* Check whether X is a bitfield operation of the form shift + extend that
11106 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11107 operand to which the bitfield operation is applied. Otherwise return
11108 NULL_RTX. */
11110 static rtx
11111 aarch64_extend_bitfield_pattern_p (rtx x)
11113 rtx_code outer_code = GET_CODE (x);
11114 machine_mode outer_mode = GET_MODE (x);
11116 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11117 && outer_mode != SImode && outer_mode != DImode)
11118 return NULL_RTX;
11120 rtx inner = XEXP (x, 0);
11121 rtx_code inner_code = GET_CODE (inner);
11122 machine_mode inner_mode = GET_MODE (inner);
11123 rtx op = NULL_RTX;
11125 switch (inner_code)
11127 case ASHIFT:
11128 if (CONST_INT_P (XEXP (inner, 1))
11129 && (inner_mode == QImode || inner_mode == HImode))
11130 op = XEXP (inner, 0);
11131 break;
11132 case LSHIFTRT:
11133 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11134 && (inner_mode == QImode || inner_mode == HImode))
11135 op = XEXP (inner, 0);
11136 break;
11137 case ASHIFTRT:
11138 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11139 && (inner_mode == QImode || inner_mode == HImode))
11140 op = XEXP (inner, 0);
11141 break;
11142 default:
11143 break;
11146 return op;
11149 /* Return true if the mask and a shift amount from an RTX of the form
11150 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11151 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11153 bool
11154 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11155 rtx shft_amnt)
11157 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11158 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11159 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11160 && (INTVAL (mask)
11161 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11164 /* Return true if the masks and a shift amount from an RTX of the form
11165 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11166 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
11168 bool
11169 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11170 unsigned HOST_WIDE_INT mask1,
11171 unsigned HOST_WIDE_INT shft_amnt,
11172 unsigned HOST_WIDE_INT mask2)
11174 unsigned HOST_WIDE_INT t;
11176 /* Verify that there is no overlap in what bits are set in the two masks. */
11177 if (mask1 != ~mask2)
11178 return false;
11180 /* Verify that mask2 is not all zeros or ones. */
11181 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11182 return false;
11184 /* The shift amount should always be less than the mode size. */
11185 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11187 /* Verify that the mask being shifted is contiguous and would be in the
11188 least significant bits after shifting by shft_amnt. */
11189 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11190 return (t == (t & -t));
11193 /* Calculate the cost of calculating X, storing it in *COST. Result
11194 is true if the total cost of the operation has now been calculated. */
11195 static bool
11196 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11197 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11199 rtx op0, op1, op2;
11200 const struct cpu_cost_table *extra_cost
11201 = aarch64_tune_params.insn_extra_cost;
11202 int code = GET_CODE (x);
11203 scalar_int_mode int_mode;
11205 /* By default, assume that everything has equivalent cost to the
11206 cheapest instruction. Any additional costs are applied as a delta
11207 above this default. */
11208 *cost = COSTS_N_INSNS (1);
11210 switch (code)
11212 case SET:
11213 /* The cost depends entirely on the operands to SET. */
11214 *cost = 0;
11215 op0 = SET_DEST (x);
11216 op1 = SET_SRC (x);
11218 switch (GET_CODE (op0))
11220 case MEM:
11221 if (speed)
11223 rtx address = XEXP (op0, 0);
11224 if (VECTOR_MODE_P (mode))
11225 *cost += extra_cost->ldst.storev;
11226 else if (GET_MODE_CLASS (mode) == MODE_INT)
11227 *cost += extra_cost->ldst.store;
11228 else if (mode == SFmode)
11229 *cost += extra_cost->ldst.storef;
11230 else if (mode == DFmode)
11231 *cost += extra_cost->ldst.stored;
11233 *cost +=
11234 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11235 0, speed));
11238 *cost += rtx_cost (op1, mode, SET, 1, speed);
11239 return true;
11241 case SUBREG:
11242 if (! REG_P (SUBREG_REG (op0)))
11243 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11245 /* Fall through. */
11246 case REG:
11247 /* The cost is one per vector-register copied. */
11248 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11250 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11251 *cost = COSTS_N_INSNS (nregs);
11253 /* const0_rtx is in general free, but we will use an
11254 instruction to set a register to 0. */
11255 else if (REG_P (op1) || op1 == const0_rtx)
11257 /* The cost is 1 per register copied. */
11258 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11259 *cost = COSTS_N_INSNS (nregs);
11261 else
11262 /* Cost is just the cost of the RHS of the set. */
11263 *cost += rtx_cost (op1, mode, SET, 1, speed);
11264 return true;
11266 case ZERO_EXTRACT:
11267 case SIGN_EXTRACT:
11268 /* Bit-field insertion. Strip any redundant widening of
11269 the RHS to meet the width of the target. */
11270 if (GET_CODE (op1) == SUBREG)
11271 op1 = SUBREG_REG (op1);
11272 if ((GET_CODE (op1) == ZERO_EXTEND
11273 || GET_CODE (op1) == SIGN_EXTEND)
11274 && CONST_INT_P (XEXP (op0, 1))
11275 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11276 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11277 op1 = XEXP (op1, 0);
11279 if (CONST_INT_P (op1))
11281 /* MOV immediate is assumed to always be cheap. */
11282 *cost = COSTS_N_INSNS (1);
11284 else
11286 /* BFM. */
11287 if (speed)
11288 *cost += extra_cost->alu.bfi;
11289 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11292 return true;
11294 default:
11295 /* We can't make sense of this, assume default cost. */
11296 *cost = COSTS_N_INSNS (1);
11297 return false;
11299 return false;
11301 case CONST_INT:
11302 /* If an instruction can incorporate a constant within the
11303 instruction, the instruction's expression avoids calling
11304 rtx_cost() on the constant. If rtx_cost() is called on a
11305 constant, then it is usually because the constant must be
11306 moved into a register by one or more instructions.
11308 The exception is constant 0, which can be expressed
11309 as XZR/WZR and is therefore free. The exception to this is
11310 if we have (set (reg) (const0_rtx)) in which case we must cost
11311 the move. However, we can catch that when we cost the SET, so
11312 we don't need to consider that here. */
11313 if (x == const0_rtx)
11314 *cost = 0;
11315 else
11317 /* To an approximation, building any other constant is
11318 proportionally expensive to the number of instructions
11319 required to build that constant. This is true whether we
11320 are compiling for SPEED or otherwise. */
11321 if (!is_a <scalar_int_mode> (mode, &int_mode))
11322 int_mode = word_mode;
11323 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11324 (NULL_RTX, x, false, int_mode));
11326 return true;
11328 case CONST_DOUBLE:
11330 /* First determine number of instructions to do the move
11331 as an integer constant. */
11332 if (!aarch64_float_const_representable_p (x)
11333 && !aarch64_can_const_movi_rtx_p (x, mode)
11334 && aarch64_float_const_rtx_p (x))
11336 unsigned HOST_WIDE_INT ival;
11337 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11338 gcc_assert (succeed);
11340 scalar_int_mode imode = (mode == HFmode
11341 ? SImode
11342 : int_mode_for_mode (mode).require ());
11343 int ncost = aarch64_internal_mov_immediate
11344 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11345 *cost += COSTS_N_INSNS (ncost);
11346 return true;
11349 if (speed)
11351 /* mov[df,sf]_aarch64. */
11352 if (aarch64_float_const_representable_p (x))
11353 /* FMOV (scalar immediate). */
11354 *cost += extra_cost->fp[mode == DFmode].fpconst;
11355 else if (!aarch64_float_const_zero_rtx_p (x))
11357 /* This will be a load from memory. */
11358 if (mode == DFmode)
11359 *cost += extra_cost->ldst.loadd;
11360 else
11361 *cost += extra_cost->ldst.loadf;
11363 else
11364 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11365 or MOV v0.s[0], wzr - neither of which are modeled by the
11366 cost tables. Just use the default cost. */
11371 return true;
11373 case MEM:
11374 if (speed)
11376 /* For loads we want the base cost of a load, plus an
11377 approximation for the additional cost of the addressing
11378 mode. */
11379 rtx address = XEXP (x, 0);
11380 if (VECTOR_MODE_P (mode))
11381 *cost += extra_cost->ldst.loadv;
11382 else if (GET_MODE_CLASS (mode) == MODE_INT)
11383 *cost += extra_cost->ldst.load;
11384 else if (mode == SFmode)
11385 *cost += extra_cost->ldst.loadf;
11386 else if (mode == DFmode)
11387 *cost += extra_cost->ldst.loadd;
11389 *cost +=
11390 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11391 0, speed));
11394 return true;
11396 case NEG:
11397 op0 = XEXP (x, 0);
11399 if (VECTOR_MODE_P (mode))
11401 if (speed)
11403 /* FNEG. */
11404 *cost += extra_cost->vect.alu;
11406 return false;
11409 if (GET_MODE_CLASS (mode) == MODE_INT)
11411 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11412 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11414 /* CSETM. */
11415 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11416 return true;
11419 /* Cost this as SUB wzr, X. */
11420 op0 = CONST0_RTX (mode);
11421 op1 = XEXP (x, 0);
11422 goto cost_minus;
11425 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11427 /* Support (neg(fma...)) as a single instruction only if
11428 sign of zeros is unimportant. This matches the decision
11429 making in aarch64.md. */
11430 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11432 /* FNMADD. */
11433 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11434 return true;
11436 if (GET_CODE (op0) == MULT)
11438 /* FNMUL. */
11439 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11440 return true;
11442 if (speed)
11443 /* FNEG. */
11444 *cost += extra_cost->fp[mode == DFmode].neg;
11445 return false;
11448 return false;
11450 case CLRSB:
11451 case CLZ:
11452 if (speed)
11454 if (VECTOR_MODE_P (mode))
11455 *cost += extra_cost->vect.alu;
11456 else
11457 *cost += extra_cost->alu.clz;
11460 return false;
11462 case COMPARE:
11463 op0 = XEXP (x, 0);
11464 op1 = XEXP (x, 1);
11466 if (op1 == const0_rtx
11467 && GET_CODE (op0) == AND)
11469 x = op0;
11470 mode = GET_MODE (op0);
11471 goto cost_logic;
11474 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11476 /* TODO: A write to the CC flags possibly costs extra, this
11477 needs encoding in the cost tables. */
11479 mode = GET_MODE (op0);
11480 /* ANDS. */
11481 if (GET_CODE (op0) == AND)
11483 x = op0;
11484 goto cost_logic;
11487 if (GET_CODE (op0) == PLUS)
11489 /* ADDS (and CMN alias). */
11490 x = op0;
11491 goto cost_plus;
11494 if (GET_CODE (op0) == MINUS)
11496 /* SUBS. */
11497 x = op0;
11498 goto cost_minus;
11501 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11502 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11503 && CONST_INT_P (XEXP (op0, 2)))
11505 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11506 Handle it here directly rather than going to cost_logic
11507 since we know the immediate generated for the TST is valid
11508 so we can avoid creating an intermediate rtx for it only
11509 for costing purposes. */
11510 if (speed)
11511 *cost += extra_cost->alu.logical;
11513 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11514 ZERO_EXTRACT, 0, speed);
11515 return true;
11518 if (GET_CODE (op1) == NEG)
11520 /* CMN. */
11521 if (speed)
11522 *cost += extra_cost->alu.arith;
11524 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11525 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11526 return true;
11529 /* CMP.
11531 Compare can freely swap the order of operands, and
11532 canonicalization puts the more complex operation first.
11533 But the integer MINUS logic expects the shift/extend
11534 operation in op1. */
11535 if (! (REG_P (op0)
11536 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11538 op0 = XEXP (x, 1);
11539 op1 = XEXP (x, 0);
11541 goto cost_minus;
11544 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11546 /* FCMP. */
11547 if (speed)
11548 *cost += extra_cost->fp[mode == DFmode].compare;
11550 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11552 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11553 /* FCMP supports constant 0.0 for no extra cost. */
11554 return true;
11556 return false;
11559 if (VECTOR_MODE_P (mode))
11561 /* Vector compare. */
11562 if (speed)
11563 *cost += extra_cost->vect.alu;
11565 if (aarch64_float_const_zero_rtx_p (op1))
11567 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11568 cost. */
11569 return true;
11571 return false;
11573 return false;
11575 case MINUS:
11577 op0 = XEXP (x, 0);
11578 op1 = XEXP (x, 1);
11580 cost_minus:
11581 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11583 /* Detect valid immediates. */
11584 if ((GET_MODE_CLASS (mode) == MODE_INT
11585 || (GET_MODE_CLASS (mode) == MODE_CC
11586 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11587 && CONST_INT_P (op1)
11588 && aarch64_uimm12_shift (INTVAL (op1)))
11590 if (speed)
11591 /* SUB(S) (immediate). */
11592 *cost += extra_cost->alu.arith;
11593 return true;
11596 /* Look for SUB (extended register). */
11597 if (is_a <scalar_int_mode> (mode, &int_mode)
11598 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11600 if (speed)
11601 *cost += extra_cost->alu.extend_arith;
11603 op1 = aarch64_strip_extend (op1, true);
11604 *cost += rtx_cost (op1, VOIDmode,
11605 (enum rtx_code) GET_CODE (op1), 0, speed);
11606 return true;
11609 rtx new_op1 = aarch64_strip_extend (op1, false);
11611 /* Cost this as an FMA-alike operation. */
11612 if ((GET_CODE (new_op1) == MULT
11613 || aarch64_shift_p (GET_CODE (new_op1)))
11614 && code != COMPARE)
11616 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11617 (enum rtx_code) code,
11618 speed);
11619 return true;
11622 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11624 if (speed)
11626 if (VECTOR_MODE_P (mode))
11628 /* Vector SUB. */
11629 *cost += extra_cost->vect.alu;
11631 else if (GET_MODE_CLASS (mode) == MODE_INT)
11633 /* SUB(S). */
11634 *cost += extra_cost->alu.arith;
11636 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11638 /* FSUB. */
11639 *cost += extra_cost->fp[mode == DFmode].addsub;
11642 return true;
11645 case PLUS:
11647 rtx new_op0;
11649 op0 = XEXP (x, 0);
11650 op1 = XEXP (x, 1);
11652 cost_plus:
11653 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11654 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11656 /* CSINC. */
11657 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11658 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11659 return true;
11662 if (GET_MODE_CLASS (mode) == MODE_INT
11663 && (aarch64_plus_immediate (op1, mode)
11664 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11666 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11668 if (speed)
11669 /* ADD (immediate). */
11670 *cost += extra_cost->alu.arith;
11671 return true;
11674 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11676 /* Look for ADD (extended register). */
11677 if (is_a <scalar_int_mode> (mode, &int_mode)
11678 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11680 if (speed)
11681 *cost += extra_cost->alu.extend_arith;
11683 op0 = aarch64_strip_extend (op0, true);
11684 *cost += rtx_cost (op0, VOIDmode,
11685 (enum rtx_code) GET_CODE (op0), 0, speed);
11686 return true;
11689 /* Strip any extend, leave shifts behind as we will
11690 cost them through mult_cost. */
11691 new_op0 = aarch64_strip_extend (op0, false);
11693 if (GET_CODE (new_op0) == MULT
11694 || aarch64_shift_p (GET_CODE (new_op0)))
11696 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11697 speed);
11698 return true;
11701 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11703 if (speed)
11705 if (VECTOR_MODE_P (mode))
11707 /* Vector ADD. */
11708 *cost += extra_cost->vect.alu;
11710 else if (GET_MODE_CLASS (mode) == MODE_INT)
11712 /* ADD. */
11713 *cost += extra_cost->alu.arith;
11715 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11717 /* FADD. */
11718 *cost += extra_cost->fp[mode == DFmode].addsub;
11721 return true;
11724 case BSWAP:
11725 *cost = COSTS_N_INSNS (1);
11727 if (speed)
11729 if (VECTOR_MODE_P (mode))
11730 *cost += extra_cost->vect.alu;
11731 else
11732 *cost += extra_cost->alu.rev;
11734 return false;
11736 case IOR:
11737 if (aarch_rev16_p (x))
11739 *cost = COSTS_N_INSNS (1);
11741 if (speed)
11743 if (VECTOR_MODE_P (mode))
11744 *cost += extra_cost->vect.alu;
11745 else
11746 *cost += extra_cost->alu.rev;
11748 return true;
11751 if (aarch64_extr_rtx_p (x, &op0, &op1))
11753 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11754 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11755 if (speed)
11756 *cost += extra_cost->alu.shift;
11758 return true;
11760 /* Fall through. */
11761 case XOR:
11762 case AND:
11763 cost_logic:
11764 op0 = XEXP (x, 0);
11765 op1 = XEXP (x, 1);
11767 if (VECTOR_MODE_P (mode))
11769 if (speed)
11770 *cost += extra_cost->vect.alu;
11771 return true;
11774 if (code == AND
11775 && GET_CODE (op0) == MULT
11776 && CONST_INT_P (XEXP (op0, 1))
11777 && CONST_INT_P (op1)
11778 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11779 INTVAL (op1)) != 0)
11781 /* This is a UBFM/SBFM. */
11782 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11783 if (speed)
11784 *cost += extra_cost->alu.bfx;
11785 return true;
11788 if (is_int_mode (mode, &int_mode))
11790 if (CONST_INT_P (op1))
11792 /* We have a mask + shift version of a UBFIZ
11793 i.e. the *andim_ashift<mode>_bfiz pattern. */
11794 if (GET_CODE (op0) == ASHIFT
11795 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11796 XEXP (op0, 1)))
11798 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11799 (enum rtx_code) code, 0, speed);
11800 if (speed)
11801 *cost += extra_cost->alu.bfx;
11803 return true;
11805 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11807 /* We possibly get the immediate for free, this is not
11808 modelled. */
11809 *cost += rtx_cost (op0, int_mode,
11810 (enum rtx_code) code, 0, speed);
11811 if (speed)
11812 *cost += extra_cost->alu.logical;
11814 return true;
11817 else
11819 rtx new_op0 = op0;
11821 /* Handle ORN, EON, or BIC. */
11822 if (GET_CODE (op0) == NOT)
11823 op0 = XEXP (op0, 0);
11825 new_op0 = aarch64_strip_shift (op0);
11827 /* If we had a shift on op0 then this is a logical-shift-
11828 by-register/immediate operation. Otherwise, this is just
11829 a logical operation. */
11830 if (speed)
11832 if (new_op0 != op0)
11834 /* Shift by immediate. */
11835 if (CONST_INT_P (XEXP (op0, 1)))
11836 *cost += extra_cost->alu.log_shift;
11837 else
11838 *cost += extra_cost->alu.log_shift_reg;
11840 else
11841 *cost += extra_cost->alu.logical;
11844 /* In both cases we want to cost both operands. */
11845 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11846 0, speed);
11847 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11848 1, speed);
11850 return true;
11853 return false;
11855 case NOT:
11856 x = XEXP (x, 0);
11857 op0 = aarch64_strip_shift (x);
11859 if (VECTOR_MODE_P (mode))
11861 /* Vector NOT. */
11862 *cost += extra_cost->vect.alu;
11863 return false;
11866 /* MVN-shifted-reg. */
11867 if (op0 != x)
11869 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11871 if (speed)
11872 *cost += extra_cost->alu.log_shift;
11874 return true;
11876 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11877 Handle the second form here taking care that 'a' in the above can
11878 be a shift. */
11879 else if (GET_CODE (op0) == XOR)
11881 rtx newop0 = XEXP (op0, 0);
11882 rtx newop1 = XEXP (op0, 1);
11883 rtx op0_stripped = aarch64_strip_shift (newop0);
11885 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11886 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11888 if (speed)
11890 if (op0_stripped != newop0)
11891 *cost += extra_cost->alu.log_shift;
11892 else
11893 *cost += extra_cost->alu.logical;
11896 return true;
11898 /* MVN. */
11899 if (speed)
11900 *cost += extra_cost->alu.logical;
11902 return false;
11904 case ZERO_EXTEND:
11906 op0 = XEXP (x, 0);
11907 /* If a value is written in SI mode, then zero extended to DI
11908 mode, the operation will in general be free as a write to
11909 a 'w' register implicitly zeroes the upper bits of an 'x'
11910 register. However, if this is
11912 (set (reg) (zero_extend (reg)))
11914 we must cost the explicit register move. */
11915 if (mode == DImode
11916 && GET_MODE (op0) == SImode
11917 && outer == SET)
11919 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11921 /* If OP_COST is non-zero, then the cost of the zero extend
11922 is effectively the cost of the inner operation. Otherwise
11923 we have a MOV instruction and we take the cost from the MOV
11924 itself. This is true independently of whether we are
11925 optimizing for space or time. */
11926 if (op_cost)
11927 *cost = op_cost;
11929 return true;
11931 else if (MEM_P (op0))
11933 /* All loads can zero extend to any size for free. */
11934 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11935 return true;
11938 op0 = aarch64_extend_bitfield_pattern_p (x);
11939 if (op0)
11941 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11942 if (speed)
11943 *cost += extra_cost->alu.bfx;
11944 return true;
11947 if (speed)
11949 if (VECTOR_MODE_P (mode))
11951 /* UMOV. */
11952 *cost += extra_cost->vect.alu;
11954 else
11956 /* We generate an AND instead of UXTB/UXTH. */
11957 *cost += extra_cost->alu.logical;
11960 return false;
11962 case SIGN_EXTEND:
11963 if (MEM_P (XEXP (x, 0)))
11965 /* LDRSH. */
11966 if (speed)
11968 rtx address = XEXP (XEXP (x, 0), 0);
11969 *cost += extra_cost->ldst.load_sign_extend;
11971 *cost +=
11972 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11973 0, speed));
11975 return true;
11978 op0 = aarch64_extend_bitfield_pattern_p (x);
11979 if (op0)
11981 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11982 if (speed)
11983 *cost += extra_cost->alu.bfx;
11984 return true;
11987 if (speed)
11989 if (VECTOR_MODE_P (mode))
11990 *cost += extra_cost->vect.alu;
11991 else
11992 *cost += extra_cost->alu.extend;
11994 return false;
11996 case ASHIFT:
11997 op0 = XEXP (x, 0);
11998 op1 = XEXP (x, 1);
12000 if (CONST_INT_P (op1))
12002 if (speed)
12004 if (VECTOR_MODE_P (mode))
12006 /* Vector shift (immediate). */
12007 *cost += extra_cost->vect.alu;
12009 else
12011 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
12012 aliases. */
12013 *cost += extra_cost->alu.shift;
12017 /* We can incorporate zero/sign extend for free. */
12018 if (GET_CODE (op0) == ZERO_EXTEND
12019 || GET_CODE (op0) == SIGN_EXTEND)
12020 op0 = XEXP (op0, 0);
12022 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12023 return true;
12025 else
12027 if (VECTOR_MODE_P (mode))
12029 if (speed)
12030 /* Vector shift (register). */
12031 *cost += extra_cost->vect.alu;
12033 else
12035 if (speed)
12036 /* LSLV. */
12037 *cost += extra_cost->alu.shift_reg;
12039 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12040 && CONST_INT_P (XEXP (op1, 1))
12041 && known_eq (INTVAL (XEXP (op1, 1)),
12042 GET_MODE_BITSIZE (mode) - 1))
12044 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12045 /* We already demanded XEXP (op1, 0) to be REG_P, so
12046 don't recurse into it. */
12047 return true;
12050 return false; /* All arguments need to be in registers. */
12053 case ROTATE:
12054 case ROTATERT:
12055 case LSHIFTRT:
12056 case ASHIFTRT:
12057 op0 = XEXP (x, 0);
12058 op1 = XEXP (x, 1);
12060 if (CONST_INT_P (op1))
12062 /* ASR (immediate) and friends. */
12063 if (speed)
12065 if (VECTOR_MODE_P (mode))
12066 *cost += extra_cost->vect.alu;
12067 else
12068 *cost += extra_cost->alu.shift;
12071 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12072 return true;
12074 else
12076 if (VECTOR_MODE_P (mode))
12078 if (speed)
12079 /* Vector shift (register). */
12080 *cost += extra_cost->vect.alu;
12082 else
12084 if (speed)
12085 /* ASR (register) and friends. */
12086 *cost += extra_cost->alu.shift_reg;
12088 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12089 && CONST_INT_P (XEXP (op1, 1))
12090 && known_eq (INTVAL (XEXP (op1, 1)),
12091 GET_MODE_BITSIZE (mode) - 1))
12093 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12094 /* We already demanded XEXP (op1, 0) to be REG_P, so
12095 don't recurse into it. */
12096 return true;
12099 return false; /* All arguments need to be in registers. */
12102 case SYMBOL_REF:
12104 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12105 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12107 /* LDR. */
12108 if (speed)
12109 *cost += extra_cost->ldst.load;
12111 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12112 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12114 /* ADRP, followed by ADD. */
12115 *cost += COSTS_N_INSNS (1);
12116 if (speed)
12117 *cost += 2 * extra_cost->alu.arith;
12119 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12120 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12122 /* ADR. */
12123 if (speed)
12124 *cost += extra_cost->alu.arith;
12127 if (flag_pic)
12129 /* One extra load instruction, after accessing the GOT. */
12130 *cost += COSTS_N_INSNS (1);
12131 if (speed)
12132 *cost += extra_cost->ldst.load;
12134 return true;
12136 case HIGH:
12137 case LO_SUM:
12138 /* ADRP/ADD (immediate). */
12139 if (speed)
12140 *cost += extra_cost->alu.arith;
12141 return true;
12143 case ZERO_EXTRACT:
12144 case SIGN_EXTRACT:
12145 /* UBFX/SBFX. */
12146 if (speed)
12148 if (VECTOR_MODE_P (mode))
12149 *cost += extra_cost->vect.alu;
12150 else
12151 *cost += extra_cost->alu.bfx;
12154 /* We can trust that the immediates used will be correct (there
12155 are no by-register forms), so we need only cost op0. */
12156 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12157 return true;
12159 case MULT:
12160 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12161 /* aarch64_rtx_mult_cost always handles recursion to its
12162 operands. */
12163 return true;
12165 case MOD:
12166 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12167 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
12168 an unconditional negate. This case should only ever be reached through
12169 the set_smod_pow2_cheap check in expmed.c. */
12170 if (CONST_INT_P (XEXP (x, 1))
12171 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12172 && (mode == SImode || mode == DImode))
12174 /* We expand to 4 instructions. Reset the baseline. */
12175 *cost = COSTS_N_INSNS (4);
12177 if (speed)
12178 *cost += 2 * extra_cost->alu.logical
12179 + 2 * extra_cost->alu.arith;
12181 return true;
12184 /* Fall-through. */
12185 case UMOD:
12186 if (speed)
12188 /* Slighly prefer UMOD over SMOD. */
12189 if (VECTOR_MODE_P (mode))
12190 *cost += extra_cost->vect.alu;
12191 else if (GET_MODE_CLASS (mode) == MODE_INT)
12192 *cost += (extra_cost->mult[mode == DImode].add
12193 + extra_cost->mult[mode == DImode].idiv
12194 + (code == MOD ? 1 : 0));
12196 return false; /* All arguments need to be in registers. */
12198 case DIV:
12199 case UDIV:
12200 case SQRT:
12201 if (speed)
12203 if (VECTOR_MODE_P (mode))
12204 *cost += extra_cost->vect.alu;
12205 else if (GET_MODE_CLASS (mode) == MODE_INT)
12206 /* There is no integer SQRT, so only DIV and UDIV can get
12207 here. */
12208 *cost += (extra_cost->mult[mode == DImode].idiv
12209 /* Slighly prefer UDIV over SDIV. */
12210 + (code == DIV ? 1 : 0));
12211 else
12212 *cost += extra_cost->fp[mode == DFmode].div;
12214 return false; /* All arguments need to be in registers. */
12216 case IF_THEN_ELSE:
12217 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12218 XEXP (x, 2), cost, speed);
12220 case EQ:
12221 case NE:
12222 case GT:
12223 case GTU:
12224 case LT:
12225 case LTU:
12226 case GE:
12227 case GEU:
12228 case LE:
12229 case LEU:
12231 return false; /* All arguments must be in registers. */
12233 case FMA:
12234 op0 = XEXP (x, 0);
12235 op1 = XEXP (x, 1);
12236 op2 = XEXP (x, 2);
12238 if (speed)
12240 if (VECTOR_MODE_P (mode))
12241 *cost += extra_cost->vect.alu;
12242 else
12243 *cost += extra_cost->fp[mode == DFmode].fma;
12246 /* FMSUB, FNMADD, and FNMSUB are free. */
12247 if (GET_CODE (op0) == NEG)
12248 op0 = XEXP (op0, 0);
12250 if (GET_CODE (op2) == NEG)
12251 op2 = XEXP (op2, 0);
12253 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12254 and the by-element operand as operand 0. */
12255 if (GET_CODE (op1) == NEG)
12256 op1 = XEXP (op1, 0);
12258 /* Catch vector-by-element operations. The by-element operand can
12259 either be (vec_duplicate (vec_select (x))) or just
12260 (vec_select (x)), depending on whether we are multiplying by
12261 a vector or a scalar.
12263 Canonicalization is not very good in these cases, FMA4 will put the
12264 by-element operand as operand 0, FNMA4 will have it as operand 1. */
12265 if (GET_CODE (op0) == VEC_DUPLICATE)
12266 op0 = XEXP (op0, 0);
12267 else if (GET_CODE (op1) == VEC_DUPLICATE)
12268 op1 = XEXP (op1, 0);
12270 if (GET_CODE (op0) == VEC_SELECT)
12271 op0 = XEXP (op0, 0);
12272 else if (GET_CODE (op1) == VEC_SELECT)
12273 op1 = XEXP (op1, 0);
12275 /* If the remaining parameters are not registers,
12276 get the cost to put them into registers. */
12277 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12278 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12279 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12280 return true;
12282 case FLOAT:
12283 case UNSIGNED_FLOAT:
12284 if (speed)
12285 *cost += extra_cost->fp[mode == DFmode].fromint;
12286 return false;
12288 case FLOAT_EXTEND:
12289 if (speed)
12291 if (VECTOR_MODE_P (mode))
12293 /*Vector truncate. */
12294 *cost += extra_cost->vect.alu;
12296 else
12297 *cost += extra_cost->fp[mode == DFmode].widen;
12299 return false;
12301 case FLOAT_TRUNCATE:
12302 if (speed)
12304 if (VECTOR_MODE_P (mode))
12306 /*Vector conversion. */
12307 *cost += extra_cost->vect.alu;
12309 else
12310 *cost += extra_cost->fp[mode == DFmode].narrow;
12312 return false;
12314 case FIX:
12315 case UNSIGNED_FIX:
12316 x = XEXP (x, 0);
12317 /* Strip the rounding part. They will all be implemented
12318 by the fcvt* family of instructions anyway. */
12319 if (GET_CODE (x) == UNSPEC)
12321 unsigned int uns_code = XINT (x, 1);
12323 if (uns_code == UNSPEC_FRINTA
12324 || uns_code == UNSPEC_FRINTM
12325 || uns_code == UNSPEC_FRINTN
12326 || uns_code == UNSPEC_FRINTP
12327 || uns_code == UNSPEC_FRINTZ)
12328 x = XVECEXP (x, 0, 0);
12331 if (speed)
12333 if (VECTOR_MODE_P (mode))
12334 *cost += extra_cost->vect.alu;
12335 else
12336 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12339 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12340 fixed-point fcvt. */
12341 if (GET_CODE (x) == MULT
12342 && ((VECTOR_MODE_P (mode)
12343 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12344 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12346 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12347 0, speed);
12348 return true;
12351 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12352 return true;
12354 case ABS:
12355 if (VECTOR_MODE_P (mode))
12357 /* ABS (vector). */
12358 if (speed)
12359 *cost += extra_cost->vect.alu;
12361 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12363 op0 = XEXP (x, 0);
12365 /* FABD, which is analogous to FADD. */
12366 if (GET_CODE (op0) == MINUS)
12368 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12369 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12370 if (speed)
12371 *cost += extra_cost->fp[mode == DFmode].addsub;
12373 return true;
12375 /* Simple FABS is analogous to FNEG. */
12376 if (speed)
12377 *cost += extra_cost->fp[mode == DFmode].neg;
12379 else
12381 /* Integer ABS will either be split to
12382 two arithmetic instructions, or will be an ABS
12383 (scalar), which we don't model. */
12384 *cost = COSTS_N_INSNS (2);
12385 if (speed)
12386 *cost += 2 * extra_cost->alu.arith;
12388 return false;
12390 case SMAX:
12391 case SMIN:
12392 if (speed)
12394 if (VECTOR_MODE_P (mode))
12395 *cost += extra_cost->vect.alu;
12396 else
12398 /* FMAXNM/FMINNM/FMAX/FMIN.
12399 TODO: This may not be accurate for all implementations, but
12400 we do not model this in the cost tables. */
12401 *cost += extra_cost->fp[mode == DFmode].addsub;
12404 return false;
12406 case UNSPEC:
12407 /* The floating point round to integer frint* instructions. */
12408 if (aarch64_frint_unspec_p (XINT (x, 1)))
12410 if (speed)
12411 *cost += extra_cost->fp[mode == DFmode].roundint;
12413 return false;
12416 if (XINT (x, 1) == UNSPEC_RBIT)
12418 if (speed)
12419 *cost += extra_cost->alu.rev;
12421 return false;
12423 break;
12425 case TRUNCATE:
12427 /* Decompose <su>muldi3_highpart. */
12428 if (/* (truncate:DI */
12429 mode == DImode
12430 /* (lshiftrt:TI */
12431 && GET_MODE (XEXP (x, 0)) == TImode
12432 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12433 /* (mult:TI */
12434 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12435 /* (ANY_EXTEND:TI (reg:DI))
12436 (ANY_EXTEND:TI (reg:DI))) */
12437 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12438 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12439 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12440 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12441 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12442 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12443 /* (const_int 64) */
12444 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12445 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12447 /* UMULH/SMULH. */
12448 if (speed)
12449 *cost += extra_cost->mult[mode == DImode].extend;
12450 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12451 mode, MULT, 0, speed);
12452 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12453 mode, MULT, 1, speed);
12454 return true;
12457 /* Fall through. */
12458 default:
12459 break;
12462 if (dump_file
12463 && flag_aarch64_verbose_cost)
12464 fprintf (dump_file,
12465 "\nFailed to cost RTX. Assuming default cost.\n");
12467 return true;
12470 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12471 calculated for X. This cost is stored in *COST. Returns true
12472 if the total cost of X was calculated. */
12473 static bool
12474 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12475 int param, int *cost, bool speed)
12477 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12479 if (dump_file
12480 && flag_aarch64_verbose_cost)
12482 print_rtl_single (dump_file, x);
12483 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12484 speed ? "Hot" : "Cold",
12485 *cost, result ? "final" : "partial");
12488 return result;
12491 static int
12492 aarch64_register_move_cost (machine_mode mode,
12493 reg_class_t from_i, reg_class_t to_i)
12495 enum reg_class from = (enum reg_class) from_i;
12496 enum reg_class to = (enum reg_class) to_i;
12497 const struct cpu_regmove_cost *regmove_cost
12498 = aarch64_tune_params.regmove_cost;
12500 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12501 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12502 to = GENERAL_REGS;
12504 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12505 from = GENERAL_REGS;
12507 /* Make RDFFR very expensive. In particular, if we know that the FFR
12508 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12509 as a way of obtaining a PTRUE. */
12510 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12511 && hard_reg_set_subset_p (reg_class_contents[from_i],
12512 reg_class_contents[FFR_REGS]))
12513 return 80;
12515 /* Moving between GPR and stack cost is the same as GP2GP. */
12516 if ((from == GENERAL_REGS && to == STACK_REG)
12517 || (to == GENERAL_REGS && from == STACK_REG))
12518 return regmove_cost->GP2GP;
12520 /* To/From the stack register, we move via the gprs. */
12521 if (to == STACK_REG || from == STACK_REG)
12522 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12523 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12525 if (known_eq (GET_MODE_SIZE (mode), 16))
12527 /* 128-bit operations on general registers require 2 instructions. */
12528 if (from == GENERAL_REGS && to == GENERAL_REGS)
12529 return regmove_cost->GP2GP * 2;
12530 else if (from == GENERAL_REGS)
12531 return regmove_cost->GP2FP * 2;
12532 else if (to == GENERAL_REGS)
12533 return regmove_cost->FP2GP * 2;
12535 /* When AdvSIMD instructions are disabled it is not possible to move
12536 a 128-bit value directly between Q registers. This is handled in
12537 secondary reload. A general register is used as a scratch to move
12538 the upper DI value and the lower DI value is moved directly,
12539 hence the cost is the sum of three moves. */
12540 if (! TARGET_SIMD)
12541 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12543 return regmove_cost->FP2FP;
12546 if (from == GENERAL_REGS && to == GENERAL_REGS)
12547 return regmove_cost->GP2GP;
12548 else if (from == GENERAL_REGS)
12549 return regmove_cost->GP2FP;
12550 else if (to == GENERAL_REGS)
12551 return regmove_cost->FP2GP;
12553 return regmove_cost->FP2FP;
12556 static int
12557 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12558 reg_class_t rclass ATTRIBUTE_UNUSED,
12559 bool in ATTRIBUTE_UNUSED)
12561 return aarch64_tune_params.memmov_cost;
12564 /* Implement TARGET_INIT_BUILTINS. */
12565 static void
12566 aarch64_init_builtins ()
12568 aarch64_general_init_builtins ();
12569 aarch64_sve::init_builtins ();
12572 /* Implement TARGET_FOLD_BUILTIN. */
12573 static tree
12574 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12576 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12577 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12578 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12579 switch (code & AARCH64_BUILTIN_CLASS)
12581 case AARCH64_BUILTIN_GENERAL:
12582 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12584 case AARCH64_BUILTIN_SVE:
12585 return NULL_TREE;
12587 gcc_unreachable ();
12590 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12591 static bool
12592 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12594 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12595 tree fndecl = gimple_call_fndecl (stmt);
12596 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12597 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12598 gimple *new_stmt = NULL;
12599 switch (code & AARCH64_BUILTIN_CLASS)
12601 case AARCH64_BUILTIN_GENERAL:
12602 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12603 break;
12605 case AARCH64_BUILTIN_SVE:
12606 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12607 break;
12610 if (!new_stmt)
12611 return false;
12613 gsi_replace (gsi, new_stmt, true);
12614 return true;
12617 /* Implement TARGET_EXPAND_BUILTIN. */
12618 static rtx
12619 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12621 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12622 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12623 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12624 switch (code & AARCH64_BUILTIN_CLASS)
12626 case AARCH64_BUILTIN_GENERAL:
12627 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12629 case AARCH64_BUILTIN_SVE:
12630 return aarch64_sve::expand_builtin (subcode, exp, target);
12632 gcc_unreachable ();
12635 /* Implement TARGET_BUILTIN_DECL. */
12636 static tree
12637 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12639 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12640 switch (code & AARCH64_BUILTIN_CLASS)
12642 case AARCH64_BUILTIN_GENERAL:
12643 return aarch64_general_builtin_decl (subcode, initialize_p);
12645 case AARCH64_BUILTIN_SVE:
12646 return aarch64_sve::builtin_decl (subcode, initialize_p);
12648 gcc_unreachable ();
12651 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12652 to optimize 1.0/sqrt. */
12654 static bool
12655 use_rsqrt_p (machine_mode mode)
12657 return (!flag_trapping_math
12658 && flag_unsafe_math_optimizations
12659 && ((aarch64_tune_params.approx_modes->recip_sqrt
12660 & AARCH64_APPROX_MODE (mode))
12661 || flag_mrecip_low_precision_sqrt));
12664 /* Function to decide when to use the approximate reciprocal square root
12665 builtin. */
12667 static tree
12668 aarch64_builtin_reciprocal (tree fndecl)
12670 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12672 if (!use_rsqrt_p (mode))
12673 return NULL_TREE;
12674 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12675 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12676 switch (code & AARCH64_BUILTIN_CLASS)
12678 case AARCH64_BUILTIN_GENERAL:
12679 return aarch64_general_builtin_rsqrt (subcode);
12681 case AARCH64_BUILTIN_SVE:
12682 return NULL_TREE;
12684 gcc_unreachable ();
12687 /* Emit instruction sequence to compute either the approximate square root
12688 or its approximate reciprocal, depending on the flag RECP, and return
12689 whether the sequence was emitted or not. */
12691 bool
12692 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12694 machine_mode mode = GET_MODE (dst);
12696 if (GET_MODE_INNER (mode) == HFmode)
12698 gcc_assert (!recp);
12699 return false;
12702 if (!recp)
12704 if (!(flag_mlow_precision_sqrt
12705 || (aarch64_tune_params.approx_modes->sqrt
12706 & AARCH64_APPROX_MODE (mode))))
12707 return false;
12709 if (flag_finite_math_only
12710 || flag_trapping_math
12711 || !flag_unsafe_math_optimizations
12712 || optimize_function_for_size_p (cfun))
12713 return false;
12715 else
12716 /* Caller assumes we cannot fail. */
12717 gcc_assert (use_rsqrt_p (mode));
12719 machine_mode mmsk = (VECTOR_MODE_P (mode)
12720 ? related_int_vector_mode (mode).require ()
12721 : int_mode_for_mode (mode).require ());
12722 rtx xmsk = gen_reg_rtx (mmsk);
12723 if (!recp)
12724 /* When calculating the approximate square root, compare the
12725 argument with 0.0 and create a mask. */
12726 emit_insn (gen_rtx_SET (xmsk,
12727 gen_rtx_NEG (mmsk,
12728 gen_rtx_EQ (mmsk, src,
12729 CONST0_RTX (mode)))));
12731 /* Estimate the approximate reciprocal square root. */
12732 rtx xdst = gen_reg_rtx (mode);
12733 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12735 /* Iterate over the series twice for SF and thrice for DF. */
12736 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12738 /* Optionally iterate over the series once less for faster performance
12739 while sacrificing the accuracy. */
12740 if ((recp && flag_mrecip_low_precision_sqrt)
12741 || (!recp && flag_mlow_precision_sqrt))
12742 iterations--;
12744 /* Iterate over the series to calculate the approximate reciprocal square
12745 root. */
12746 rtx x1 = gen_reg_rtx (mode);
12747 while (iterations--)
12749 rtx x2 = gen_reg_rtx (mode);
12750 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
12752 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12754 if (iterations > 0)
12755 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
12758 if (!recp)
12760 /* Qualify the approximate reciprocal square root when the argument is
12761 0.0 by squashing the intermediary result to 0.0. */
12762 rtx xtmp = gen_reg_rtx (mmsk);
12763 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12764 gen_rtx_SUBREG (mmsk, xdst, 0)));
12765 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12767 /* Calculate the approximate square root. */
12768 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
12771 /* Finalize the approximation. */
12772 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
12774 return true;
12777 /* Emit the instruction sequence to compute the approximation for the division
12778 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12780 bool
12781 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12783 machine_mode mode = GET_MODE (quo);
12785 if (GET_MODE_INNER (mode) == HFmode)
12786 return false;
12788 bool use_approx_division_p = (flag_mlow_precision_div
12789 || (aarch64_tune_params.approx_modes->division
12790 & AARCH64_APPROX_MODE (mode)));
12792 if (!flag_finite_math_only
12793 || flag_trapping_math
12794 || !flag_unsafe_math_optimizations
12795 || optimize_function_for_size_p (cfun)
12796 || !use_approx_division_p)
12797 return false;
12799 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12800 return false;
12802 /* Estimate the approximate reciprocal. */
12803 rtx xrcp = gen_reg_rtx (mode);
12804 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12806 /* Iterate over the series twice for SF and thrice for DF. */
12807 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12809 /* Optionally iterate over the series once less for faster performance,
12810 while sacrificing the accuracy. */
12811 if (flag_mlow_precision_div)
12812 iterations--;
12814 /* Iterate over the series to calculate the approximate reciprocal. */
12815 rtx xtmp = gen_reg_rtx (mode);
12816 while (iterations--)
12818 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12820 if (iterations > 0)
12821 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
12824 if (num != CONST1_RTX (mode))
12826 /* As the approximate reciprocal of DEN is already calculated, only
12827 calculate the approximate division when NUM is not 1.0. */
12828 rtx xnum = force_reg (mode, num);
12829 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
12832 /* Finalize the approximation. */
12833 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
12834 return true;
12837 /* Return the number of instructions that can be issued per cycle. */
12838 static int
12839 aarch64_sched_issue_rate (void)
12841 return aarch64_tune_params.issue_rate;
12844 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12845 static int
12846 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12848 if (DEBUG_INSN_P (insn))
12849 return more;
12851 rtx_code code = GET_CODE (PATTERN (insn));
12852 if (code == USE || code == CLOBBER)
12853 return more;
12855 if (get_attr_type (insn) == TYPE_NO_INSN)
12856 return more;
12858 return more - 1;
12861 static int
12862 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12864 int issue_rate = aarch64_sched_issue_rate ();
12866 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12870 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12871 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12872 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12874 static int
12875 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
12876 int ready_index)
12878 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
12882 /* Vectorizer cost model target hooks. */
12884 /* Implement targetm.vectorize.builtin_vectorization_cost. */
12885 static int
12886 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
12887 tree vectype,
12888 int misalign ATTRIBUTE_UNUSED)
12890 unsigned elements;
12891 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
12892 bool fp = false;
12894 if (vectype != NULL)
12895 fp = FLOAT_TYPE_P (vectype);
12897 switch (type_of_cost)
12899 case scalar_stmt:
12900 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
12902 case scalar_load:
12903 return costs->scalar_load_cost;
12905 case scalar_store:
12906 return costs->scalar_store_cost;
12908 case vector_stmt:
12909 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12911 case vector_load:
12912 return costs->vec_align_load_cost;
12914 case vector_store:
12915 return costs->vec_store_cost;
12917 case vec_to_scalar:
12918 return costs->vec_to_scalar_cost;
12920 case scalar_to_vec:
12921 return costs->scalar_to_vec_cost;
12923 case unaligned_load:
12924 case vector_gather_load:
12925 return costs->vec_unalign_load_cost;
12927 case unaligned_store:
12928 case vector_scatter_store:
12929 return costs->vec_unalign_store_cost;
12931 case cond_branch_taken:
12932 return costs->cond_taken_branch_cost;
12934 case cond_branch_not_taken:
12935 return costs->cond_not_taken_branch_cost;
12937 case vec_perm:
12938 return costs->vec_permute_cost;
12940 case vec_promote_demote:
12941 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12943 case vec_construct:
12944 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12945 return elements / 2 + 1;
12947 default:
12948 gcc_unreachable ();
12952 /* Return true if STMT_INFO extends the result of a load. */
12953 static bool
12954 aarch64_extending_load_p (stmt_vec_info stmt_info)
12956 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12957 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12958 return false;
12960 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
12961 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12962 tree rhs_type = TREE_TYPE (rhs);
12963 if (!INTEGRAL_TYPE_P (lhs_type)
12964 || !INTEGRAL_TYPE_P (rhs_type)
12965 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
12966 return false;
12968 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
12969 return (def_stmt_info
12970 && STMT_VINFO_DATA_REF (def_stmt_info)
12971 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
12974 /* Return true if STMT_INFO is an integer truncation. */
12975 static bool
12976 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
12978 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12979 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12980 return false;
12982 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12983 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
12984 return (INTEGRAL_TYPE_P (lhs_type)
12985 && INTEGRAL_TYPE_P (rhs_type)
12986 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
12989 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
12990 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
12991 for SVE targets. */
12992 static unsigned int
12993 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
12994 unsigned int stmt_cost)
12996 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
12997 vector register size or number of units. Integer promotions of this
12998 type therefore map to SXT[BHW] or UXT[BHW].
13000 Most loads have extending forms that can do the sign or zero extension
13001 on the fly. Optimistically assume that a load followed by an extension
13002 will fold to this form during combine, and that the extension therefore
13003 comes for free. */
13004 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13005 stmt_cost = 0;
13007 /* For similar reasons, vector_stmt integer truncations are a no-op,
13008 because we can just ignore the unused upper bits of the source. */
13009 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13010 stmt_cost = 0;
13012 return stmt_cost;
13015 /* Implement targetm.vectorize.add_stmt_cost. */
13016 static unsigned
13017 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13018 struct _stmt_vec_info *stmt_info, int misalign,
13019 enum vect_cost_model_location where)
13021 unsigned *cost = (unsigned *) data;
13022 unsigned retval = 0;
13024 if (flag_vect_cost_model)
13026 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13027 int stmt_cost =
13028 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13030 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13031 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13033 /* Statements in an inner loop relative to the loop being
13034 vectorized are weighted more heavily. The value here is
13035 arbitrary and could potentially be improved with analysis. */
13036 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13037 count *= 50; /* FIXME */
13039 retval = (unsigned) (count * stmt_cost);
13040 cost[where] += retval;
13043 return retval;
13046 static void initialize_aarch64_code_model (struct gcc_options *);
13048 /* Parse the TO_PARSE string and put the architecture struct that it
13049 selects into RES and the architectural features into ISA_FLAGS.
13050 Return an aarch64_parse_opt_result describing the parse result.
13051 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13052 When the TO_PARSE string contains an invalid extension,
13053 a copy of the string is created and stored to INVALID_EXTENSION. */
13055 static enum aarch64_parse_opt_result
13056 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13057 uint64_t *isa_flags, std::string *invalid_extension)
13059 const char *ext;
13060 const struct processor *arch;
13061 size_t len;
13063 ext = strchr (to_parse, '+');
13065 if (ext != NULL)
13066 len = ext - to_parse;
13067 else
13068 len = strlen (to_parse);
13070 if (len == 0)
13071 return AARCH64_PARSE_MISSING_ARG;
13074 /* Loop through the list of supported ARCHes to find a match. */
13075 for (arch = all_architectures; arch->name != NULL; arch++)
13077 if (strlen (arch->name) == len
13078 && strncmp (arch->name, to_parse, len) == 0)
13080 uint64_t isa_temp = arch->flags;
13082 if (ext != NULL)
13084 /* TO_PARSE string contains at least one extension. */
13085 enum aarch64_parse_opt_result ext_res
13086 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13088 if (ext_res != AARCH64_PARSE_OK)
13089 return ext_res;
13091 /* Extension parsing was successful. Confirm the result
13092 arch and ISA flags. */
13093 *res = arch;
13094 *isa_flags = isa_temp;
13095 return AARCH64_PARSE_OK;
13099 /* ARCH name not found in list. */
13100 return AARCH64_PARSE_INVALID_ARG;
13103 /* Parse the TO_PARSE string and put the result tuning in RES and the
13104 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13105 describing the parse result. If there is an error parsing, RES and
13106 ISA_FLAGS are left unchanged.
13107 When the TO_PARSE string contains an invalid extension,
13108 a copy of the string is created and stored to INVALID_EXTENSION. */
13110 static enum aarch64_parse_opt_result
13111 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13112 uint64_t *isa_flags, std::string *invalid_extension)
13114 const char *ext;
13115 const struct processor *cpu;
13116 size_t len;
13118 ext = strchr (to_parse, '+');
13120 if (ext != NULL)
13121 len = ext - to_parse;
13122 else
13123 len = strlen (to_parse);
13125 if (len == 0)
13126 return AARCH64_PARSE_MISSING_ARG;
13129 /* Loop through the list of supported CPUs to find a match. */
13130 for (cpu = all_cores; cpu->name != NULL; cpu++)
13132 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13134 uint64_t isa_temp = cpu->flags;
13137 if (ext != NULL)
13139 /* TO_PARSE string contains at least one extension. */
13140 enum aarch64_parse_opt_result ext_res
13141 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13143 if (ext_res != AARCH64_PARSE_OK)
13144 return ext_res;
13146 /* Extension parsing was successfull. Confirm the result
13147 cpu and ISA flags. */
13148 *res = cpu;
13149 *isa_flags = isa_temp;
13150 return AARCH64_PARSE_OK;
13154 /* CPU name not found in list. */
13155 return AARCH64_PARSE_INVALID_ARG;
13158 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13159 Return an aarch64_parse_opt_result describing the parse result.
13160 If the parsing fails the RES does not change. */
13162 static enum aarch64_parse_opt_result
13163 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13165 const struct processor *cpu;
13167 /* Loop through the list of supported CPUs to find a match. */
13168 for (cpu = all_cores; cpu->name != NULL; cpu++)
13170 if (strcmp (cpu->name, to_parse) == 0)
13172 *res = cpu;
13173 return AARCH64_PARSE_OK;
13177 /* CPU name not found in list. */
13178 return AARCH64_PARSE_INVALID_ARG;
13181 /* Parse TOKEN, which has length LENGTH to see if it is an option
13182 described in FLAG. If it is, return the index bit for that fusion type.
13183 If not, error (printing OPTION_NAME) and return zero. */
13185 static unsigned int
13186 aarch64_parse_one_option_token (const char *token,
13187 size_t length,
13188 const struct aarch64_flag_desc *flag,
13189 const char *option_name)
13191 for (; flag->name != NULL; flag++)
13193 if (length == strlen (flag->name)
13194 && !strncmp (flag->name, token, length))
13195 return flag->flag;
13198 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13199 return 0;
13202 /* Parse OPTION which is a comma-separated list of flags to enable.
13203 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13204 default state we inherit from the CPU tuning structures. OPTION_NAME
13205 gives the top-level option we are parsing in the -moverride string,
13206 for use in error messages. */
13208 static unsigned int
13209 aarch64_parse_boolean_options (const char *option,
13210 const struct aarch64_flag_desc *flags,
13211 unsigned int initial_state,
13212 const char *option_name)
13214 const char separator = '.';
13215 const char* specs = option;
13216 const char* ntoken = option;
13217 unsigned int found_flags = initial_state;
13219 while ((ntoken = strchr (specs, separator)))
13221 size_t token_length = ntoken - specs;
13222 unsigned token_ops = aarch64_parse_one_option_token (specs,
13223 token_length,
13224 flags,
13225 option_name);
13226 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13227 in the token stream, reset the supported operations. So:
13229 adrp+add.cmp+branch.none.adrp+add
13231 would have the result of turning on only adrp+add fusion. */
13232 if (!token_ops)
13233 found_flags = 0;
13235 found_flags |= token_ops;
13236 specs = ++ntoken;
13239 /* We ended with a comma, print something. */
13240 if (!(*specs))
13242 error ("%s string ill-formed\n", option_name);
13243 return 0;
13246 /* We still have one more token to parse. */
13247 size_t token_length = strlen (specs);
13248 unsigned token_ops = aarch64_parse_one_option_token (specs,
13249 token_length,
13250 flags,
13251 option_name);
13252 if (!token_ops)
13253 found_flags = 0;
13255 found_flags |= token_ops;
13256 return found_flags;
13259 /* Support for overriding instruction fusion. */
13261 static void
13262 aarch64_parse_fuse_string (const char *fuse_string,
13263 struct tune_params *tune)
13265 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13266 aarch64_fusible_pairs,
13267 tune->fusible_ops,
13268 "fuse=");
13271 /* Support for overriding other tuning flags. */
13273 static void
13274 aarch64_parse_tune_string (const char *tune_string,
13275 struct tune_params *tune)
13277 tune->extra_tuning_flags
13278 = aarch64_parse_boolean_options (tune_string,
13279 aarch64_tuning_flags,
13280 tune->extra_tuning_flags,
13281 "tune=");
13284 /* Parse the sve_width tuning moverride string in TUNE_STRING.
13285 Accept the valid SVE vector widths allowed by
13286 aarch64_sve_vector_bits_enum and use it to override sve_width
13287 in TUNE. */
13289 static void
13290 aarch64_parse_sve_width_string (const char *tune_string,
13291 struct tune_params *tune)
13293 int width = -1;
13295 int n = sscanf (tune_string, "%d", &width);
13296 if (n == EOF)
13298 error ("invalid format for sve_width");
13299 return;
13301 switch (width)
13303 case SVE_128:
13304 case SVE_256:
13305 case SVE_512:
13306 case SVE_1024:
13307 case SVE_2048:
13308 break;
13309 default:
13310 error ("invalid sve_width value: %d", width);
13312 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13315 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
13316 we understand. If it is, extract the option string and handoff to
13317 the appropriate function. */
13319 void
13320 aarch64_parse_one_override_token (const char* token,
13321 size_t length,
13322 struct tune_params *tune)
13324 const struct aarch64_tuning_override_function *fn
13325 = aarch64_tuning_override_functions;
13327 const char *option_part = strchr (token, '=');
13328 if (!option_part)
13330 error ("tuning string missing in option (%s)", token);
13331 return;
13334 /* Get the length of the option name. */
13335 length = option_part - token;
13336 /* Skip the '=' to get to the option string. */
13337 option_part++;
13339 for (; fn->name != NULL; fn++)
13341 if (!strncmp (fn->name, token, length))
13343 fn->parse_override (option_part, tune);
13344 return;
13348 error ("unknown tuning option (%s)",token);
13349 return;
13352 /* A checking mechanism for the implementation of the tls size. */
13354 static void
13355 initialize_aarch64_tls_size (struct gcc_options *opts)
13357 if (aarch64_tls_size == 0)
13358 aarch64_tls_size = 24;
13360 switch (opts->x_aarch64_cmodel_var)
13362 case AARCH64_CMODEL_TINY:
13363 /* Both the default and maximum TLS size allowed under tiny is 1M which
13364 needs two instructions to address, so we clamp the size to 24. */
13365 if (aarch64_tls_size > 24)
13366 aarch64_tls_size = 24;
13367 break;
13368 case AARCH64_CMODEL_SMALL:
13369 /* The maximum TLS size allowed under small is 4G. */
13370 if (aarch64_tls_size > 32)
13371 aarch64_tls_size = 32;
13372 break;
13373 case AARCH64_CMODEL_LARGE:
13374 /* The maximum TLS size allowed under large is 16E.
13375 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13376 if (aarch64_tls_size > 48)
13377 aarch64_tls_size = 48;
13378 break;
13379 default:
13380 gcc_unreachable ();
13383 return;
13386 /* Parse STRING looking for options in the format:
13387 string :: option:string
13388 option :: name=substring
13389 name :: {a-z}
13390 substring :: defined by option. */
13392 static void
13393 aarch64_parse_override_string (const char* input_string,
13394 struct tune_params* tune)
13396 const char separator = ':';
13397 size_t string_length = strlen (input_string) + 1;
13398 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13399 char *string = string_root;
13400 strncpy (string, input_string, string_length);
13401 string[string_length - 1] = '\0';
13403 char* ntoken = string;
13405 while ((ntoken = strchr (string, separator)))
13407 size_t token_length = ntoken - string;
13408 /* Make this substring look like a string. */
13409 *ntoken = '\0';
13410 aarch64_parse_one_override_token (string, token_length, tune);
13411 string = ++ntoken;
13414 /* One last option to parse. */
13415 aarch64_parse_one_override_token (string, strlen (string), tune);
13416 free (string_root);
13420 static void
13421 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13423 if (accepted_branch_protection_string)
13425 opts->x_aarch64_branch_protection_string
13426 = xstrdup (accepted_branch_protection_string);
13429 /* PR 70044: We have to be careful about being called multiple times for the
13430 same function. This means all changes should be repeatable. */
13432 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13433 Disable the frame pointer flag so the mid-end will not use a frame
13434 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13435 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13436 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13437 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13438 if (opts->x_flag_omit_frame_pointer == 0)
13439 opts->x_flag_omit_frame_pointer = 2;
13441 /* If not optimizing for size, set the default
13442 alignment to what the target wants. */
13443 if (!opts->x_optimize_size)
13445 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13446 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13447 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13448 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13449 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13450 opts->x_str_align_functions = aarch64_tune_params.function_align;
13453 /* We default to no pc-relative literal loads. */
13455 aarch64_pcrelative_literal_loads = false;
13457 /* If -mpc-relative-literal-loads is set on the command line, this
13458 implies that the user asked for PC relative literal loads. */
13459 if (opts->x_pcrelative_literal_loads == 1)
13460 aarch64_pcrelative_literal_loads = true;
13462 /* In the tiny memory model it makes no sense to disallow PC relative
13463 literal pool loads. */
13464 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13465 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13466 aarch64_pcrelative_literal_loads = true;
13468 /* When enabling the lower precision Newton series for the square root, also
13469 enable it for the reciprocal square root, since the latter is an
13470 intermediary step for the former. */
13471 if (flag_mlow_precision_sqrt)
13472 flag_mrecip_low_precision_sqrt = true;
13475 /* 'Unpack' up the internal tuning structs and update the options
13476 in OPTS. The caller must have set up selected_tune and selected_arch
13477 as all the other target-specific codegen decisions are
13478 derived from them. */
13480 void
13481 aarch64_override_options_internal (struct gcc_options *opts)
13483 aarch64_tune_flags = selected_tune->flags;
13484 aarch64_tune = selected_tune->sched_core;
13485 /* Make a copy of the tuning parameters attached to the core, which
13486 we may later overwrite. */
13487 aarch64_tune_params = *(selected_tune->tune);
13488 aarch64_architecture_version = selected_arch->architecture_version;
13490 if (opts->x_aarch64_override_tune_string)
13491 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13492 &aarch64_tune_params);
13494 /* This target defaults to strict volatile bitfields. */
13495 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13496 opts->x_flag_strict_volatile_bitfields = 1;
13498 if (aarch64_stack_protector_guard == SSP_GLOBAL
13499 && opts->x_aarch64_stack_protector_guard_offset_str)
13501 error ("incompatible options %<-mstack-protector-guard=global%> and "
13502 "%<-mstack-protector-guard-offset=%s%>",
13503 aarch64_stack_protector_guard_offset_str);
13506 if (aarch64_stack_protector_guard == SSP_SYSREG
13507 && !(opts->x_aarch64_stack_protector_guard_offset_str
13508 && opts->x_aarch64_stack_protector_guard_reg_str))
13510 error ("both %<-mstack-protector-guard-offset%> and "
13511 "%<-mstack-protector-guard-reg%> must be used "
13512 "with %<-mstack-protector-guard=sysreg%>");
13515 if (opts->x_aarch64_stack_protector_guard_reg_str)
13517 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13518 error ("specify a system register with a small string length.");
13521 if (opts->x_aarch64_stack_protector_guard_offset_str)
13523 char *end;
13524 const char *str = aarch64_stack_protector_guard_offset_str;
13525 errno = 0;
13526 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13527 if (!*str || *end || errno)
13528 error ("%qs is not a valid offset in %qs", str,
13529 "-mstack-protector-guard-offset=");
13530 aarch64_stack_protector_guard_offset = offs;
13533 initialize_aarch64_code_model (opts);
13534 initialize_aarch64_tls_size (opts);
13536 int queue_depth = 0;
13537 switch (aarch64_tune_params.autoprefetcher_model)
13539 case tune_params::AUTOPREFETCHER_OFF:
13540 queue_depth = -1;
13541 break;
13542 case tune_params::AUTOPREFETCHER_WEAK:
13543 queue_depth = 0;
13544 break;
13545 case tune_params::AUTOPREFETCHER_STRONG:
13546 queue_depth = max_insn_queue_index + 1;
13547 break;
13548 default:
13549 gcc_unreachable ();
13552 /* We don't mind passing in global_options_set here as we don't use
13553 the *options_set structs anyway. */
13554 SET_OPTION_IF_UNSET (opts, &global_options_set,
13555 param_sched_autopref_queue_depth, queue_depth);
13557 /* Set up parameters to be used in prefetching algorithm. Do not
13558 override the defaults unless we are tuning for a core we have
13559 researched values for. */
13560 if (aarch64_tune_params.prefetch->num_slots > 0)
13561 SET_OPTION_IF_UNSET (opts, &global_options_set,
13562 param_simultaneous_prefetches,
13563 aarch64_tune_params.prefetch->num_slots);
13564 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13565 SET_OPTION_IF_UNSET (opts, &global_options_set,
13566 param_l1_cache_size,
13567 aarch64_tune_params.prefetch->l1_cache_size);
13568 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13569 SET_OPTION_IF_UNSET (opts, &global_options_set,
13570 param_l1_cache_line_size,
13571 aarch64_tune_params.prefetch->l1_cache_line_size);
13572 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13573 SET_OPTION_IF_UNSET (opts, &global_options_set,
13574 param_l2_cache_size,
13575 aarch64_tune_params.prefetch->l2_cache_size);
13576 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13577 SET_OPTION_IF_UNSET (opts, &global_options_set,
13578 param_prefetch_dynamic_strides, 0);
13579 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13580 SET_OPTION_IF_UNSET (opts, &global_options_set,
13581 param_prefetch_minimum_stride,
13582 aarch64_tune_params.prefetch->minimum_stride);
13584 /* Use the alternative scheduling-pressure algorithm by default. */
13585 SET_OPTION_IF_UNSET (opts, &global_options_set,
13586 param_sched_pressure_algorithm,
13587 SCHED_PRESSURE_MODEL);
13589 /* Validate the guard size. */
13590 int guard_size = param_stack_clash_protection_guard_size;
13592 if (guard_size != 12 && guard_size != 16)
13593 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13594 "size. Given value %d (%llu KB) is out of range",
13595 guard_size, (1ULL << guard_size) / 1024ULL);
13597 /* Enforce that interval is the same size as size so the mid-end does the
13598 right thing. */
13599 SET_OPTION_IF_UNSET (opts, &global_options_set,
13600 param_stack_clash_protection_probe_interval,
13601 guard_size);
13603 /* The maybe_set calls won't update the value if the user has explicitly set
13604 one. Which means we need to validate that probing interval and guard size
13605 are equal. */
13606 int probe_interval
13607 = param_stack_clash_protection_probe_interval;
13608 if (guard_size != probe_interval)
13609 error ("stack clash guard size %<%d%> must be equal to probing interval "
13610 "%<%d%>", guard_size, probe_interval);
13612 /* Enable sw prefetching at specified optimization level for
13613 CPUS that have prefetch. Lower optimization level threshold by 1
13614 when profiling is enabled. */
13615 if (opts->x_flag_prefetch_loop_arrays < 0
13616 && !opts->x_optimize_size
13617 && aarch64_tune_params.prefetch->default_opt_level >= 0
13618 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13619 opts->x_flag_prefetch_loop_arrays = 1;
13621 if (opts->x_aarch64_arch_string == NULL)
13622 opts->x_aarch64_arch_string = selected_arch->name;
13623 if (opts->x_aarch64_cpu_string == NULL)
13624 opts->x_aarch64_cpu_string = selected_cpu->name;
13625 if (opts->x_aarch64_tune_string == NULL)
13626 opts->x_aarch64_tune_string = selected_tune->name;
13628 aarch64_override_options_after_change_1 (opts);
13631 /* Print a hint with a suggestion for a core or architecture name that
13632 most closely resembles what the user passed in STR. ARCH is true if
13633 the user is asking for an architecture name. ARCH is false if the user
13634 is asking for a core name. */
13636 static void
13637 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13639 auto_vec<const char *> candidates;
13640 const struct processor *entry = arch ? all_architectures : all_cores;
13641 for (; entry->name != NULL; entry++)
13642 candidates.safe_push (entry->name);
13644 #ifdef HAVE_LOCAL_CPU_DETECT
13645 /* Add also "native" as possible value. */
13646 if (arch)
13647 candidates.safe_push ("native");
13648 #endif
13650 char *s;
13651 const char *hint = candidates_list_and_hint (str, s, candidates);
13652 if (hint)
13653 inform (input_location, "valid arguments are: %s;"
13654 " did you mean %qs?", s, hint);
13655 else
13656 inform (input_location, "valid arguments are: %s", s);
13658 XDELETEVEC (s);
13661 /* Print a hint with a suggestion for a core name that most closely resembles
13662 what the user passed in STR. */
13664 inline static void
13665 aarch64_print_hint_for_core (const char *str)
13667 aarch64_print_hint_for_core_or_arch (str, false);
13670 /* Print a hint with a suggestion for an architecture name that most closely
13671 resembles what the user passed in STR. */
13673 inline static void
13674 aarch64_print_hint_for_arch (const char *str)
13676 aarch64_print_hint_for_core_or_arch (str, true);
13680 /* Print a hint with a suggestion for an extension name
13681 that most closely resembles what the user passed in STR. */
13683 void
13684 aarch64_print_hint_for_extensions (const std::string &str)
13686 auto_vec<const char *> candidates;
13687 aarch64_get_all_extension_candidates (&candidates);
13688 char *s;
13689 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13690 if (hint)
13691 inform (input_location, "valid arguments are: %s;"
13692 " did you mean %qs?", s, hint);
13693 else
13694 inform (input_location, "valid arguments are: %s;", s);
13696 XDELETEVEC (s);
13699 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13700 specified in STR and throw errors if appropriate. Put the results if
13701 they are valid in RES and ISA_FLAGS. Return whether the option is
13702 valid. */
13704 static bool
13705 aarch64_validate_mcpu (const char *str, const struct processor **res,
13706 uint64_t *isa_flags)
13708 std::string invalid_extension;
13709 enum aarch64_parse_opt_result parse_res
13710 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13712 if (parse_res == AARCH64_PARSE_OK)
13713 return true;
13715 switch (parse_res)
13717 case AARCH64_PARSE_MISSING_ARG:
13718 error ("missing cpu name in %<-mcpu=%s%>", str);
13719 break;
13720 case AARCH64_PARSE_INVALID_ARG:
13721 error ("unknown value %qs for %<-mcpu%>", str);
13722 aarch64_print_hint_for_core (str);
13723 break;
13724 case AARCH64_PARSE_INVALID_FEATURE:
13725 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13726 invalid_extension.c_str (), str);
13727 aarch64_print_hint_for_extensions (invalid_extension);
13728 break;
13729 default:
13730 gcc_unreachable ();
13733 return false;
13736 /* Parses CONST_STR for branch protection features specified in
13737 aarch64_branch_protect_types, and set any global variables required. Returns
13738 the parsing result and assigns LAST_STR to the last processed token from
13739 CONST_STR so that it can be used for error reporting. */
13741 static enum
13742 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
13743 char** last_str)
13745 char *str_root = xstrdup (const_str);
13746 char* token_save = NULL;
13747 char *str = strtok_r (str_root, "+", &token_save);
13748 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13749 if (!str)
13750 res = AARCH64_PARSE_MISSING_ARG;
13751 else
13753 char *next_str = strtok_r (NULL, "+", &token_save);
13754 /* Reset the branch protection features to their defaults. */
13755 aarch64_handle_no_branch_protection (NULL, NULL);
13757 while (str && res == AARCH64_PARSE_OK)
13759 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13760 bool found = false;
13761 /* Search for this type. */
13762 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13764 if (strcmp (str, type->name) == 0)
13766 found = true;
13767 res = type->handler (str, next_str);
13768 str = next_str;
13769 next_str = strtok_r (NULL, "+", &token_save);
13771 else
13772 type++;
13774 if (found && res == AARCH64_PARSE_OK)
13776 bool found_subtype = true;
13777 /* Loop through each token until we find one that isn't a
13778 subtype. */
13779 while (found_subtype)
13781 found_subtype = false;
13782 const aarch64_branch_protect_type *subtype = type->subtypes;
13783 /* Search for the subtype. */
13784 while (str && subtype && subtype->name && !found_subtype
13785 && res == AARCH64_PARSE_OK)
13787 if (strcmp (str, subtype->name) == 0)
13789 found_subtype = true;
13790 res = subtype->handler (str, next_str);
13791 str = next_str;
13792 next_str = strtok_r (NULL, "+", &token_save);
13794 else
13795 subtype++;
13799 else if (!found)
13800 res = AARCH64_PARSE_INVALID_ARG;
13803 /* Copy the last processed token into the argument to pass it back.
13804 Used by option and attribute validation to print the offending token. */
13805 if (last_str)
13807 if (str) strcpy (*last_str, str);
13808 else *last_str = NULL;
13810 if (res == AARCH64_PARSE_OK)
13812 /* If needed, alloc the accepted string then copy in const_str.
13813 Used by override_option_after_change_1. */
13814 if (!accepted_branch_protection_string)
13815 accepted_branch_protection_string = (char *) xmalloc (
13816 BRANCH_PROTECT_STR_MAX
13817 + 1);
13818 strncpy (accepted_branch_protection_string, const_str,
13819 BRANCH_PROTECT_STR_MAX + 1);
13820 /* Forcibly null-terminate. */
13821 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13823 return res;
13826 static bool
13827 aarch64_validate_mbranch_protection (const char *const_str)
13829 char *str = (char *) xmalloc (strlen (const_str));
13830 enum aarch64_parse_opt_result res =
13831 aarch64_parse_branch_protection (const_str, &str);
13832 if (res == AARCH64_PARSE_INVALID_ARG)
13833 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13834 else if (res == AARCH64_PARSE_MISSING_ARG)
13835 error ("missing argument for %<-mbranch-protection=%>");
13836 free (str);
13837 return res == AARCH64_PARSE_OK;
13840 /* Validate a command-line -march option. Parse the arch and extensions
13841 (if any) specified in STR and throw errors if appropriate. Put the
13842 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13843 option is valid. */
13845 static bool
13846 aarch64_validate_march (const char *str, const struct processor **res,
13847 uint64_t *isa_flags)
13849 std::string invalid_extension;
13850 enum aarch64_parse_opt_result parse_res
13851 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13853 if (parse_res == AARCH64_PARSE_OK)
13854 return true;
13856 switch (parse_res)
13858 case AARCH64_PARSE_MISSING_ARG:
13859 error ("missing arch name in %<-march=%s%>", str);
13860 break;
13861 case AARCH64_PARSE_INVALID_ARG:
13862 error ("unknown value %qs for %<-march%>", str);
13863 aarch64_print_hint_for_arch (str);
13864 break;
13865 case AARCH64_PARSE_INVALID_FEATURE:
13866 error ("invalid feature modifier %qs in %<-march=%s%>",
13867 invalid_extension.c_str (), str);
13868 aarch64_print_hint_for_extensions (invalid_extension);
13869 break;
13870 default:
13871 gcc_unreachable ();
13874 return false;
13877 /* Validate a command-line -mtune option. Parse the cpu
13878 specified in STR and throw errors if appropriate. Put the
13879 result, if it is valid, in RES. Return whether the option is
13880 valid. */
13882 static bool
13883 aarch64_validate_mtune (const char *str, const struct processor **res)
13885 enum aarch64_parse_opt_result parse_res
13886 = aarch64_parse_tune (str, res);
13888 if (parse_res == AARCH64_PARSE_OK)
13889 return true;
13891 switch (parse_res)
13893 case AARCH64_PARSE_MISSING_ARG:
13894 error ("missing cpu name in %<-mtune=%s%>", str);
13895 break;
13896 case AARCH64_PARSE_INVALID_ARG:
13897 error ("unknown value %qs for %<-mtune%>", str);
13898 aarch64_print_hint_for_core (str);
13899 break;
13900 default:
13901 gcc_unreachable ();
13903 return false;
13906 /* Return the CPU corresponding to the enum CPU.
13907 If it doesn't specify a cpu, return the default. */
13909 static const struct processor *
13910 aarch64_get_tune_cpu (enum aarch64_processor cpu)
13912 if (cpu != aarch64_none)
13913 return &all_cores[cpu];
13915 /* The & 0x3f is to extract the bottom 6 bits that encode the
13916 default cpu as selected by the --with-cpu GCC configure option
13917 in config.gcc.
13918 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
13919 flags mechanism should be reworked to make it more sane. */
13920 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13923 /* Return the architecture corresponding to the enum ARCH.
13924 If it doesn't specify a valid architecture, return the default. */
13926 static const struct processor *
13927 aarch64_get_arch (enum aarch64_arch arch)
13929 if (arch != aarch64_no_arch)
13930 return &all_architectures[arch];
13932 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13934 return &all_architectures[cpu->arch];
13937 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
13939 static poly_uint16
13940 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
13942 /* 128-bit SVE and Advanced SIMD modes use different register layouts
13943 on big-endian targets, so we would need to forbid subregs that convert
13944 from one to the other. By default a reinterpret sequence would then
13945 involve a store to memory in one mode and a load back in the other.
13946 Even if we optimize that sequence using reverse instructions,
13947 it would still be a significant potential overhead.
13949 For now, it seems better to generate length-agnostic code for that
13950 case instead. */
13951 if (value == SVE_SCALABLE
13952 || (value == SVE_128 && BYTES_BIG_ENDIAN))
13953 return poly_uint16 (2, 2);
13954 else
13955 return (int) value / 64;
13958 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
13959 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
13960 tuning structs. In particular it must set selected_tune and
13961 aarch64_isa_flags that define the available ISA features and tuning
13962 decisions. It must also set selected_arch as this will be used to
13963 output the .arch asm tags for each function. */
13965 static void
13966 aarch64_override_options (void)
13968 uint64_t cpu_isa = 0;
13969 uint64_t arch_isa = 0;
13970 aarch64_isa_flags = 0;
13972 bool valid_cpu = true;
13973 bool valid_tune = true;
13974 bool valid_arch = true;
13976 selected_cpu = NULL;
13977 selected_arch = NULL;
13978 selected_tune = NULL;
13980 if (aarch64_branch_protection_string)
13981 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13983 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13984 If either of -march or -mtune is given, they override their
13985 respective component of -mcpu. */
13986 if (aarch64_cpu_string)
13987 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13988 &cpu_isa);
13990 if (aarch64_arch_string)
13991 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13992 &arch_isa);
13994 if (aarch64_tune_string)
13995 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13997 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13998 SUBTARGET_OVERRIDE_OPTIONS;
13999 #endif
14001 /* If the user did not specify a processor, choose the default
14002 one for them. This will be the CPU set during configuration using
14003 --with-cpu, otherwise it is "generic". */
14004 if (!selected_cpu)
14006 if (selected_arch)
14008 selected_cpu = &all_cores[selected_arch->ident];
14009 aarch64_isa_flags = arch_isa;
14010 explicit_arch = selected_arch->arch;
14012 else
14014 /* Get default configure-time CPU. */
14015 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14016 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14019 if (selected_tune)
14020 explicit_tune_core = selected_tune->ident;
14022 /* If both -mcpu and -march are specified check that they are architecturally
14023 compatible, warn if they're not and prefer the -march ISA flags. */
14024 else if (selected_arch)
14026 if (selected_arch->arch != selected_cpu->arch)
14028 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14029 all_architectures[selected_cpu->arch].name,
14030 selected_arch->name);
14032 aarch64_isa_flags = arch_isa;
14033 explicit_arch = selected_arch->arch;
14034 explicit_tune_core = selected_tune ? selected_tune->ident
14035 : selected_cpu->ident;
14037 else
14039 /* -mcpu but no -march. */
14040 aarch64_isa_flags = cpu_isa;
14041 explicit_tune_core = selected_tune ? selected_tune->ident
14042 : selected_cpu->ident;
14043 gcc_assert (selected_cpu);
14044 selected_arch = &all_architectures[selected_cpu->arch];
14045 explicit_arch = selected_arch->arch;
14048 /* Set the arch as well as we will need it when outputing
14049 the .arch directive in assembly. */
14050 if (!selected_arch)
14052 gcc_assert (selected_cpu);
14053 selected_arch = &all_architectures[selected_cpu->arch];
14056 if (!selected_tune)
14057 selected_tune = selected_cpu;
14059 if (aarch64_enable_bti == 2)
14061 #ifdef TARGET_ENABLE_BTI
14062 aarch64_enable_bti = 1;
14063 #else
14064 aarch64_enable_bti = 0;
14065 #endif
14068 /* Return address signing is currently not supported for ILP32 targets. For
14069 LP64 targets use the configured option in the absence of a command-line
14070 option for -mbranch-protection. */
14071 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14073 #ifdef TARGET_ENABLE_PAC_RET
14074 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14075 #else
14076 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14077 #endif
14080 #ifndef HAVE_AS_MABI_OPTION
14081 /* The compiler may have been configured with 2.23.* binutils, which does
14082 not have support for ILP32. */
14083 if (TARGET_ILP32)
14084 error ("assembler does not support %<-mabi=ilp32%>");
14085 #endif
14087 /* Convert -msve-vector-bits to a VG count. */
14088 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14090 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14091 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14093 /* Make sure we properly set up the explicit options. */
14094 if ((aarch64_cpu_string && valid_cpu)
14095 || (aarch64_tune_string && valid_tune))
14096 gcc_assert (explicit_tune_core != aarch64_none);
14098 if ((aarch64_cpu_string && valid_cpu)
14099 || (aarch64_arch_string && valid_arch))
14100 gcc_assert (explicit_arch != aarch64_no_arch);
14102 /* The pass to insert speculation tracking runs before
14103 shrink-wrapping and the latter does not know how to update the
14104 tracking status. So disable it in this case. */
14105 if (aarch64_track_speculation)
14106 flag_shrink_wrap = 0;
14108 aarch64_override_options_internal (&global_options);
14110 /* Save these options as the default ones in case we push and pop them later
14111 while processing functions with potential target attributes. */
14112 target_option_default_node = target_option_current_node
14113 = build_target_option_node (&global_options);
14116 /* Implement targetm.override_options_after_change. */
14118 static void
14119 aarch64_override_options_after_change (void)
14121 aarch64_override_options_after_change_1 (&global_options);
14124 static struct machine_function *
14125 aarch64_init_machine_status (void)
14127 struct machine_function *machine;
14128 machine = ggc_cleared_alloc<machine_function> ();
14129 return machine;
14132 void
14133 aarch64_init_expanders (void)
14135 init_machine_status = aarch64_init_machine_status;
14138 /* A checking mechanism for the implementation of the various code models. */
14139 static void
14140 initialize_aarch64_code_model (struct gcc_options *opts)
14142 if (opts->x_flag_pic)
14144 switch (opts->x_aarch64_cmodel_var)
14146 case AARCH64_CMODEL_TINY:
14147 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14148 break;
14149 case AARCH64_CMODEL_SMALL:
14150 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14151 aarch64_cmodel = (flag_pic == 2
14152 ? AARCH64_CMODEL_SMALL_PIC
14153 : AARCH64_CMODEL_SMALL_SPIC);
14154 #else
14155 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14156 #endif
14157 break;
14158 case AARCH64_CMODEL_LARGE:
14159 sorry ("code model %qs with %<-f%s%>", "large",
14160 opts->x_flag_pic > 1 ? "PIC" : "pic");
14161 break;
14162 default:
14163 gcc_unreachable ();
14166 else
14167 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14170 /* Implement TARGET_OPTION_SAVE. */
14172 static void
14173 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14175 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14176 ptr->x_aarch64_branch_protection_string
14177 = opts->x_aarch64_branch_protection_string;
14180 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14181 using the information saved in PTR. */
14183 static void
14184 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14186 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14187 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14188 opts->x_explicit_arch = ptr->x_explicit_arch;
14189 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14190 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14191 opts->x_aarch64_branch_protection_string
14192 = ptr->x_aarch64_branch_protection_string;
14193 if (opts->x_aarch64_branch_protection_string)
14195 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14196 NULL);
14199 aarch64_override_options_internal (opts);
14202 /* Implement TARGET_OPTION_PRINT. */
14204 static void
14205 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14207 const struct processor *cpu
14208 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14209 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14210 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14211 std::string extension
14212 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14214 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14215 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14216 arch->name, extension.c_str ());
14219 static GTY(()) tree aarch64_previous_fndecl;
14221 void
14222 aarch64_reset_previous_fndecl (void)
14224 aarch64_previous_fndecl = NULL;
14227 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14228 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14229 make sure optab availability predicates are recomputed when necessary. */
14231 void
14232 aarch64_save_restore_target_globals (tree new_tree)
14234 if (TREE_TARGET_GLOBALS (new_tree))
14235 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14236 else if (new_tree == target_option_default_node)
14237 restore_target_globals (&default_target_globals);
14238 else
14239 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14242 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14243 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14244 of the function, if such exists. This function may be called multiple
14245 times on a single function so use aarch64_previous_fndecl to avoid
14246 setting up identical state. */
14248 static void
14249 aarch64_set_current_function (tree fndecl)
14251 if (!fndecl || fndecl == aarch64_previous_fndecl)
14252 return;
14254 tree old_tree = (aarch64_previous_fndecl
14255 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14256 : NULL_TREE);
14258 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14260 /* If current function has no attributes but the previous one did,
14261 use the default node. */
14262 if (!new_tree && old_tree)
14263 new_tree = target_option_default_node;
14265 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14266 the default have been handled by aarch64_save_restore_target_globals from
14267 aarch64_pragma_target_parse. */
14268 if (old_tree == new_tree)
14269 return;
14271 aarch64_previous_fndecl = fndecl;
14273 /* First set the target options. */
14274 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14276 aarch64_save_restore_target_globals (new_tree);
14279 /* Enum describing the various ways we can handle attributes.
14280 In many cases we can reuse the generic option handling machinery. */
14282 enum aarch64_attr_opt_type
14284 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14285 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14286 aarch64_attr_enum, /* Attribute sets an enum variable. */
14287 aarch64_attr_custom /* Attribute requires a custom handling function. */
14290 /* All the information needed to handle a target attribute.
14291 NAME is the name of the attribute.
14292 ATTR_TYPE specifies the type of behavior of the attribute as described
14293 in the definition of enum aarch64_attr_opt_type.
14294 ALLOW_NEG is true if the attribute supports a "no-" form.
14295 HANDLER is the function that takes the attribute string as an argument
14296 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14297 OPT_NUM is the enum specifying the option that the attribute modifies.
14298 This is needed for attributes that mirror the behavior of a command-line
14299 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14300 aarch64_attr_enum. */
14302 struct aarch64_attribute_info
14304 const char *name;
14305 enum aarch64_attr_opt_type attr_type;
14306 bool allow_neg;
14307 bool (*handler) (const char *);
14308 enum opt_code opt_num;
14311 /* Handle the ARCH_STR argument to the arch= target attribute. */
14313 static bool
14314 aarch64_handle_attr_arch (const char *str)
14316 const struct processor *tmp_arch = NULL;
14317 std::string invalid_extension;
14318 enum aarch64_parse_opt_result parse_res
14319 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14321 if (parse_res == AARCH64_PARSE_OK)
14323 gcc_assert (tmp_arch);
14324 selected_arch = tmp_arch;
14325 explicit_arch = selected_arch->arch;
14326 return true;
14329 switch (parse_res)
14331 case AARCH64_PARSE_MISSING_ARG:
14332 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14333 break;
14334 case AARCH64_PARSE_INVALID_ARG:
14335 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14336 aarch64_print_hint_for_arch (str);
14337 break;
14338 case AARCH64_PARSE_INVALID_FEATURE:
14339 error ("invalid feature modifier %s of value (\"%s\") in "
14340 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14341 aarch64_print_hint_for_extensions (invalid_extension);
14342 break;
14343 default:
14344 gcc_unreachable ();
14347 return false;
14350 /* Handle the argument CPU_STR to the cpu= target attribute. */
14352 static bool
14353 aarch64_handle_attr_cpu (const char *str)
14355 const struct processor *tmp_cpu = NULL;
14356 std::string invalid_extension;
14357 enum aarch64_parse_opt_result parse_res
14358 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14360 if (parse_res == AARCH64_PARSE_OK)
14362 gcc_assert (tmp_cpu);
14363 selected_tune = tmp_cpu;
14364 explicit_tune_core = selected_tune->ident;
14366 selected_arch = &all_architectures[tmp_cpu->arch];
14367 explicit_arch = selected_arch->arch;
14368 return true;
14371 switch (parse_res)
14373 case AARCH64_PARSE_MISSING_ARG:
14374 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14375 break;
14376 case AARCH64_PARSE_INVALID_ARG:
14377 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14378 aarch64_print_hint_for_core (str);
14379 break;
14380 case AARCH64_PARSE_INVALID_FEATURE:
14381 error ("invalid feature modifier %s of value (\"%s\") in "
14382 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14383 aarch64_print_hint_for_extensions (invalid_extension);
14384 break;
14385 default:
14386 gcc_unreachable ();
14389 return false;
14392 /* Handle the argument STR to the branch-protection= attribute. */
14394 static bool
14395 aarch64_handle_attr_branch_protection (const char* str)
14397 char *err_str = (char *) xmalloc (strlen (str) + 1);
14398 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14399 &err_str);
14400 bool success = false;
14401 switch (res)
14403 case AARCH64_PARSE_MISSING_ARG:
14404 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14405 " attribute");
14406 break;
14407 case AARCH64_PARSE_INVALID_ARG:
14408 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14409 "=\")%> pragma or attribute", err_str);
14410 break;
14411 case AARCH64_PARSE_OK:
14412 success = true;
14413 /* Fall through. */
14414 case AARCH64_PARSE_INVALID_FEATURE:
14415 break;
14416 default:
14417 gcc_unreachable ();
14419 free (err_str);
14420 return success;
14423 /* Handle the argument STR to the tune= target attribute. */
14425 static bool
14426 aarch64_handle_attr_tune (const char *str)
14428 const struct processor *tmp_tune = NULL;
14429 enum aarch64_parse_opt_result parse_res
14430 = aarch64_parse_tune (str, &tmp_tune);
14432 if (parse_res == AARCH64_PARSE_OK)
14434 gcc_assert (tmp_tune);
14435 selected_tune = tmp_tune;
14436 explicit_tune_core = selected_tune->ident;
14437 return true;
14440 switch (parse_res)
14442 case AARCH64_PARSE_INVALID_ARG:
14443 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14444 aarch64_print_hint_for_core (str);
14445 break;
14446 default:
14447 gcc_unreachable ();
14450 return false;
14453 /* Parse an architecture extensions target attribute string specified in STR.
14454 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14455 if successful. Update aarch64_isa_flags to reflect the ISA features
14456 modified. */
14458 static bool
14459 aarch64_handle_attr_isa_flags (char *str)
14461 enum aarch64_parse_opt_result parse_res;
14462 uint64_t isa_flags = aarch64_isa_flags;
14464 /* We allow "+nothing" in the beginning to clear out all architectural
14465 features if the user wants to handpick specific features. */
14466 if (strncmp ("+nothing", str, 8) == 0)
14468 isa_flags = 0;
14469 str += 8;
14472 std::string invalid_extension;
14473 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14475 if (parse_res == AARCH64_PARSE_OK)
14477 aarch64_isa_flags = isa_flags;
14478 return true;
14481 switch (parse_res)
14483 case AARCH64_PARSE_MISSING_ARG:
14484 error ("missing value in %<target()%> pragma or attribute");
14485 break;
14487 case AARCH64_PARSE_INVALID_FEATURE:
14488 error ("invalid feature modifier %s of value (\"%s\") in "
14489 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14490 break;
14492 default:
14493 gcc_unreachable ();
14496 return false;
14499 /* The target attributes that we support. On top of these we also support just
14500 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14501 handled explicitly in aarch64_process_one_target_attr. */
14503 static const struct aarch64_attribute_info aarch64_attributes[] =
14505 { "general-regs-only", aarch64_attr_mask, false, NULL,
14506 OPT_mgeneral_regs_only },
14507 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14508 OPT_mfix_cortex_a53_835769 },
14509 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14510 OPT_mfix_cortex_a53_843419 },
14511 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14512 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14513 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14514 OPT_momit_leaf_frame_pointer },
14515 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14516 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14517 OPT_march_ },
14518 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14519 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14520 OPT_mtune_ },
14521 { "branch-protection", aarch64_attr_custom, false,
14522 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14523 { "sign-return-address", aarch64_attr_enum, false, NULL,
14524 OPT_msign_return_address_ },
14525 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14528 /* Parse ARG_STR which contains the definition of one target attribute.
14529 Show appropriate errors if any or return true if the attribute is valid. */
14531 static bool
14532 aarch64_process_one_target_attr (char *arg_str)
14534 bool invert = false;
14536 size_t len = strlen (arg_str);
14538 if (len == 0)
14540 error ("malformed %<target()%> pragma or attribute");
14541 return false;
14544 char *str_to_check = (char *) alloca (len + 1);
14545 strcpy (str_to_check, arg_str);
14547 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14548 It is easier to detect and handle it explicitly here rather than going
14549 through the machinery for the rest of the target attributes in this
14550 function. */
14551 if (*str_to_check == '+')
14552 return aarch64_handle_attr_isa_flags (str_to_check);
14554 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14556 invert = true;
14557 str_to_check += 3;
14559 char *arg = strchr (str_to_check, '=');
14561 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14562 and point ARG to "foo". */
14563 if (arg)
14565 *arg = '\0';
14566 arg++;
14568 const struct aarch64_attribute_info *p_attr;
14569 bool found = false;
14570 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14572 /* If the names don't match up, or the user has given an argument
14573 to an attribute that doesn't accept one, or didn't give an argument
14574 to an attribute that expects one, fail to match. */
14575 if (strcmp (str_to_check, p_attr->name) != 0)
14576 continue;
14578 found = true;
14579 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14580 || p_attr->attr_type == aarch64_attr_enum;
14582 if (attr_need_arg_p ^ (arg != NULL))
14584 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14585 return false;
14588 /* If the name matches but the attribute does not allow "no-" versions
14589 then we can't match. */
14590 if (invert && !p_attr->allow_neg)
14592 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14593 return false;
14596 switch (p_attr->attr_type)
14598 /* Has a custom handler registered.
14599 For example, cpu=, arch=, tune=. */
14600 case aarch64_attr_custom:
14601 gcc_assert (p_attr->handler);
14602 if (!p_attr->handler (arg))
14603 return false;
14604 break;
14606 /* Either set or unset a boolean option. */
14607 case aarch64_attr_bool:
14609 struct cl_decoded_option decoded;
14611 generate_option (p_attr->opt_num, NULL, !invert,
14612 CL_TARGET, &decoded);
14613 aarch64_handle_option (&global_options, &global_options_set,
14614 &decoded, input_location);
14615 break;
14617 /* Set or unset a bit in the target_flags. aarch64_handle_option
14618 should know what mask to apply given the option number. */
14619 case aarch64_attr_mask:
14621 struct cl_decoded_option decoded;
14622 /* We only need to specify the option number.
14623 aarch64_handle_option will know which mask to apply. */
14624 decoded.opt_index = p_attr->opt_num;
14625 decoded.value = !invert;
14626 aarch64_handle_option (&global_options, &global_options_set,
14627 &decoded, input_location);
14628 break;
14630 /* Use the option setting machinery to set an option to an enum. */
14631 case aarch64_attr_enum:
14633 gcc_assert (arg);
14634 bool valid;
14635 int value;
14636 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14637 &value, CL_TARGET);
14638 if (valid)
14640 set_option (&global_options, NULL, p_attr->opt_num, value,
14641 NULL, DK_UNSPECIFIED, input_location,
14642 global_dc);
14644 else
14646 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14648 break;
14650 default:
14651 gcc_unreachable ();
14655 /* If we reached here we either have found an attribute and validated
14656 it or didn't match any. If we matched an attribute but its arguments
14657 were malformed we will have returned false already. */
14658 return found;
14661 /* Count how many times the character C appears in
14662 NULL-terminated string STR. */
14664 static unsigned int
14665 num_occurences_in_str (char c, char *str)
14667 unsigned int res = 0;
14668 while (*str != '\0')
14670 if (*str == c)
14671 res++;
14673 str++;
14676 return res;
14679 /* Parse the tree in ARGS that contains the target attribute information
14680 and update the global target options space. */
14682 bool
14683 aarch64_process_target_attr (tree args)
14685 if (TREE_CODE (args) == TREE_LIST)
14689 tree head = TREE_VALUE (args);
14690 if (head)
14692 if (!aarch64_process_target_attr (head))
14693 return false;
14695 args = TREE_CHAIN (args);
14696 } while (args);
14698 return true;
14701 if (TREE_CODE (args) != STRING_CST)
14703 error ("attribute %<target%> argument not a string");
14704 return false;
14707 size_t len = strlen (TREE_STRING_POINTER (args));
14708 char *str_to_check = (char *) alloca (len + 1);
14709 strcpy (str_to_check, TREE_STRING_POINTER (args));
14711 if (len == 0)
14713 error ("malformed %<target()%> pragma or attribute");
14714 return false;
14717 /* Used to catch empty spaces between commas i.e.
14718 attribute ((target ("attr1,,attr2"))). */
14719 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14721 /* Handle multiple target attributes separated by ','. */
14722 char *token = strtok_r (str_to_check, ",", &str_to_check);
14724 unsigned int num_attrs = 0;
14725 while (token)
14727 num_attrs++;
14728 if (!aarch64_process_one_target_attr (token))
14730 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14731 return false;
14734 token = strtok_r (NULL, ",", &str_to_check);
14737 if (num_attrs != num_commas + 1)
14739 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14740 return false;
14743 return true;
14746 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14747 process attribute ((target ("..."))). */
14749 static bool
14750 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14752 struct cl_target_option cur_target;
14753 bool ret;
14754 tree old_optimize;
14755 tree new_target, new_optimize;
14756 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14758 /* If what we're processing is the current pragma string then the
14759 target option node is already stored in target_option_current_node
14760 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14761 having to re-parse the string. This is especially useful to keep
14762 arm_neon.h compile times down since that header contains a lot
14763 of intrinsics enclosed in pragmas. */
14764 if (!existing_target && args == current_target_pragma)
14766 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14767 return true;
14769 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14771 old_optimize = build_optimization_node (&global_options);
14772 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14774 /* If the function changed the optimization levels as well as setting
14775 target options, start with the optimizations specified. */
14776 if (func_optimize && func_optimize != old_optimize)
14777 cl_optimization_restore (&global_options,
14778 TREE_OPTIMIZATION (func_optimize));
14780 /* Save the current target options to restore at the end. */
14781 cl_target_option_save (&cur_target, &global_options);
14783 /* If fndecl already has some target attributes applied to it, unpack
14784 them so that we add this attribute on top of them, rather than
14785 overwriting them. */
14786 if (existing_target)
14788 struct cl_target_option *existing_options
14789 = TREE_TARGET_OPTION (existing_target);
14791 if (existing_options)
14792 cl_target_option_restore (&global_options, existing_options);
14794 else
14795 cl_target_option_restore (&global_options,
14796 TREE_TARGET_OPTION (target_option_current_node));
14798 ret = aarch64_process_target_attr (args);
14800 /* Set up any additional state. */
14801 if (ret)
14803 aarch64_override_options_internal (&global_options);
14804 /* Initialize SIMD builtins if we haven't already.
14805 Set current_target_pragma to NULL for the duration so that
14806 the builtin initialization code doesn't try to tag the functions
14807 being built with the attributes specified by any current pragma, thus
14808 going into an infinite recursion. */
14809 if (TARGET_SIMD)
14811 tree saved_current_target_pragma = current_target_pragma;
14812 current_target_pragma = NULL;
14813 aarch64_init_simd_builtins ();
14814 current_target_pragma = saved_current_target_pragma;
14816 new_target = build_target_option_node (&global_options);
14818 else
14819 new_target = NULL;
14821 new_optimize = build_optimization_node (&global_options);
14823 if (fndecl && ret)
14825 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14827 if (old_optimize != new_optimize)
14828 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14831 cl_target_option_restore (&global_options, &cur_target);
14833 if (old_optimize != new_optimize)
14834 cl_optimization_restore (&global_options,
14835 TREE_OPTIMIZATION (old_optimize));
14836 return ret;
14839 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14840 tri-bool options (yes, no, don't care) and the default value is
14841 DEF, determine whether to reject inlining. */
14843 static bool
14844 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14845 int dont_care, int def)
14847 /* If the callee doesn't care, always allow inlining. */
14848 if (callee == dont_care)
14849 return true;
14851 /* If the caller doesn't care, always allow inlining. */
14852 if (caller == dont_care)
14853 return true;
14855 /* Otherwise, allow inlining if either the callee and caller values
14856 agree, or if the callee is using the default value. */
14857 return (callee == caller || callee == def);
14860 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14861 to inline CALLEE into CALLER based on target-specific info.
14862 Make sure that the caller and callee have compatible architectural
14863 features. Then go through the other possible target attributes
14864 and see if they can block inlining. Try not to reject always_inline
14865 callees unless they are incompatible architecturally. */
14867 static bool
14868 aarch64_can_inline_p (tree caller, tree callee)
14870 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14871 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14873 struct cl_target_option *caller_opts
14874 = TREE_TARGET_OPTION (caller_tree ? caller_tree
14875 : target_option_default_node);
14877 struct cl_target_option *callee_opts
14878 = TREE_TARGET_OPTION (callee_tree ? callee_tree
14879 : target_option_default_node);
14881 /* Callee's ISA flags should be a subset of the caller's. */
14882 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
14883 != callee_opts->x_aarch64_isa_flags)
14884 return false;
14886 /* Allow non-strict aligned functions inlining into strict
14887 aligned ones. */
14888 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
14889 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
14890 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
14891 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
14892 return false;
14894 bool always_inline = lookup_attribute ("always_inline",
14895 DECL_ATTRIBUTES (callee));
14897 /* If the architectural features match up and the callee is always_inline
14898 then the other attributes don't matter. */
14899 if (always_inline)
14900 return true;
14902 if (caller_opts->x_aarch64_cmodel_var
14903 != callee_opts->x_aarch64_cmodel_var)
14904 return false;
14906 if (caller_opts->x_aarch64_tls_dialect
14907 != callee_opts->x_aarch64_tls_dialect)
14908 return false;
14910 /* Honour explicit requests to workaround errata. */
14911 if (!aarch64_tribools_ok_for_inlining_p (
14912 caller_opts->x_aarch64_fix_a53_err835769,
14913 callee_opts->x_aarch64_fix_a53_err835769,
14914 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
14915 return false;
14917 if (!aarch64_tribools_ok_for_inlining_p (
14918 caller_opts->x_aarch64_fix_a53_err843419,
14919 callee_opts->x_aarch64_fix_a53_err843419,
14920 2, TARGET_FIX_ERR_A53_843419))
14921 return false;
14923 /* If the user explicitly specified -momit-leaf-frame-pointer for the
14924 caller and calle and they don't match up, reject inlining. */
14925 if (!aarch64_tribools_ok_for_inlining_p (
14926 caller_opts->x_flag_omit_leaf_frame_pointer,
14927 callee_opts->x_flag_omit_leaf_frame_pointer,
14928 2, 1))
14929 return false;
14931 /* If the callee has specific tuning overrides, respect them. */
14932 if (callee_opts->x_aarch64_override_tune_string != NULL
14933 && caller_opts->x_aarch64_override_tune_string == NULL)
14934 return false;
14936 /* If the user specified tuning override strings for the
14937 caller and callee and they don't match up, reject inlining.
14938 We just do a string compare here, we don't analyze the meaning
14939 of the string, as it would be too costly for little gain. */
14940 if (callee_opts->x_aarch64_override_tune_string
14941 && caller_opts->x_aarch64_override_tune_string
14942 && (strcmp (callee_opts->x_aarch64_override_tune_string,
14943 caller_opts->x_aarch64_override_tune_string) != 0))
14944 return false;
14946 return true;
14949 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
14950 been already. */
14952 unsigned int
14953 aarch64_tlsdesc_abi_id ()
14955 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
14956 if (!tlsdesc_abi.initialized_p ())
14958 HARD_REG_SET full_reg_clobbers;
14959 CLEAR_HARD_REG_SET (full_reg_clobbers);
14960 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
14961 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
14962 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
14963 SET_HARD_REG_BIT (full_reg_clobbers, regno);
14964 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
14966 return tlsdesc_abi.id ();
14969 /* Return true if SYMBOL_REF X binds locally. */
14971 static bool
14972 aarch64_symbol_binds_local_p (const_rtx x)
14974 return (SYMBOL_REF_DECL (x)
14975 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
14976 : SYMBOL_REF_LOCAL_P (x));
14979 /* Return true if SYMBOL_REF X is thread local */
14980 static bool
14981 aarch64_tls_symbol_p (rtx x)
14983 if (! TARGET_HAVE_TLS)
14984 return false;
14986 if (GET_CODE (x) != SYMBOL_REF)
14987 return false;
14989 return SYMBOL_REF_TLS_MODEL (x) != 0;
14992 /* Classify a TLS symbol into one of the TLS kinds. */
14993 enum aarch64_symbol_type
14994 aarch64_classify_tls_symbol (rtx x)
14996 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14998 switch (tls_kind)
15000 case TLS_MODEL_GLOBAL_DYNAMIC:
15001 case TLS_MODEL_LOCAL_DYNAMIC:
15002 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15004 case TLS_MODEL_INITIAL_EXEC:
15005 switch (aarch64_cmodel)
15007 case AARCH64_CMODEL_TINY:
15008 case AARCH64_CMODEL_TINY_PIC:
15009 return SYMBOL_TINY_TLSIE;
15010 default:
15011 return SYMBOL_SMALL_TLSIE;
15014 case TLS_MODEL_LOCAL_EXEC:
15015 if (aarch64_tls_size == 12)
15016 return SYMBOL_TLSLE12;
15017 else if (aarch64_tls_size == 24)
15018 return SYMBOL_TLSLE24;
15019 else if (aarch64_tls_size == 32)
15020 return SYMBOL_TLSLE32;
15021 else if (aarch64_tls_size == 48)
15022 return SYMBOL_TLSLE48;
15023 else
15024 gcc_unreachable ();
15026 case TLS_MODEL_EMULATED:
15027 case TLS_MODEL_NONE:
15028 return SYMBOL_FORCE_TO_MEM;
15030 default:
15031 gcc_unreachable ();
15035 /* Return the correct method for accessing X + OFFSET, where X is either
15036 a SYMBOL_REF or LABEL_REF. */
15038 enum aarch64_symbol_type
15039 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15041 if (GET_CODE (x) == LABEL_REF)
15043 switch (aarch64_cmodel)
15045 case AARCH64_CMODEL_LARGE:
15046 return SYMBOL_FORCE_TO_MEM;
15048 case AARCH64_CMODEL_TINY_PIC:
15049 case AARCH64_CMODEL_TINY:
15050 return SYMBOL_TINY_ABSOLUTE;
15052 case AARCH64_CMODEL_SMALL_SPIC:
15053 case AARCH64_CMODEL_SMALL_PIC:
15054 case AARCH64_CMODEL_SMALL:
15055 return SYMBOL_SMALL_ABSOLUTE;
15057 default:
15058 gcc_unreachable ();
15062 if (GET_CODE (x) == SYMBOL_REF)
15064 if (aarch64_tls_symbol_p (x))
15065 return aarch64_classify_tls_symbol (x);
15067 switch (aarch64_cmodel)
15069 case AARCH64_CMODEL_TINY:
15070 /* When we retrieve symbol + offset address, we have to make sure
15071 the offset does not cause overflow of the final address. But
15072 we have no way of knowing the address of symbol at compile time
15073 so we can't accurately say if the distance between the PC and
15074 symbol + offset is outside the addressible range of +/-1MB in the
15075 TINY code model. So we limit the maximum offset to +/-64KB and
15076 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15077 If offset_within_block_p is true we allow larger offsets.
15078 Furthermore force to memory if the symbol is a weak reference to
15079 something that doesn't resolve to a symbol in this module. */
15081 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15082 return SYMBOL_FORCE_TO_MEM;
15083 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15084 || offset_within_block_p (x, offset)))
15085 return SYMBOL_FORCE_TO_MEM;
15087 return SYMBOL_TINY_ABSOLUTE;
15089 case AARCH64_CMODEL_SMALL:
15090 /* Same reasoning as the tiny code model, but the offset cap here is
15091 1MB, allowing +/-3.9GB for the offset to the symbol. */
15093 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15094 return SYMBOL_FORCE_TO_MEM;
15095 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15096 || offset_within_block_p (x, offset)))
15097 return SYMBOL_FORCE_TO_MEM;
15099 return SYMBOL_SMALL_ABSOLUTE;
15101 case AARCH64_CMODEL_TINY_PIC:
15102 if (!aarch64_symbol_binds_local_p (x))
15103 return SYMBOL_TINY_GOT;
15104 return SYMBOL_TINY_ABSOLUTE;
15106 case AARCH64_CMODEL_SMALL_SPIC:
15107 case AARCH64_CMODEL_SMALL_PIC:
15108 if (!aarch64_symbol_binds_local_p (x))
15109 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15110 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15111 return SYMBOL_SMALL_ABSOLUTE;
15113 case AARCH64_CMODEL_LARGE:
15114 /* This is alright even in PIC code as the constant
15115 pool reference is always PC relative and within
15116 the same translation unit. */
15117 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15118 return SYMBOL_SMALL_ABSOLUTE;
15119 else
15120 return SYMBOL_FORCE_TO_MEM;
15122 default:
15123 gcc_unreachable ();
15127 /* By default push everything into the constant pool. */
15128 return SYMBOL_FORCE_TO_MEM;
15131 bool
15132 aarch64_constant_address_p (rtx x)
15134 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15137 bool
15138 aarch64_legitimate_pic_operand_p (rtx x)
15140 if (GET_CODE (x) == SYMBOL_REF
15141 || (GET_CODE (x) == CONST
15142 && GET_CODE (XEXP (x, 0)) == PLUS
15143 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15144 return false;
15146 return true;
15149 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15150 that should be rematerialized rather than spilled. */
15152 static bool
15153 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15155 /* Support CSE and rematerialization of common constants. */
15156 if (CONST_INT_P (x)
15157 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15158 || GET_CODE (x) == CONST_VECTOR)
15159 return true;
15161 /* Do not allow vector struct mode constants for Advanced SIMD.
15162 We could support 0 and -1 easily, but they need support in
15163 aarch64-simd.md. */
15164 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15165 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15166 return false;
15168 /* Only accept variable-length vector constants if they can be
15169 handled directly.
15171 ??? It would be possible to handle rematerialization of other
15172 constants via secondary reloads. */
15173 if (vec_flags & VEC_ANY_SVE)
15174 return aarch64_simd_valid_immediate (x, NULL);
15176 if (GET_CODE (x) == HIGH)
15177 x = XEXP (x, 0);
15179 /* Accept polynomial constants that can be calculated by using the
15180 destination of a move as the sole temporary. Constants that
15181 require a second temporary cannot be rematerialized (they can't be
15182 forced to memory and also aren't legitimate constants). */
15183 poly_int64 offset;
15184 if (poly_int_rtx_p (x, &offset))
15185 return aarch64_offset_temporaries (false, offset) <= 1;
15187 /* If an offset is being added to something else, we need to allow the
15188 base to be moved into the destination register, meaning that there
15189 are no free temporaries for the offset. */
15190 x = strip_offset (x, &offset);
15191 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15192 return false;
15194 /* Do not allow const (plus (anchor_symbol, const_int)). */
15195 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15196 return false;
15198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15199 so spilling them is better than rematerialization. */
15200 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15201 return true;
15203 /* Label references are always constant. */
15204 if (GET_CODE (x) == LABEL_REF)
15205 return true;
15207 return false;
15211 aarch64_load_tp (rtx target)
15213 if (!target
15214 || GET_MODE (target) != Pmode
15215 || !register_operand (target, Pmode))
15216 target = gen_reg_rtx (Pmode);
15218 /* Can return in any reg. */
15219 emit_insn (gen_aarch64_load_tp_hard (target));
15220 return target;
15223 /* On AAPCS systems, this is the "struct __va_list". */
15224 static GTY(()) tree va_list_type;
15226 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15227 Return the type to use as __builtin_va_list.
15229 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15231 struct __va_list
15233 void *__stack;
15234 void *__gr_top;
15235 void *__vr_top;
15236 int __gr_offs;
15237 int __vr_offs;
15238 }; */
15240 static tree
15241 aarch64_build_builtin_va_list (void)
15243 tree va_list_name;
15244 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15246 /* Create the type. */
15247 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15248 /* Give it the required name. */
15249 va_list_name = build_decl (BUILTINS_LOCATION,
15250 TYPE_DECL,
15251 get_identifier ("__va_list"),
15252 va_list_type);
15253 DECL_ARTIFICIAL (va_list_name) = 1;
15254 TYPE_NAME (va_list_type) = va_list_name;
15255 TYPE_STUB_DECL (va_list_type) = va_list_name;
15257 /* Create the fields. */
15258 f_stack = build_decl (BUILTINS_LOCATION,
15259 FIELD_DECL, get_identifier ("__stack"),
15260 ptr_type_node);
15261 f_grtop = build_decl (BUILTINS_LOCATION,
15262 FIELD_DECL, get_identifier ("__gr_top"),
15263 ptr_type_node);
15264 f_vrtop = build_decl (BUILTINS_LOCATION,
15265 FIELD_DECL, get_identifier ("__vr_top"),
15266 ptr_type_node);
15267 f_groff = build_decl (BUILTINS_LOCATION,
15268 FIELD_DECL, get_identifier ("__gr_offs"),
15269 integer_type_node);
15270 f_vroff = build_decl (BUILTINS_LOCATION,
15271 FIELD_DECL, get_identifier ("__vr_offs"),
15272 integer_type_node);
15274 /* Tell tree-stdarg pass about our internal offset fields.
15275 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
15276 purpose to identify whether the code is updating va_list internal
15277 offset fields through irregular way. */
15278 va_list_gpr_counter_field = f_groff;
15279 va_list_fpr_counter_field = f_vroff;
15281 DECL_ARTIFICIAL (f_stack) = 1;
15282 DECL_ARTIFICIAL (f_grtop) = 1;
15283 DECL_ARTIFICIAL (f_vrtop) = 1;
15284 DECL_ARTIFICIAL (f_groff) = 1;
15285 DECL_ARTIFICIAL (f_vroff) = 1;
15287 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15288 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15289 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15290 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15291 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15293 TYPE_FIELDS (va_list_type) = f_stack;
15294 DECL_CHAIN (f_stack) = f_grtop;
15295 DECL_CHAIN (f_grtop) = f_vrtop;
15296 DECL_CHAIN (f_vrtop) = f_groff;
15297 DECL_CHAIN (f_groff) = f_vroff;
15299 /* Compute its layout. */
15300 layout_type (va_list_type);
15302 return va_list_type;
15305 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15306 static void
15307 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15309 const CUMULATIVE_ARGS *cum;
15310 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15311 tree stack, grtop, vrtop, groff, vroff;
15312 tree t;
15313 int gr_save_area_size = cfun->va_list_gpr_size;
15314 int vr_save_area_size = cfun->va_list_fpr_size;
15315 int vr_offset;
15317 cum = &crtl->args.info;
15318 if (cfun->va_list_gpr_size)
15319 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15320 cfun->va_list_gpr_size);
15321 if (cfun->va_list_fpr_size)
15322 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15323 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15325 if (!TARGET_FLOAT)
15327 gcc_assert (cum->aapcs_nvrn == 0);
15328 vr_save_area_size = 0;
15331 f_stack = TYPE_FIELDS (va_list_type_node);
15332 f_grtop = DECL_CHAIN (f_stack);
15333 f_vrtop = DECL_CHAIN (f_grtop);
15334 f_groff = DECL_CHAIN (f_vrtop);
15335 f_vroff = DECL_CHAIN (f_groff);
15337 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15338 NULL_TREE);
15339 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15340 NULL_TREE);
15341 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15342 NULL_TREE);
15343 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15344 NULL_TREE);
15345 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15346 NULL_TREE);
15348 /* Emit code to initialize STACK, which points to the next varargs stack
15349 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15350 by named arguments. STACK is 8-byte aligned. */
15351 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15352 if (cum->aapcs_stack_size > 0)
15353 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15354 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15355 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15357 /* Emit code to initialize GRTOP, the top of the GR save area.
15358 virtual_incoming_args_rtx should have been 16 byte aligned. */
15359 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15360 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15361 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15363 /* Emit code to initialize VRTOP, the top of the VR save area.
15364 This address is gr_save_area_bytes below GRTOP, rounded
15365 down to the next 16-byte boundary. */
15366 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15367 vr_offset = ROUND_UP (gr_save_area_size,
15368 STACK_BOUNDARY / BITS_PER_UNIT);
15370 if (vr_offset)
15371 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15372 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15373 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15375 /* Emit code to initialize GROFF, the offset from GRTOP of the
15376 next GPR argument. */
15377 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15378 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15379 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15381 /* Likewise emit code to initialize VROFF, the offset from FTOP
15382 of the next VR argument. */
15383 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15384 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15385 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15390 static tree
15391 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15392 gimple_seq *post_p ATTRIBUTE_UNUSED)
15394 tree addr;
15395 bool indirect_p;
15396 bool is_ha; /* is HFA or HVA. */
15397 bool dw_align; /* double-word align. */
15398 machine_mode ag_mode = VOIDmode;
15399 int nregs;
15400 machine_mode mode;
15402 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15403 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15404 HOST_WIDE_INT size, rsize, adjust, align;
15405 tree t, u, cond1, cond2;
15407 indirect_p = pass_va_arg_by_reference (type);
15408 if (indirect_p)
15409 type = build_pointer_type (type);
15411 mode = TYPE_MODE (type);
15413 f_stack = TYPE_FIELDS (va_list_type_node);
15414 f_grtop = DECL_CHAIN (f_stack);
15415 f_vrtop = DECL_CHAIN (f_grtop);
15416 f_groff = DECL_CHAIN (f_vrtop);
15417 f_vroff = DECL_CHAIN (f_groff);
15419 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15420 f_stack, NULL_TREE);
15421 size = int_size_in_bytes (type);
15423 bool abi_break;
15424 align
15425 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15427 dw_align = false;
15428 adjust = 0;
15429 if (aarch64_vfp_is_call_or_return_candidate (mode,
15430 type,
15431 &ag_mode,
15432 &nregs,
15433 &is_ha))
15435 /* No frontends can create types with variable-sized modes, so we
15436 shouldn't be asked to pass or return them. */
15437 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15439 /* TYPE passed in fp/simd registers. */
15440 if (!TARGET_FLOAT)
15441 aarch64_err_no_fpadvsimd (mode);
15443 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15444 unshare_expr (valist), f_vrtop, NULL_TREE);
15445 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15446 unshare_expr (valist), f_vroff, NULL_TREE);
15448 rsize = nregs * UNITS_PER_VREG;
15450 if (is_ha)
15452 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15453 adjust = UNITS_PER_VREG - ag_size;
15455 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15456 && size < UNITS_PER_VREG)
15458 adjust = UNITS_PER_VREG - size;
15461 else
15463 /* TYPE passed in general registers. */
15464 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15465 unshare_expr (valist), f_grtop, NULL_TREE);
15466 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15467 unshare_expr (valist), f_groff, NULL_TREE);
15468 rsize = ROUND_UP (size, UNITS_PER_WORD);
15469 nregs = rsize / UNITS_PER_WORD;
15471 if (align > 8)
15473 if (abi_break && warn_psabi)
15474 inform (input_location, "parameter passing for argument of type "
15475 "%qT changed in GCC 9.1", type);
15476 dw_align = true;
15479 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15480 && size < UNITS_PER_WORD)
15482 adjust = UNITS_PER_WORD - size;
15486 /* Get a local temporary for the field value. */
15487 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15489 /* Emit code to branch if off >= 0. */
15490 t = build2 (GE_EXPR, boolean_type_node, off,
15491 build_int_cst (TREE_TYPE (off), 0));
15492 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15494 if (dw_align)
15496 /* Emit: offs = (offs + 15) & -16. */
15497 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15498 build_int_cst (TREE_TYPE (off), 15));
15499 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15500 build_int_cst (TREE_TYPE (off), -16));
15501 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15503 else
15504 roundup = NULL;
15506 /* Update ap.__[g|v]r_offs */
15507 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15508 build_int_cst (TREE_TYPE (off), rsize));
15509 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15511 /* String up. */
15512 if (roundup)
15513 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15515 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15516 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15517 build_int_cst (TREE_TYPE (f_off), 0));
15518 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15520 /* String up: make sure the assignment happens before the use. */
15521 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15522 COND_EXPR_ELSE (cond1) = t;
15524 /* Prepare the trees handling the argument that is passed on the stack;
15525 the top level node will store in ON_STACK. */
15526 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15527 if (align > 8)
15529 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15530 t = fold_build_pointer_plus_hwi (arg, 15);
15531 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15532 build_int_cst (TREE_TYPE (t), -16));
15533 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15535 else
15536 roundup = NULL;
15537 /* Advance ap.__stack */
15538 t = fold_build_pointer_plus_hwi (arg, size + 7);
15539 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15540 build_int_cst (TREE_TYPE (t), -8));
15541 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15542 /* String up roundup and advance. */
15543 if (roundup)
15544 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15545 /* String up with arg */
15546 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15547 /* Big-endianness related address adjustment. */
15548 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15549 && size < UNITS_PER_WORD)
15551 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15552 size_int (UNITS_PER_WORD - size));
15553 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15556 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15557 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15559 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15560 t = off;
15561 if (adjust)
15562 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15563 build_int_cst (TREE_TYPE (off), adjust));
15565 t = fold_convert (sizetype, t);
15566 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15568 if (is_ha)
15570 /* type ha; // treat as "struct {ftype field[n];}"
15571 ... [computing offs]
15572 for (i = 0; i <nregs; ++i, offs += 16)
15573 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15574 return ha; */
15575 int i;
15576 tree tmp_ha, field_t, field_ptr_t;
15578 /* Declare a local variable. */
15579 tmp_ha = create_tmp_var_raw (type, "ha");
15580 gimple_add_tmp_var (tmp_ha);
15582 /* Establish the base type. */
15583 switch (ag_mode)
15585 case E_SFmode:
15586 field_t = float_type_node;
15587 field_ptr_t = float_ptr_type_node;
15588 break;
15589 case E_DFmode:
15590 field_t = double_type_node;
15591 field_ptr_t = double_ptr_type_node;
15592 break;
15593 case E_TFmode:
15594 field_t = long_double_type_node;
15595 field_ptr_t = long_double_ptr_type_node;
15596 break;
15597 case E_HFmode:
15598 field_t = aarch64_fp16_type_node;
15599 field_ptr_t = aarch64_fp16_ptr_type_node;
15600 break;
15601 case E_BFmode:
15602 field_t = aarch64_bf16_type_node;
15603 field_ptr_t = aarch64_bf16_ptr_type_node;
15604 break;
15605 case E_V2SImode:
15606 case E_V4SImode:
15608 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15609 field_t = build_vector_type_for_mode (innertype, ag_mode);
15610 field_ptr_t = build_pointer_type (field_t);
15612 break;
15613 default:
15614 gcc_assert (0);
15617 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
15618 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15619 addr = t;
15620 t = fold_convert (field_ptr_t, addr);
15621 t = build2 (MODIFY_EXPR, field_t,
15622 build1 (INDIRECT_REF, field_t, tmp_ha),
15623 build1 (INDIRECT_REF, field_t, t));
15625 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15626 for (i = 1; i < nregs; ++i)
15628 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15629 u = fold_convert (field_ptr_t, addr);
15630 u = build2 (MODIFY_EXPR, field_t,
15631 build2 (MEM_REF, field_t, tmp_ha,
15632 build_int_cst (field_ptr_t,
15633 (i *
15634 int_size_in_bytes (field_t)))),
15635 build1 (INDIRECT_REF, field_t, u));
15636 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15639 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15640 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15643 COND_EXPR_ELSE (cond2) = t;
15644 addr = fold_convert (build_pointer_type (type), cond1);
15645 addr = build_va_arg_indirect_ref (addr);
15647 if (indirect_p)
15648 addr = build_va_arg_indirect_ref (addr);
15650 return addr;
15653 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15655 static void
15656 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15657 const function_arg_info &arg,
15658 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15660 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15661 CUMULATIVE_ARGS local_cum;
15662 int gr_saved = cfun->va_list_gpr_size;
15663 int vr_saved = cfun->va_list_fpr_size;
15665 /* The caller has advanced CUM up to, but not beyond, the last named
15666 argument. Advance a local copy of CUM past the last "real" named
15667 argument, to find out how many registers are left over. */
15668 local_cum = *cum;
15669 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
15671 /* Found out how many registers we need to save.
15672 Honor tree-stdvar analysis results. */
15673 if (cfun->va_list_gpr_size)
15674 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15675 cfun->va_list_gpr_size / UNITS_PER_WORD);
15676 if (cfun->va_list_fpr_size)
15677 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15678 cfun->va_list_fpr_size / UNITS_PER_VREG);
15680 if (!TARGET_FLOAT)
15682 gcc_assert (local_cum.aapcs_nvrn == 0);
15683 vr_saved = 0;
15686 if (!no_rtl)
15688 if (gr_saved > 0)
15690 rtx ptr, mem;
15692 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15693 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15694 - gr_saved * UNITS_PER_WORD);
15695 mem = gen_frame_mem (BLKmode, ptr);
15696 set_mem_alias_set (mem, get_varargs_alias_set ());
15698 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15699 mem, gr_saved);
15701 if (vr_saved > 0)
15703 /* We can't use move_block_from_reg, because it will use
15704 the wrong mode, storing D regs only. */
15705 machine_mode mode = TImode;
15706 int off, i, vr_start;
15708 /* Set OFF to the offset from virtual_incoming_args_rtx of
15709 the first vector register. The VR save area lies below
15710 the GR one, and is aligned to 16 bytes. */
15711 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15712 STACK_BOUNDARY / BITS_PER_UNIT);
15713 off -= vr_saved * UNITS_PER_VREG;
15715 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15716 for (i = 0; i < vr_saved; ++i)
15718 rtx ptr, mem;
15720 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15721 mem = gen_frame_mem (mode, ptr);
15722 set_mem_alias_set (mem, get_varargs_alias_set ());
15723 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15724 off += UNITS_PER_VREG;
15729 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15730 any complication of having crtl->args.pretend_args_size changed. */
15731 cfun->machine->frame.saved_varargs_size
15732 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15733 STACK_BOUNDARY / BITS_PER_UNIT)
15734 + vr_saved * UNITS_PER_VREG);
15737 static void
15738 aarch64_conditional_register_usage (void)
15740 int i;
15741 if (!TARGET_FLOAT)
15743 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15745 fixed_regs[i] = 1;
15746 call_used_regs[i] = 1;
15749 if (!TARGET_SVE)
15750 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15752 fixed_regs[i] = 1;
15753 call_used_regs[i] = 1;
15756 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15757 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15758 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15760 /* When tracking speculation, we need a couple of call-clobbered registers
15761 to track the speculation state. It would be nice to just use
15762 IP0 and IP1, but currently there are numerous places that just
15763 assume these registers are free for other uses (eg pointer
15764 authentication). */
15765 if (aarch64_track_speculation)
15767 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15768 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15769 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15770 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15774 /* Walk down the type tree of TYPE counting consecutive base elements.
15775 If *MODEP is VOIDmode, then set it to the first valid floating point
15776 type. If a non-floating point type is found, or if a floating point
15777 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15778 otherwise return the count in the sub-tree. */
15779 static int
15780 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15782 machine_mode mode;
15783 HOST_WIDE_INT size;
15785 /* SVE types (and types containing SVE types) must be handled
15786 before calling this function. */
15787 gcc_assert (!aarch64_sve::builtin_type_p (type));
15789 switch (TREE_CODE (type))
15791 case REAL_TYPE:
15792 mode = TYPE_MODE (type);
15793 if (mode != DFmode && mode != SFmode
15794 && mode != TFmode && mode != HFmode)
15795 return -1;
15797 if (*modep == VOIDmode)
15798 *modep = mode;
15800 if (*modep == mode)
15801 return 1;
15803 break;
15805 case COMPLEX_TYPE:
15806 mode = TYPE_MODE (TREE_TYPE (type));
15807 if (mode != DFmode && mode != SFmode
15808 && mode != TFmode && mode != HFmode)
15809 return -1;
15811 if (*modep == VOIDmode)
15812 *modep = mode;
15814 if (*modep == mode)
15815 return 2;
15817 break;
15819 case VECTOR_TYPE:
15820 /* Use V2SImode and V4SImode as representatives of all 64-bit
15821 and 128-bit vector types. */
15822 size = int_size_in_bytes (type);
15823 switch (size)
15825 case 8:
15826 mode = V2SImode;
15827 break;
15828 case 16:
15829 mode = V4SImode;
15830 break;
15831 default:
15832 return -1;
15835 if (*modep == VOIDmode)
15836 *modep = mode;
15838 /* Vector modes are considered to be opaque: two vectors are
15839 equivalent for the purposes of being homogeneous aggregates
15840 if they are the same size. */
15841 if (*modep == mode)
15842 return 1;
15844 break;
15846 case ARRAY_TYPE:
15848 int count;
15849 tree index = TYPE_DOMAIN (type);
15851 /* Can't handle incomplete types nor sizes that are not
15852 fixed. */
15853 if (!COMPLETE_TYPE_P (type)
15854 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15855 return -1;
15857 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15858 if (count == -1
15859 || !index
15860 || !TYPE_MAX_VALUE (index)
15861 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15862 || !TYPE_MIN_VALUE (index)
15863 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15864 || count < 0)
15865 return -1;
15867 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15868 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15870 /* There must be no padding. */
15871 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15872 count * GET_MODE_BITSIZE (*modep)))
15873 return -1;
15875 return count;
15878 case RECORD_TYPE:
15880 int count = 0;
15881 int sub_count;
15882 tree field;
15884 /* Can't handle incomplete types nor sizes that are not
15885 fixed. */
15886 if (!COMPLETE_TYPE_P (type)
15887 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15888 return -1;
15890 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15892 if (TREE_CODE (field) != FIELD_DECL)
15893 continue;
15895 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15896 if (sub_count < 0)
15897 return -1;
15898 count += sub_count;
15901 /* There must be no padding. */
15902 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15903 count * GET_MODE_BITSIZE (*modep)))
15904 return -1;
15906 return count;
15909 case UNION_TYPE:
15910 case QUAL_UNION_TYPE:
15912 /* These aren't very interesting except in a degenerate case. */
15913 int count = 0;
15914 int sub_count;
15915 tree field;
15917 /* Can't handle incomplete types nor sizes that are not
15918 fixed. */
15919 if (!COMPLETE_TYPE_P (type)
15920 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15921 return -1;
15923 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15925 if (TREE_CODE (field) != FIELD_DECL)
15926 continue;
15928 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15929 if (sub_count < 0)
15930 return -1;
15931 count = count > sub_count ? count : sub_count;
15934 /* There must be no padding. */
15935 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15936 count * GET_MODE_BITSIZE (*modep)))
15937 return -1;
15939 return count;
15942 default:
15943 break;
15946 return -1;
15949 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
15950 type as described in AAPCS64 \S 4.1.2.
15952 See the comment above aarch64_composite_type_p for the notes on MODE. */
15954 static bool
15955 aarch64_short_vector_p (const_tree type,
15956 machine_mode mode)
15958 poly_int64 size = -1;
15960 if (type && aarch64_sve::builtin_type_p (type))
15961 return false;
15963 if (type && TREE_CODE (type) == VECTOR_TYPE)
15964 size = int_size_in_bytes (type);
15965 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
15966 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
15967 size = GET_MODE_SIZE (mode);
15969 return known_eq (size, 8) || known_eq (size, 16);
15972 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
15973 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
15974 array types. The C99 floating-point complex types are also considered
15975 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
15976 types, which are GCC extensions and out of the scope of AAPCS64, are
15977 treated as composite types here as well.
15979 Note that MODE itself is not sufficient in determining whether a type
15980 is such a composite type or not. This is because
15981 stor-layout.c:compute_record_mode may have already changed the MODE
15982 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
15983 structure with only one field may have its MODE set to the mode of the
15984 field. Also an integer mode whose size matches the size of the
15985 RECORD_TYPE type may be used to substitute the original mode
15986 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
15987 solely relied on. */
15989 static bool
15990 aarch64_composite_type_p (const_tree type,
15991 machine_mode mode)
15993 if (aarch64_short_vector_p (type, mode))
15994 return false;
15996 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
15997 return true;
15999 if (mode == BLKmode
16000 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16001 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16002 return true;
16004 return false;
16007 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16008 shall be passed or returned in simd/fp register(s) (providing these
16009 parameter passing registers are available).
16011 Upon successful return, *COUNT returns the number of needed registers,
16012 *BASE_MODE returns the mode of the individual register and when IS_HAF
16013 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16014 floating-point aggregate or a homogeneous short-vector aggregate. */
16016 static bool
16017 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16018 const_tree type,
16019 machine_mode *base_mode,
16020 int *count,
16021 bool *is_ha)
16023 if (is_ha != NULL) *is_ha = false;
16025 if (type && aarch64_sve::builtin_type_p (type))
16026 return false;
16028 machine_mode new_mode = VOIDmode;
16029 bool composite_p = aarch64_composite_type_p (type, mode);
16031 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16032 || aarch64_short_vector_p (type, mode))
16034 *count = 1;
16035 new_mode = mode;
16037 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16039 if (is_ha != NULL) *is_ha = true;
16040 *count = 2;
16041 new_mode = GET_MODE_INNER (mode);
16043 else if (type && composite_p)
16045 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16047 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16049 if (is_ha != NULL) *is_ha = true;
16050 *count = ag_count;
16052 else
16053 return false;
16055 else
16056 return false;
16058 *base_mode = new_mode;
16059 return true;
16062 /* Implement TARGET_STRUCT_VALUE_RTX. */
16064 static rtx
16065 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16066 int incoming ATTRIBUTE_UNUSED)
16068 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16071 /* Implements target hook vector_mode_supported_p. */
16072 static bool
16073 aarch64_vector_mode_supported_p (machine_mode mode)
16075 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16076 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16079 /* Return the full-width SVE vector mode for element mode MODE, if one
16080 exists. */
16081 opt_machine_mode
16082 aarch64_full_sve_mode (scalar_mode mode)
16084 switch (mode)
16086 case E_DFmode:
16087 return VNx2DFmode;
16088 case E_SFmode:
16089 return VNx4SFmode;
16090 case E_HFmode:
16091 return VNx8HFmode;
16092 case E_DImode:
16093 return VNx2DImode;
16094 case E_SImode:
16095 return VNx4SImode;
16096 case E_HImode:
16097 return VNx8HImode;
16098 case E_QImode:
16099 return VNx16QImode;
16100 default:
16101 return opt_machine_mode ();
16105 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16106 if it exists. */
16107 opt_machine_mode
16108 aarch64_vq_mode (scalar_mode mode)
16110 switch (mode)
16112 case E_DFmode:
16113 return V2DFmode;
16114 case E_SFmode:
16115 return V4SFmode;
16116 case E_HFmode:
16117 return V8HFmode;
16118 case E_BFmode:
16119 return V8BFmode;
16120 case E_SImode:
16121 return V4SImode;
16122 case E_HImode:
16123 return V8HImode;
16124 case E_QImode:
16125 return V16QImode;
16126 case E_DImode:
16127 return V2DImode;
16128 default:
16129 return opt_machine_mode ();
16133 /* Return appropriate SIMD container
16134 for MODE within a vector of WIDTH bits. */
16135 static machine_mode
16136 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16138 if (TARGET_SVE
16139 && maybe_ne (width, 128)
16140 && known_eq (width, BITS_PER_SVE_VECTOR))
16141 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16143 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16144 if (TARGET_SIMD)
16146 if (known_eq (width, 128))
16147 return aarch64_vq_mode (mode).else_mode (word_mode);
16148 else
16149 switch (mode)
16151 case E_SFmode:
16152 return V2SFmode;
16153 case E_HFmode:
16154 return V4HFmode;
16155 case E_BFmode:
16156 return V4BFmode;
16157 case E_SImode:
16158 return V2SImode;
16159 case E_HImode:
16160 return V4HImode;
16161 case E_QImode:
16162 return V8QImode;
16163 default:
16164 break;
16167 return word_mode;
16170 /* Return 128-bit container as the preferred SIMD mode for MODE. */
16171 static machine_mode
16172 aarch64_preferred_simd_mode (scalar_mode mode)
16174 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16175 return aarch64_simd_container_mode (mode, bits);
16178 /* Return a list of possible vector sizes for the vectorizer
16179 to iterate over. */
16180 static unsigned int
16181 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16183 static const machine_mode sve_modes[] = {
16184 /* Try using full vectors for all element types. */
16185 VNx16QImode,
16187 /* Try using 16-bit containers for 8-bit elements and full vectors
16188 for wider elements. */
16189 VNx8QImode,
16191 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16192 full vectors for wider elements. */
16193 VNx4QImode,
16195 /* Try using 64-bit containers for all element types. */
16196 VNx2QImode
16199 static const machine_mode advsimd_modes[] = {
16200 /* Try using 128-bit vectors for all element types. */
16201 V16QImode,
16203 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16204 for wider elements. */
16205 V8QImode,
16207 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16208 for wider elements.
16210 TODO: We could support a limited form of V4QImode too, so that
16211 we use 32-bit vectors for 8-bit elements. */
16212 V4HImode,
16214 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16215 for 64-bit elements.
16217 TODO: We could similarly support limited forms of V2QImode and V2HImode
16218 for this case. */
16219 V2SImode
16222 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16223 This is because:
16225 - If we can't use N-byte Advanced SIMD vectors then the placement
16226 doesn't matter; we'll just continue as though the Advanced SIMD
16227 entry didn't exist.
16229 - If an SVE main loop with N bytes ends up being cheaper than an
16230 Advanced SIMD main loop with N bytes then by default we'll replace
16231 the Advanced SIMD version with the SVE one.
16233 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16234 than an SVE main loop with N bytes then by default we'll try to
16235 use the SVE loop to vectorize the epilogue instead. */
16236 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16237 unsigned int advsimd_i = 0;
16238 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16240 if (sve_i < ARRAY_SIZE (sve_modes)
16241 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16242 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16243 modes->safe_push (sve_modes[sve_i++]);
16244 else
16245 modes->safe_push (advsimd_modes[advsimd_i++]);
16247 while (sve_i < ARRAY_SIZE (sve_modes))
16248 modes->safe_push (sve_modes[sve_i++]);
16250 unsigned int flags = 0;
16251 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16252 can compare SVE against Advanced SIMD and so that we can compare
16253 multiple SVE vectorization approaches against each other. There's
16254 not really any point doing this for Advanced SIMD only, since the
16255 first mode that works should always be the best. */
16256 if (TARGET_SVE && aarch64_sve_compare_costs)
16257 flags |= VECT_COMPARE_COSTS;
16258 return flags;
16261 /* Implement TARGET_MANGLE_TYPE. */
16263 static const char *
16264 aarch64_mangle_type (const_tree type)
16266 /* The AArch64 ABI documents say that "__va_list" has to be
16267 mangled as if it is in the "std" namespace. */
16268 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16269 return "St9__va_list";
16271 /* Half-precision floating point types. */
16272 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16274 if (TYPE_MODE (type) == BFmode)
16275 return "u6__bf16";
16276 else
16277 return "Dh";
16280 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16281 builtin types. */
16282 if (TYPE_NAME (type) != NULL)
16284 const char *res;
16285 if ((res = aarch64_general_mangle_builtin_type (type))
16286 || (res = aarch64_sve::mangle_builtin_type (type)))
16287 return res;
16290 /* Use the default mangling. */
16291 return NULL;
16294 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16296 static bool
16297 aarch64_verify_type_context (location_t loc, type_context_kind context,
16298 const_tree type, bool silent_p)
16300 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16303 /* Find the first rtx_insn before insn that will generate an assembly
16304 instruction. */
16306 static rtx_insn *
16307 aarch64_prev_real_insn (rtx_insn *insn)
16309 if (!insn)
16310 return NULL;
16314 insn = prev_real_insn (insn);
16316 while (insn && recog_memoized (insn) < 0);
16318 return insn;
16321 static bool
16322 is_madd_op (enum attr_type t1)
16324 unsigned int i;
16325 /* A number of these may be AArch32 only. */
16326 enum attr_type mlatypes[] = {
16327 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16328 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16329 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16332 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16334 if (t1 == mlatypes[i])
16335 return true;
16338 return false;
16341 /* Check if there is a register dependency between a load and the insn
16342 for which we hold recog_data. */
16344 static bool
16345 dep_between_memop_and_curr (rtx memop)
16347 rtx load_reg;
16348 int opno;
16350 gcc_assert (GET_CODE (memop) == SET);
16352 if (!REG_P (SET_DEST (memop)))
16353 return false;
16355 load_reg = SET_DEST (memop);
16356 for (opno = 1; opno < recog_data.n_operands; opno++)
16358 rtx operand = recog_data.operand[opno];
16359 if (REG_P (operand)
16360 && reg_overlap_mentioned_p (load_reg, operand))
16361 return true;
16364 return false;
16368 /* When working around the Cortex-A53 erratum 835769,
16369 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16370 instruction and has a preceding memory instruction such that a NOP
16371 should be inserted between them. */
16373 bool
16374 aarch64_madd_needs_nop (rtx_insn* insn)
16376 enum attr_type attr_type;
16377 rtx_insn *prev;
16378 rtx body;
16380 if (!TARGET_FIX_ERR_A53_835769)
16381 return false;
16383 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16384 return false;
16386 attr_type = get_attr_type (insn);
16387 if (!is_madd_op (attr_type))
16388 return false;
16390 prev = aarch64_prev_real_insn (insn);
16391 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16392 Restore recog state to INSN to avoid state corruption. */
16393 extract_constrain_insn_cached (insn);
16395 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16396 return false;
16398 body = single_set (prev);
16400 /* If the previous insn is a memory op and there is no dependency between
16401 it and the DImode madd, emit a NOP between them. If body is NULL then we
16402 have a complex memory operation, probably a load/store pair.
16403 Be conservative for now and emit a NOP. */
16404 if (GET_MODE (recog_data.operand[0]) == DImode
16405 && (!body || !dep_between_memop_and_curr (body)))
16406 return true;
16408 return false;
16413 /* Implement FINAL_PRESCAN_INSN. */
16415 void
16416 aarch64_final_prescan_insn (rtx_insn *insn)
16418 if (aarch64_madd_needs_nop (insn))
16419 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16423 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16424 instruction. */
16426 bool
16427 aarch64_sve_index_immediate_p (rtx base_or_step)
16429 return (CONST_INT_P (base_or_step)
16430 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16433 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
16434 when applied to mode MODE. Negate X first if NEGATE_P is true. */
16436 bool
16437 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
16439 rtx elt = unwrap_const_vec_duplicate (x);
16440 if (!CONST_INT_P (elt))
16441 return false;
16443 HOST_WIDE_INT val = INTVAL (elt);
16444 if (negate_p)
16445 val = -val;
16446 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
16448 if (val & 0xff)
16449 return IN_RANGE (val, 0, 0xff);
16450 return IN_RANGE (val, 0, 0xff00);
16453 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16454 instructions when applied to mode MODE. Negate X first if NEGATE_P
16455 is true. */
16457 bool
16458 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
16460 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
16461 return false;
16463 /* After the optional negation, the immediate must be nonnegative.
16464 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16465 instead of SQADD Zn.B, Zn.B, #129. */
16466 rtx elt = unwrap_const_vec_duplicate (x);
16467 return negate_p == (INTVAL (elt) < 0);
16470 /* Return true if X is a valid immediate operand for an SVE logical
16471 instruction such as AND. */
16473 bool
16474 aarch64_sve_bitmask_immediate_p (rtx x)
16476 rtx elt;
16478 return (const_vec_duplicate_p (x, &elt)
16479 && CONST_INT_P (elt)
16480 && aarch64_bitmask_imm (INTVAL (elt),
16481 GET_MODE_INNER (GET_MODE (x))));
16484 /* Return true if X is a valid immediate for the SVE DUP and CPY
16485 instructions. */
16487 bool
16488 aarch64_sve_dup_immediate_p (rtx x)
16490 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16491 if (!CONST_INT_P (x))
16492 return false;
16494 HOST_WIDE_INT val = INTVAL (x);
16495 if (val & 0xff)
16496 return IN_RANGE (val, -0x80, 0x7f);
16497 return IN_RANGE (val, -0x8000, 0x7f00);
16500 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16501 SIGNED_P says whether the operand is signed rather than unsigned. */
16503 bool
16504 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16506 x = unwrap_const_vec_duplicate (x);
16507 return (CONST_INT_P (x)
16508 && (signed_p
16509 ? IN_RANGE (INTVAL (x), -16, 15)
16510 : IN_RANGE (INTVAL (x), 0, 127)));
16513 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16514 instruction. Negate X first if NEGATE_P is true. */
16516 bool
16517 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16519 rtx elt;
16520 REAL_VALUE_TYPE r;
16522 if (!const_vec_duplicate_p (x, &elt)
16523 || GET_CODE (elt) != CONST_DOUBLE)
16524 return false;
16526 r = *CONST_DOUBLE_REAL_VALUE (elt);
16528 if (negate_p)
16529 r = real_value_negate (&r);
16531 if (real_equal (&r, &dconst1))
16532 return true;
16533 if (real_equal (&r, &dconsthalf))
16534 return true;
16535 return false;
16538 /* Return true if X is a valid immediate operand for an SVE FMUL
16539 instruction. */
16541 bool
16542 aarch64_sve_float_mul_immediate_p (rtx x)
16544 rtx elt;
16546 return (const_vec_duplicate_p (x, &elt)
16547 && GET_CODE (elt) == CONST_DOUBLE
16548 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16549 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16552 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16553 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16554 is nonnull, use it to describe valid immediates. */
16555 static bool
16556 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16557 simd_immediate_info *info,
16558 enum simd_immediate_check which,
16559 simd_immediate_info::insn_type insn)
16561 /* Try a 4-byte immediate with LSL. */
16562 for (unsigned int shift = 0; shift < 32; shift += 8)
16563 if ((val32 & (0xff << shift)) == val32)
16565 if (info)
16566 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16567 simd_immediate_info::LSL, shift);
16568 return true;
16571 /* Try a 2-byte immediate with LSL. */
16572 unsigned int imm16 = val32 & 0xffff;
16573 if (imm16 == (val32 >> 16))
16574 for (unsigned int shift = 0; shift < 16; shift += 8)
16575 if ((imm16 & (0xff << shift)) == imm16)
16577 if (info)
16578 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16579 simd_immediate_info::LSL, shift);
16580 return true;
16583 /* Try a 4-byte immediate with MSL, except for cases that MVN
16584 can handle. */
16585 if (which == AARCH64_CHECK_MOV)
16586 for (unsigned int shift = 8; shift < 24; shift += 8)
16588 unsigned int low = (1 << shift) - 1;
16589 if (((val32 & (0xff << shift)) | low) == val32)
16591 if (info)
16592 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16593 simd_immediate_info::MSL, shift);
16594 return true;
16598 return false;
16601 /* Return true if replicating VAL64 is a valid immediate for the
16602 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16603 use it to describe valid immediates. */
16604 static bool
16605 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16606 simd_immediate_info *info,
16607 enum simd_immediate_check which)
16609 unsigned int val32 = val64 & 0xffffffff;
16610 unsigned int val16 = val64 & 0xffff;
16611 unsigned int val8 = val64 & 0xff;
16613 if (val32 == (val64 >> 32))
16615 if ((which & AARCH64_CHECK_ORR) != 0
16616 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16617 simd_immediate_info::MOV))
16618 return true;
16620 if ((which & AARCH64_CHECK_BIC) != 0
16621 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16622 simd_immediate_info::MVN))
16623 return true;
16625 /* Try using a replicated byte. */
16626 if (which == AARCH64_CHECK_MOV
16627 && val16 == (val32 >> 16)
16628 && val8 == (val16 >> 8))
16630 if (info)
16631 *info = simd_immediate_info (QImode, val8);
16632 return true;
16636 /* Try using a bit-to-bytemask. */
16637 if (which == AARCH64_CHECK_MOV)
16639 unsigned int i;
16640 for (i = 0; i < 64; i += 8)
16642 unsigned char byte = (val64 >> i) & 0xff;
16643 if (byte != 0 && byte != 0xff)
16644 break;
16646 if (i == 64)
16648 if (info)
16649 *info = simd_immediate_info (DImode, val64);
16650 return true;
16653 return false;
16656 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16657 instruction. If INFO is nonnull, use it to describe valid immediates. */
16659 static bool
16660 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16661 simd_immediate_info *info)
16663 scalar_int_mode mode = DImode;
16664 unsigned int val32 = val64 & 0xffffffff;
16665 if (val32 == (val64 >> 32))
16667 mode = SImode;
16668 unsigned int val16 = val32 & 0xffff;
16669 if (val16 == (val32 >> 16))
16671 mode = HImode;
16672 unsigned int val8 = val16 & 0xff;
16673 if (val8 == (val16 >> 8))
16674 mode = QImode;
16677 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16678 if (IN_RANGE (val, -0x80, 0x7f))
16680 /* DUP with no shift. */
16681 if (info)
16682 *info = simd_immediate_info (mode, val);
16683 return true;
16685 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16687 /* DUP with LSL #8. */
16688 if (info)
16689 *info = simd_immediate_info (mode, val);
16690 return true;
16692 if (aarch64_bitmask_imm (val64, mode))
16694 /* DUPM. */
16695 if (info)
16696 *info = simd_immediate_info (mode, val);
16697 return true;
16699 return false;
16702 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16704 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16706 where PATTERN is the svpattern as a CONST_INT and where ZERO
16707 is a zero constant of the required PTRUE mode (which can have
16708 fewer elements than X's mode, if zero bits are significant).
16710 If so, and if INFO is nonnull, describe the immediate in INFO. */
16711 bool
16712 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16714 if (GET_CODE (x) != CONST)
16715 return false;
16717 x = XEXP (x, 0);
16718 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16719 return false;
16721 if (info)
16723 aarch64_svpattern pattern
16724 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16725 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16726 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16727 *info = simd_immediate_info (int_mode, pattern);
16729 return true;
16732 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16733 it to describe valid immediates. */
16735 static bool
16736 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16738 if (aarch64_sve_ptrue_svpattern_p (x, info))
16739 return true;
16741 if (x == CONST0_RTX (GET_MODE (x)))
16743 if (info)
16744 *info = simd_immediate_info (DImode, 0);
16745 return true;
16748 /* Analyze the value as a VNx16BImode. This should be relatively
16749 efficient, since rtx_vector_builder has enough built-in capacity
16750 to store all VLA predicate constants without needing the heap. */
16751 rtx_vector_builder builder;
16752 if (!aarch64_get_sve_pred_bits (builder, x))
16753 return false;
16755 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16756 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16758 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16759 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16760 if (pattern != AARCH64_NUM_SVPATTERNS)
16762 if (info)
16764 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16765 *info = simd_immediate_info (int_mode, pattern);
16767 return true;
16770 return false;
16773 /* Return true if OP is a valid SIMD immediate for the operation
16774 described by WHICH. If INFO is nonnull, use it to describe valid
16775 immediates. */
16776 bool
16777 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16778 enum simd_immediate_check which)
16780 machine_mode mode = GET_MODE (op);
16781 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16782 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16783 return false;
16785 if (vec_flags & VEC_SVE_PRED)
16786 return aarch64_sve_pred_valid_immediate (op, info);
16788 scalar_mode elt_mode = GET_MODE_INNER (mode);
16789 rtx base, step;
16790 unsigned int n_elts;
16791 if (GET_CODE (op) == CONST_VECTOR
16792 && CONST_VECTOR_DUPLICATE_P (op))
16793 n_elts = CONST_VECTOR_NPATTERNS (op);
16794 else if ((vec_flags & VEC_SVE_DATA)
16795 && const_vec_series_p (op, &base, &step))
16797 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16798 if (!aarch64_sve_index_immediate_p (base)
16799 || !aarch64_sve_index_immediate_p (step))
16800 return false;
16802 if (info)
16804 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16805 should yield two integer values per 128-bit block, meaning
16806 that we need to treat it in the same way as V2DI and then
16807 ignore the upper 32 bits of each element. */
16808 elt_mode = aarch64_sve_container_int_mode (mode);
16809 *info = simd_immediate_info (elt_mode, base, step);
16811 return true;
16813 else if (GET_CODE (op) == CONST_VECTOR
16814 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16815 /* N_ELTS set above. */;
16816 else
16817 return false;
16819 scalar_float_mode elt_float_mode;
16820 if (n_elts == 1
16821 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16823 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16824 if (aarch64_float_const_zero_rtx_p (elt)
16825 || aarch64_float_const_representable_p (elt))
16827 if (info)
16828 *info = simd_immediate_info (elt_float_mode, elt);
16829 return true;
16833 /* If all elements in an SVE vector have the same value, we have a free
16834 choice between using the element mode and using the container mode.
16835 Using the element mode means that unused parts of the vector are
16836 duplicates of the used elements, while using the container mode means
16837 that the unused parts are an extension of the used elements. Using the
16838 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16839 for its container mode VNx4SI while 0x00000101 isn't.
16841 If not all elements in an SVE vector have the same value, we need the
16842 transition from one element to the next to occur at container boundaries.
16843 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16844 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16845 scalar_int_mode elt_int_mode;
16846 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
16847 elt_int_mode = aarch64_sve_container_int_mode (mode);
16848 else
16849 elt_int_mode = int_mode_for_mode (elt_mode).require ();
16851 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
16852 if (elt_size > 8)
16853 return false;
16855 /* Expand the vector constant out into a byte vector, with the least
16856 significant byte of the register first. */
16857 auto_vec<unsigned char, 16> bytes;
16858 bytes.reserve (n_elts * elt_size);
16859 for (unsigned int i = 0; i < n_elts; i++)
16861 /* The vector is provided in gcc endian-neutral fashion.
16862 For aarch64_be Advanced SIMD, it must be laid out in the vector
16863 register in reverse order. */
16864 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16865 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16867 if (elt_mode != elt_int_mode)
16868 elt = gen_lowpart (elt_int_mode, elt);
16870 if (!CONST_INT_P (elt))
16871 return false;
16873 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
16874 for (unsigned int byte = 0; byte < elt_size; byte++)
16876 bytes.quick_push (elt_val & 0xff);
16877 elt_val >>= BITS_PER_UNIT;
16881 /* The immediate must repeat every eight bytes. */
16882 unsigned int nbytes = bytes.length ();
16883 for (unsigned i = 8; i < nbytes; ++i)
16884 if (bytes[i] != bytes[i - 8])
16885 return false;
16887 /* Get the repeating 8-byte value as an integer. No endian correction
16888 is needed here because bytes is already in lsb-first order. */
16889 unsigned HOST_WIDE_INT val64 = 0;
16890 for (unsigned int i = 0; i < 8; i++)
16891 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
16892 << (i * BITS_PER_UNIT));
16894 if (vec_flags & VEC_SVE_DATA)
16895 return aarch64_sve_valid_immediate (val64, info);
16896 else
16897 return aarch64_advsimd_valid_immediate (val64, info, which);
16900 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16901 has a step in the range of INDEX. Return the index expression if so,
16902 otherwise return null. */
16904 aarch64_check_zero_based_sve_index_immediate (rtx x)
16906 rtx base, step;
16907 if (const_vec_series_p (x, &base, &step)
16908 && base == const0_rtx
16909 && aarch64_sve_index_immediate_p (step))
16910 return step;
16911 return NULL_RTX;
16914 /* Check of immediate shift constants are within range. */
16915 bool
16916 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
16918 x = unwrap_const_vec_duplicate (x);
16919 if (!CONST_INT_P (x))
16920 return false;
16921 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
16922 if (left)
16923 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
16924 else
16925 return IN_RANGE (INTVAL (x), 1, bit_width);
16928 /* Return the bitmask CONST_INT to select the bits required by a zero extract
16929 operation of width WIDTH at bit position POS. */
16932 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
16934 gcc_assert (CONST_INT_P (width));
16935 gcc_assert (CONST_INT_P (pos));
16937 unsigned HOST_WIDE_INT mask
16938 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
16939 return GEN_INT (mask << UINTVAL (pos));
16942 bool
16943 aarch64_mov_operand_p (rtx x, machine_mode mode)
16945 if (GET_CODE (x) == HIGH
16946 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
16947 return true;
16949 if (CONST_INT_P (x))
16950 return true;
16952 if (VECTOR_MODE_P (GET_MODE (x)))
16954 /* Require predicate constants to be VNx16BI before RA, so that we
16955 force everything to have a canonical form. */
16956 if (!lra_in_progress
16957 && !reload_completed
16958 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
16959 && GET_MODE (x) != VNx16BImode)
16960 return false;
16962 return aarch64_simd_valid_immediate (x, NULL);
16965 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
16966 return true;
16968 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
16969 return true;
16971 return aarch64_classify_symbolic_expression (x)
16972 == SYMBOL_TINY_ABSOLUTE;
16975 /* Return a const_int vector of VAL. */
16977 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
16979 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
16980 return gen_const_vec_duplicate (mode, c);
16983 /* Check OP is a legal scalar immediate for the MOVI instruction. */
16985 bool
16986 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
16988 machine_mode vmode;
16990 vmode = aarch64_simd_container_mode (mode, 64);
16991 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
16992 return aarch64_simd_valid_immediate (op_v, NULL);
16995 /* Construct and return a PARALLEL RTX vector with elements numbering the
16996 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16997 the vector - from the perspective of the architecture. This does not
16998 line up with GCC's perspective on lane numbers, so we end up with
16999 different masks depending on our target endian-ness. The diagram
17000 below may help. We must draw the distinction when building masks
17001 which select one half of the vector. An instruction selecting
17002 architectural low-lanes for a big-endian target, must be described using
17003 a mask selecting GCC high-lanes.
17005 Big-Endian Little-Endian
17007 GCC 0 1 2 3 3 2 1 0
17008 | x | x | x | x | | x | x | x | x |
17009 Architecture 3 2 1 0 3 2 1 0
17011 Low Mask: { 2, 3 } { 0, 1 }
17012 High Mask: { 0, 1 } { 2, 3 }
17014 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17017 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17019 rtvec v = rtvec_alloc (nunits / 2);
17020 int high_base = nunits / 2;
17021 int low_base = 0;
17022 int base;
17023 rtx t1;
17024 int i;
17026 if (BYTES_BIG_ENDIAN)
17027 base = high ? low_base : high_base;
17028 else
17029 base = high ? high_base : low_base;
17031 for (i = 0; i < nunits / 2; i++)
17032 RTVEC_ELT (v, i) = GEN_INT (base + i);
17034 t1 = gen_rtx_PARALLEL (mode, v);
17035 return t1;
17038 /* Check OP for validity as a PARALLEL RTX vector with elements
17039 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17040 from the perspective of the architecture. See the diagram above
17041 aarch64_simd_vect_par_cnst_half for more details. */
17043 bool
17044 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17045 bool high)
17047 int nelts;
17048 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17049 return false;
17051 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17052 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17053 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17054 int i = 0;
17056 if (count_op != count_ideal)
17057 return false;
17059 for (i = 0; i < count_ideal; i++)
17061 rtx elt_op = XVECEXP (op, 0, i);
17062 rtx elt_ideal = XVECEXP (ideal, 0, i);
17064 if (!CONST_INT_P (elt_op)
17065 || INTVAL (elt_ideal) != INTVAL (elt_op))
17066 return false;
17068 return true;
17071 /* Return a PARALLEL containing NELTS elements, with element I equal
17072 to BASE + I * STEP. */
17075 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17077 rtvec vec = rtvec_alloc (nelts);
17078 for (unsigned int i = 0; i < nelts; ++i)
17079 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17080 return gen_rtx_PARALLEL (VOIDmode, vec);
17083 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17084 series with step STEP. */
17086 bool
17087 aarch64_stepped_int_parallel_p (rtx op, int step)
17089 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17090 return false;
17092 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17093 for (int i = 1; i < XVECLEN (op, 0); ++i)
17094 if (!CONST_INT_P (XVECEXP (op, 0, i))
17095 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17096 return false;
17098 return true;
17101 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17102 HIGH (exclusive). */
17103 void
17104 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17105 const_tree exp)
17107 HOST_WIDE_INT lane;
17108 gcc_assert (CONST_INT_P (operand));
17109 lane = INTVAL (operand);
17111 if (lane < low || lane >= high)
17113 if (exp)
17114 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17115 else
17116 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17120 /* Peform endian correction on lane number N, which indexes a vector
17121 of mode MODE, and return the result as an SImode rtx. */
17124 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17126 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
17129 /* Return TRUE if OP is a valid vector addressing mode. */
17131 bool
17132 aarch64_simd_mem_operand_p (rtx op)
17134 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17135 || REG_P (XEXP (op, 0)));
17138 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17140 bool
17141 aarch64_sve_ld1r_operand_p (rtx op)
17143 struct aarch64_address_info addr;
17144 scalar_mode mode;
17146 return (MEM_P (op)
17147 && is_a <scalar_mode> (GET_MODE (op), &mode)
17148 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17149 && addr.type == ADDRESS_REG_IMM
17150 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17153 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17154 where the size of the read data is specified by `mode` and the size of the
17155 vector elements are specified by `elem_mode`. */
17156 bool
17157 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
17158 scalar_mode elem_mode)
17160 struct aarch64_address_info addr;
17161 if (!MEM_P (op)
17162 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17163 return false;
17165 if (addr.type == ADDRESS_REG_IMM)
17166 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
17168 if (addr.type == ADDRESS_REG_REG)
17169 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17171 return false;
17174 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17175 bool
17176 aarch64_sve_ld1rq_operand_p (rtx op)
17178 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
17179 GET_MODE_INNER (GET_MODE (op)));
17182 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
17183 accessing a vector where the element size is specified by `elem_mode`. */
17184 bool
17185 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
17187 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
17190 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17191 bool
17192 aarch64_sve_ldff1_operand_p (rtx op)
17194 if (!MEM_P (op))
17195 return false;
17197 struct aarch64_address_info addr;
17198 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17199 return false;
17201 if (addr.type == ADDRESS_REG_IMM)
17202 return known_eq (addr.const_offset, 0);
17204 return addr.type == ADDRESS_REG_REG;
17207 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17208 bool
17209 aarch64_sve_ldnf1_operand_p (rtx op)
17211 struct aarch64_address_info addr;
17213 return (MEM_P (op)
17214 && aarch64_classify_address (&addr, XEXP (op, 0),
17215 GET_MODE (op), false)
17216 && addr.type == ADDRESS_REG_IMM);
17219 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17220 The conditions for STR are the same. */
17221 bool
17222 aarch64_sve_ldr_operand_p (rtx op)
17224 struct aarch64_address_info addr;
17226 return (MEM_P (op)
17227 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17228 false, ADDR_QUERY_ANY)
17229 && addr.type == ADDRESS_REG_IMM);
17232 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17233 addressing memory of mode MODE. */
17234 bool
17235 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17237 struct aarch64_address_info addr;
17238 if (!aarch64_classify_address (&addr, op, mode, false))
17239 return false;
17241 if (addr.type == ADDRESS_REG_IMM)
17242 return known_eq (addr.const_offset, 0);
17244 return addr.type == ADDRESS_REG_REG;
17247 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17248 We need to be able to access the individual pieces, so the range
17249 is different from LD[234] and ST[234]. */
17250 bool
17251 aarch64_sve_struct_memory_operand_p (rtx op)
17253 if (!MEM_P (op))
17254 return false;
17256 machine_mode mode = GET_MODE (op);
17257 struct aarch64_address_info addr;
17258 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17259 ADDR_QUERY_ANY)
17260 || addr.type != ADDRESS_REG_IMM)
17261 return false;
17263 poly_int64 first = addr.const_offset;
17264 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17265 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17266 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17269 /* Emit a register copy from operand to operand, taking care not to
17270 early-clobber source registers in the process.
17272 COUNT is the number of components into which the copy needs to be
17273 decomposed. */
17274 void
17275 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17276 unsigned int count)
17278 unsigned int i;
17279 int rdest = REGNO (operands[0]);
17280 int rsrc = REGNO (operands[1]);
17282 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17283 || rdest < rsrc)
17284 for (i = 0; i < count; i++)
17285 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17286 gen_rtx_REG (mode, rsrc + i));
17287 else
17288 for (i = 0; i < count; i++)
17289 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17290 gen_rtx_REG (mode, rsrc + count - i - 1));
17293 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17294 one of VSTRUCT modes: OI, CI, or XI. */
17296 aarch64_simd_attr_length_rglist (machine_mode mode)
17298 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17299 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17302 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17303 alignment of a vector to 128 bits. SVE predicates have an alignment of
17304 16 bits. */
17305 static HOST_WIDE_INT
17306 aarch64_simd_vector_alignment (const_tree type)
17308 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17309 be set for non-predicate vectors of booleans. Modes are the most
17310 direct way we have of identifying real SVE predicate types. */
17311 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17312 return 16;
17313 widest_int min_size
17314 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17315 return wi::umin (min_size, 128).to_uhwi ();
17318 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17319 static poly_uint64
17320 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17322 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17324 /* If the length of the vector is fixed, try to align to that length,
17325 otherwise don't try to align at all. */
17326 HOST_WIDE_INT result;
17327 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17328 result = TYPE_ALIGN (TREE_TYPE (type));
17329 return result;
17331 return TYPE_ALIGN (type);
17334 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17335 static bool
17336 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17338 if (is_packed)
17339 return false;
17341 /* For fixed-length vectors, check that the vectorizer will aim for
17342 full-vector alignment. This isn't true for generic GCC vectors
17343 that are wider than the ABI maximum of 128 bits. */
17344 poly_uint64 preferred_alignment =
17345 aarch64_vectorize_preferred_vector_alignment (type);
17346 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17347 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17348 preferred_alignment))
17349 return false;
17351 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17352 return true;
17355 /* Return true if the vector misalignment factor is supported by the
17356 target. */
17357 static bool
17358 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17359 const_tree type, int misalignment,
17360 bool is_packed)
17362 if (TARGET_SIMD && STRICT_ALIGNMENT)
17364 /* Return if movmisalign pattern is not supported for this mode. */
17365 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17366 return false;
17368 /* Misalignment factor is unknown at compile time. */
17369 if (misalignment == -1)
17370 return false;
17372 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17373 is_packed);
17376 /* If VALS is a vector constant that can be loaded into a register
17377 using DUP, generate instructions to do so and return an RTX to
17378 assign to the register. Otherwise return NULL_RTX. */
17379 static rtx
17380 aarch64_simd_dup_constant (rtx vals)
17382 machine_mode mode = GET_MODE (vals);
17383 machine_mode inner_mode = GET_MODE_INNER (mode);
17384 rtx x;
17386 if (!const_vec_duplicate_p (vals, &x))
17387 return NULL_RTX;
17389 /* We can load this constant by using DUP and a constant in a
17390 single ARM register. This will be cheaper than a vector
17391 load. */
17392 x = copy_to_mode_reg (inner_mode, x);
17393 return gen_vec_duplicate (mode, x);
17397 /* Generate code to load VALS, which is a PARALLEL containing only
17398 constants (for vec_init) or CONST_VECTOR, efficiently into a
17399 register. Returns an RTX to copy into the register, or NULL_RTX
17400 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17401 static rtx
17402 aarch64_simd_make_constant (rtx vals)
17404 machine_mode mode = GET_MODE (vals);
17405 rtx const_dup;
17406 rtx const_vec = NULL_RTX;
17407 int n_const = 0;
17408 int i;
17410 if (GET_CODE (vals) == CONST_VECTOR)
17411 const_vec = vals;
17412 else if (GET_CODE (vals) == PARALLEL)
17414 /* A CONST_VECTOR must contain only CONST_INTs and
17415 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17416 Only store valid constants in a CONST_VECTOR. */
17417 int n_elts = XVECLEN (vals, 0);
17418 for (i = 0; i < n_elts; ++i)
17420 rtx x = XVECEXP (vals, 0, i);
17421 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17422 n_const++;
17424 if (n_const == n_elts)
17425 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17427 else
17428 gcc_unreachable ();
17430 if (const_vec != NULL_RTX
17431 && aarch64_simd_valid_immediate (const_vec, NULL))
17432 /* Load using MOVI/MVNI. */
17433 return const_vec;
17434 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17435 /* Loaded using DUP. */
17436 return const_dup;
17437 else if (const_vec != NULL_RTX)
17438 /* Load from constant pool. We cannot take advantage of single-cycle
17439 LD1 because we need a PC-relative addressing mode. */
17440 return const_vec;
17441 else
17442 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17443 We cannot construct an initializer. */
17444 return NULL_RTX;
17447 /* Expand a vector initialisation sequence, such that TARGET is
17448 initialised to contain VALS. */
17450 void
17451 aarch64_expand_vector_init (rtx target, rtx vals)
17453 machine_mode mode = GET_MODE (target);
17454 scalar_mode inner_mode = GET_MODE_INNER (mode);
17455 /* The number of vector elements. */
17456 int n_elts = XVECLEN (vals, 0);
17457 /* The number of vector elements which are not constant. */
17458 int n_var = 0;
17459 rtx any_const = NULL_RTX;
17460 /* The first element of vals. */
17461 rtx v0 = XVECEXP (vals, 0, 0);
17462 bool all_same = true;
17464 /* This is a special vec_init<M><N> where N is not an element mode but a
17465 vector mode with half the elements of M. We expect to find two entries
17466 of mode N in VALS and we must put their concatentation into TARGET. */
17467 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17469 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17470 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17471 rtx lo = XVECEXP (vals, 0, 0);
17472 rtx hi = XVECEXP (vals, 0, 1);
17473 machine_mode narrow_mode = GET_MODE (lo);
17474 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17475 gcc_assert (narrow_mode == GET_MODE (hi));
17477 /* When we want to concatenate a half-width vector with zeroes we can
17478 use the aarch64_combinez[_be] patterns. Just make sure that the
17479 zeroes are in the right half. */
17480 if (BYTES_BIG_ENDIAN
17481 && aarch64_simd_imm_zero (lo, narrow_mode)
17482 && general_operand (hi, narrow_mode))
17483 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17484 else if (!BYTES_BIG_ENDIAN
17485 && aarch64_simd_imm_zero (hi, narrow_mode)
17486 && general_operand (lo, narrow_mode))
17487 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17488 else
17490 /* Else create the two half-width registers and combine them. */
17491 if (!REG_P (lo))
17492 lo = force_reg (GET_MODE (lo), lo);
17493 if (!REG_P (hi))
17494 hi = force_reg (GET_MODE (hi), hi);
17496 if (BYTES_BIG_ENDIAN)
17497 std::swap (lo, hi);
17498 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17500 return;
17503 /* Count the number of variable elements to initialise. */
17504 for (int i = 0; i < n_elts; ++i)
17506 rtx x = XVECEXP (vals, 0, i);
17507 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17508 ++n_var;
17509 else
17510 any_const = x;
17512 all_same &= rtx_equal_p (x, v0);
17515 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17516 how best to handle this. */
17517 if (n_var == 0)
17519 rtx constant = aarch64_simd_make_constant (vals);
17520 if (constant != NULL_RTX)
17522 emit_move_insn (target, constant);
17523 return;
17527 /* Splat a single non-constant element if we can. */
17528 if (all_same)
17530 rtx x = copy_to_mode_reg (inner_mode, v0);
17531 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17532 return;
17535 enum insn_code icode = optab_handler (vec_set_optab, mode);
17536 gcc_assert (icode != CODE_FOR_nothing);
17538 /* If there are only variable elements, try to optimize
17539 the insertion using dup for the most common element
17540 followed by insertions. */
17542 /* The algorithm will fill matches[*][0] with the earliest matching element,
17543 and matches[X][1] with the count of duplicate elements (if X is the
17544 earliest element which has duplicates). */
17546 if (n_var == n_elts && n_elts <= 16)
17548 int matches[16][2] = {0};
17549 for (int i = 0; i < n_elts; i++)
17551 for (int j = 0; j <= i; j++)
17553 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17555 matches[i][0] = j;
17556 matches[j][1]++;
17557 break;
17561 int maxelement = 0;
17562 int maxv = 0;
17563 for (int i = 0; i < n_elts; i++)
17564 if (matches[i][1] > maxv)
17566 maxelement = i;
17567 maxv = matches[i][1];
17570 /* Create a duplicate of the most common element, unless all elements
17571 are equally useless to us, in which case just immediately set the
17572 vector register using the first element. */
17574 if (maxv == 1)
17576 /* For vectors of two 64-bit elements, we can do even better. */
17577 if (n_elts == 2
17578 && (inner_mode == E_DImode
17579 || inner_mode == E_DFmode))
17582 rtx x0 = XVECEXP (vals, 0, 0);
17583 rtx x1 = XVECEXP (vals, 0, 1);
17584 /* Combine can pick up this case, but handling it directly
17585 here leaves clearer RTL.
17587 This is load_pair_lanes<mode>, and also gives us a clean-up
17588 for store_pair_lanes<mode>. */
17589 if (memory_operand (x0, inner_mode)
17590 && memory_operand (x1, inner_mode)
17591 && !STRICT_ALIGNMENT
17592 && rtx_equal_p (XEXP (x1, 0),
17593 plus_constant (Pmode,
17594 XEXP (x0, 0),
17595 GET_MODE_SIZE (inner_mode))))
17597 rtx t;
17598 if (inner_mode == DFmode)
17599 t = gen_load_pair_lanesdf (target, x0, x1);
17600 else
17601 t = gen_load_pair_lanesdi (target, x0, x1);
17602 emit_insn (t);
17603 return;
17606 /* The subreg-move sequence below will move into lane zero of the
17607 vector register. For big-endian we want that position to hold
17608 the last element of VALS. */
17609 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17610 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17611 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17613 else
17615 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17616 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17619 /* Insert the rest. */
17620 for (int i = 0; i < n_elts; i++)
17622 rtx x = XVECEXP (vals, 0, i);
17623 if (matches[i][0] == maxelement)
17624 continue;
17625 x = copy_to_mode_reg (inner_mode, x);
17626 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17628 return;
17631 /* Initialise a vector which is part-variable. We want to first try
17632 to build those lanes which are constant in the most efficient way we
17633 can. */
17634 if (n_var != n_elts)
17636 rtx copy = copy_rtx (vals);
17638 /* Load constant part of vector. We really don't care what goes into the
17639 parts we will overwrite, but we're more likely to be able to load the
17640 constant efficiently if it has fewer, larger, repeating parts
17641 (see aarch64_simd_valid_immediate). */
17642 for (int i = 0; i < n_elts; i++)
17644 rtx x = XVECEXP (vals, 0, i);
17645 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17646 continue;
17647 rtx subst = any_const;
17648 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17650 /* Look in the copied vector, as more elements are const. */
17651 rtx test = XVECEXP (copy, 0, i ^ bit);
17652 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17654 subst = test;
17655 break;
17658 XVECEXP (copy, 0, i) = subst;
17660 aarch64_expand_vector_init (target, copy);
17663 /* Insert the variable lanes directly. */
17664 for (int i = 0; i < n_elts; i++)
17666 rtx x = XVECEXP (vals, 0, i);
17667 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17668 continue;
17669 x = copy_to_mode_reg (inner_mode, x);
17670 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17674 /* Emit RTL corresponding to:
17675 insr TARGET, ELEM. */
17677 static void
17678 emit_insr (rtx target, rtx elem)
17680 machine_mode mode = GET_MODE (target);
17681 scalar_mode elem_mode = GET_MODE_INNER (mode);
17682 elem = force_reg (elem_mode, elem);
17684 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17685 gcc_assert (icode != CODE_FOR_nothing);
17686 emit_insn (GEN_FCN (icode) (target, target, elem));
17689 /* Subroutine of aarch64_sve_expand_vector_init for handling
17690 trailing constants.
17691 This function works as follows:
17692 (a) Create a new vector consisting of trailing constants.
17693 (b) Initialize TARGET with the constant vector using emit_move_insn.
17694 (c) Insert remaining elements in TARGET using insr.
17695 NELTS is the total number of elements in original vector while
17696 while NELTS_REQD is the number of elements that are actually
17697 significant.
17699 ??? The heuristic used is to do above only if number of constants
17700 is at least half the total number of elements. May need fine tuning. */
17702 static bool
17703 aarch64_sve_expand_vector_init_handle_trailing_constants
17704 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17706 machine_mode mode = GET_MODE (target);
17707 scalar_mode elem_mode = GET_MODE_INNER (mode);
17708 int n_trailing_constants = 0;
17710 for (int i = nelts_reqd - 1;
17711 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17712 i--)
17713 n_trailing_constants++;
17715 if (n_trailing_constants >= nelts_reqd / 2)
17717 rtx_vector_builder v (mode, 1, nelts);
17718 for (int i = 0; i < nelts; i++)
17719 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17720 rtx const_vec = v.build ();
17721 emit_move_insn (target, const_vec);
17723 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17724 emit_insr (target, builder.elt (i));
17726 return true;
17729 return false;
17732 /* Subroutine of aarch64_sve_expand_vector_init.
17733 Works as follows:
17734 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17735 (b) Skip trailing elements from BUILDER, which are the same as
17736 element NELTS_REQD - 1.
17737 (c) Insert earlier elements in reverse order in TARGET using insr. */
17739 static void
17740 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17741 const rtx_vector_builder &builder,
17742 int nelts_reqd)
17744 machine_mode mode = GET_MODE (target);
17745 scalar_mode elem_mode = GET_MODE_INNER (mode);
17747 struct expand_operand ops[2];
17748 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17749 gcc_assert (icode != CODE_FOR_nothing);
17751 create_output_operand (&ops[0], target, mode);
17752 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17753 expand_insn (icode, 2, ops);
17755 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17756 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17757 emit_insr (target, builder.elt (i));
17760 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17761 when all trailing elements of builder are same.
17762 This works as follows:
17763 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17764 (b) Insert remaining elements in TARGET using insr.
17766 ??? The heuristic used is to do above if number of same trailing elements
17767 is at least 3/4 of total number of elements, loosely based on
17768 heuristic from mostly_zeros_p. May need fine-tuning. */
17770 static bool
17771 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17772 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17774 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17775 if (ndups >= (3 * nelts_reqd) / 4)
17777 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17778 nelts_reqd - ndups + 1);
17779 return true;
17782 return false;
17785 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17786 of elements in BUILDER.
17788 The function tries to initialize TARGET from BUILDER if it fits one
17789 of the special cases outlined below.
17791 Failing that, the function divides BUILDER into two sub-vectors:
17792 v_even = even elements of BUILDER;
17793 v_odd = odd elements of BUILDER;
17795 and recursively calls itself with v_even and v_odd.
17797 if (recursive call succeeded for v_even or v_odd)
17798 TARGET = zip (v_even, v_odd)
17800 The function returns true if it managed to build TARGET from BUILDER
17801 with one of the special cases, false otherwise.
17803 Example: {a, 1, b, 2, c, 3, d, 4}
17805 The vector gets divided into:
17806 v_even = {a, b, c, d}
17807 v_odd = {1, 2, 3, 4}
17809 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17810 initialize tmp2 from constant vector v_odd using emit_move_insn.
17812 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17813 4 elements, so we construct tmp1 from v_even using insr:
17814 tmp1 = dup(d)
17815 insr tmp1, c
17816 insr tmp1, b
17817 insr tmp1, a
17819 And finally:
17820 TARGET = zip (tmp1, tmp2)
17821 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17823 static bool
17824 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17825 int nelts, int nelts_reqd)
17827 machine_mode mode = GET_MODE (target);
17829 /* Case 1: Vector contains trailing constants. */
17831 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17832 (target, builder, nelts, nelts_reqd))
17833 return true;
17835 /* Case 2: Vector contains leading constants. */
17837 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17838 for (int i = 0; i < nelts_reqd; i++)
17839 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17840 rev_builder.finalize ();
17842 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17843 (target, rev_builder, nelts, nelts_reqd))
17845 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17846 return true;
17849 /* Case 3: Vector contains trailing same element. */
17851 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17852 (target, builder, nelts_reqd))
17853 return true;
17855 /* Case 4: Vector contains leading same element. */
17857 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17858 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17860 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17861 return true;
17864 /* Avoid recursing below 4-elements.
17865 ??? The threshold 4 may need fine-tuning. */
17867 if (nelts_reqd <= 4)
17868 return false;
17870 rtx_vector_builder v_even (mode, 1, nelts);
17871 rtx_vector_builder v_odd (mode, 1, nelts);
17873 for (int i = 0; i < nelts * 2; i += 2)
17875 v_even.quick_push (builder.elt (i));
17876 v_odd.quick_push (builder.elt (i + 1));
17879 v_even.finalize ();
17880 v_odd.finalize ();
17882 rtx tmp1 = gen_reg_rtx (mode);
17883 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
17884 nelts, nelts_reqd / 2);
17886 rtx tmp2 = gen_reg_rtx (mode);
17887 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
17888 nelts, nelts_reqd / 2);
17890 if (!did_even_p && !did_odd_p)
17891 return false;
17893 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17894 special cases and zip v_even, v_odd. */
17896 if (!did_even_p)
17897 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
17899 if (!did_odd_p)
17900 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
17902 rtvec v = gen_rtvec (2, tmp1, tmp2);
17903 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
17904 return true;
17907 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17909 void
17910 aarch64_sve_expand_vector_init (rtx target, rtx vals)
17912 machine_mode mode = GET_MODE (target);
17913 int nelts = XVECLEN (vals, 0);
17915 rtx_vector_builder v (mode, 1, nelts);
17916 for (int i = 0; i < nelts; i++)
17917 v.quick_push (XVECEXP (vals, 0, i));
17918 v.finalize ();
17920 /* If neither sub-vectors of v could be initialized specially,
17921 then use INSR to insert all elements from v into TARGET.
17922 ??? This might not be optimal for vectors with large
17923 initializers like 16-element or above.
17924 For nelts < 4, it probably isn't useful to handle specially. */
17926 if (nelts < 4
17927 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
17928 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
17931 /* Check whether VALUE is a vector constant in which every element
17932 is either a power of 2 or a negated power of 2. If so, return
17933 a constant vector of log2s, and flip CODE between PLUS and MINUS
17934 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
17936 static rtx
17937 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
17939 if (GET_CODE (value) != CONST_VECTOR)
17940 return NULL_RTX;
17942 rtx_vector_builder builder;
17943 if (!builder.new_unary_operation (GET_MODE (value), value, false))
17944 return NULL_RTX;
17946 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
17947 /* 1 if the result of the multiplication must be negated,
17948 0 if it mustn't, or -1 if we don't yet care. */
17949 int negate = -1;
17950 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
17951 for (unsigned int i = 0; i < encoded_nelts; ++i)
17953 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
17954 if (!CONST_SCALAR_INT_P (elt))
17955 return NULL_RTX;
17956 rtx_mode_t val (elt, int_mode);
17957 wide_int pow2 = wi::neg (val);
17958 if (val != pow2)
17960 /* It matters whether we negate or not. Make that choice,
17961 and make sure that it's consistent with previous elements. */
17962 if (negate == !wi::neg_p (val))
17963 return NULL_RTX;
17964 negate = wi::neg_p (val);
17965 if (!negate)
17966 pow2 = val;
17968 /* POW2 is now the value that we want to be a power of 2. */
17969 int shift = wi::exact_log2 (pow2);
17970 if (shift < 0)
17971 return NULL_RTX;
17972 builder.quick_push (gen_int_mode (shift, int_mode));
17974 if (negate == -1)
17975 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17976 code = PLUS;
17977 else if (negate == 1)
17978 code = code == PLUS ? MINUS : PLUS;
17979 return builder.build ();
17982 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17983 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17984 operands array, in the same order as for fma_optab. Return true if
17985 the function emitted all the necessary instructions, false if the caller
17986 should generate the pattern normally with the new OPERANDS array. */
17988 bool
17989 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
17991 machine_mode mode = GET_MODE (operands[0]);
17992 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
17994 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
17995 NULL_RTX, true, OPTAB_DIRECT);
17996 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
17997 operands[3], product, operands[0], true,
17998 OPTAB_DIRECT);
17999 return true;
18001 operands[2] = force_reg (mode, operands[2]);
18002 return false;
18005 /* Likewise, but for a conditional pattern. */
18007 bool
18008 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
18010 machine_mode mode = GET_MODE (operands[0]);
18011 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
18013 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
18014 NULL_RTX, true, OPTAB_DIRECT);
18015 emit_insn (gen_cond (code, mode, operands[0], operands[1],
18016 operands[4], product, operands[5]));
18017 return true;
18019 operands[3] = force_reg (mode, operands[3]);
18020 return false;
18023 static unsigned HOST_WIDE_INT
18024 aarch64_shift_truncation_mask (machine_mode mode)
18026 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18027 return 0;
18028 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18031 /* Select a format to encode pointers in exception handling data. */
18033 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18035 int type;
18036 switch (aarch64_cmodel)
18038 case AARCH64_CMODEL_TINY:
18039 case AARCH64_CMODEL_TINY_PIC:
18040 case AARCH64_CMODEL_SMALL:
18041 case AARCH64_CMODEL_SMALL_PIC:
18042 case AARCH64_CMODEL_SMALL_SPIC:
18043 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18044 for everything. */
18045 type = DW_EH_PE_sdata4;
18046 break;
18047 default:
18048 /* No assumptions here. 8-byte relocs required. */
18049 type = DW_EH_PE_sdata8;
18050 break;
18052 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18055 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18057 static void
18058 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18060 if (TREE_CODE (decl) == FUNCTION_DECL)
18062 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18063 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18065 fprintf (stream, "\t.variant_pcs\t");
18066 assemble_name (stream, name);
18067 fprintf (stream, "\n");
18072 /* The last .arch and .tune assembly strings that we printed. */
18073 static std::string aarch64_last_printed_arch_string;
18074 static std::string aarch64_last_printed_tune_string;
18076 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18077 by the function fndecl. */
18079 void
18080 aarch64_declare_function_name (FILE *stream, const char* name,
18081 tree fndecl)
18083 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18085 struct cl_target_option *targ_options;
18086 if (target_parts)
18087 targ_options = TREE_TARGET_OPTION (target_parts);
18088 else
18089 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18090 gcc_assert (targ_options);
18092 const struct processor *this_arch
18093 = aarch64_get_arch (targ_options->x_explicit_arch);
18095 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18096 std::string extension
18097 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18098 this_arch->flags);
18099 /* Only update the assembler .arch string if it is distinct from the last
18100 such string we printed. */
18101 std::string to_print = this_arch->name + extension;
18102 if (to_print != aarch64_last_printed_arch_string)
18104 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18105 aarch64_last_printed_arch_string = to_print;
18108 /* Print the cpu name we're tuning for in the comments, might be
18109 useful to readers of the generated asm. Do it only when it changes
18110 from function to function and verbose assembly is requested. */
18111 const struct processor *this_tune
18112 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18114 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18116 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18117 this_tune->name);
18118 aarch64_last_printed_tune_string = this_tune->name;
18121 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18123 /* Don't forget the type directive for ELF. */
18124 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18125 ASM_OUTPUT_LABEL (stream, name);
18128 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18130 void
18131 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18133 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18134 const char *value = IDENTIFIER_POINTER (target);
18135 aarch64_asm_output_variant_pcs (stream, decl, name);
18136 ASM_OUTPUT_DEF (stream, name, value);
18139 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18140 function symbol references. */
18142 void
18143 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18145 default_elf_asm_output_external (stream, decl, name);
18146 aarch64_asm_output_variant_pcs (stream, decl, name);
18149 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18150 Used to output the .cfi_b_key_frame directive when signing the current
18151 function with the B key. */
18153 void
18154 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18156 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18157 && aarch64_ra_sign_key == AARCH64_KEY_B)
18158 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18161 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18163 static void
18164 aarch64_start_file (void)
18166 struct cl_target_option *default_options
18167 = TREE_TARGET_OPTION (target_option_default_node);
18169 const struct processor *default_arch
18170 = aarch64_get_arch (default_options->x_explicit_arch);
18171 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18172 std::string extension
18173 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18174 default_arch->flags);
18176 aarch64_last_printed_arch_string = default_arch->name + extension;
18177 aarch64_last_printed_tune_string = "";
18178 asm_fprintf (asm_out_file, "\t.arch %s\n",
18179 aarch64_last_printed_arch_string.c_str ());
18181 default_file_start ();
18184 /* Emit load exclusive. */
18186 static void
18187 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18188 rtx mem, rtx model_rtx)
18190 if (mode == TImode)
18191 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18192 gen_highpart (DImode, rval),
18193 mem, model_rtx));
18194 else
18195 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18198 /* Emit store exclusive. */
18200 static void
18201 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18202 rtx mem, rtx rval, rtx model_rtx)
18204 if (mode == TImode)
18205 emit_insn (gen_aarch64_store_exclusive_pair
18206 (bval, mem, operand_subword (rval, 0, 0, TImode),
18207 operand_subword (rval, 1, 0, TImode), model_rtx));
18208 else
18209 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18212 /* Mark the previous jump instruction as unlikely. */
18214 static void
18215 aarch64_emit_unlikely_jump (rtx insn)
18217 rtx_insn *jump = emit_jump_insn (insn);
18218 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18221 /* We store the names of the various atomic helpers in a 5x4 array.
18222 Return the libcall function given MODE, MODEL and NAMES. */
18225 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
18226 const atomic_ool_names *names)
18228 memmodel model = memmodel_base (INTVAL (model_rtx));
18229 int mode_idx, model_idx;
18231 switch (mode)
18233 case E_QImode:
18234 mode_idx = 0;
18235 break;
18236 case E_HImode:
18237 mode_idx = 1;
18238 break;
18239 case E_SImode:
18240 mode_idx = 2;
18241 break;
18242 case E_DImode:
18243 mode_idx = 3;
18244 break;
18245 case E_TImode:
18246 mode_idx = 4;
18247 break;
18248 default:
18249 gcc_unreachable ();
18252 switch (model)
18254 case MEMMODEL_RELAXED:
18255 model_idx = 0;
18256 break;
18257 case MEMMODEL_CONSUME:
18258 case MEMMODEL_ACQUIRE:
18259 model_idx = 1;
18260 break;
18261 case MEMMODEL_RELEASE:
18262 model_idx = 2;
18263 break;
18264 case MEMMODEL_ACQ_REL:
18265 case MEMMODEL_SEQ_CST:
18266 model_idx = 3;
18267 break;
18268 default:
18269 gcc_unreachable ();
18272 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18273 VISIBILITY_HIDDEN);
18276 #define DEF0(B, N) \
18277 { "__aarch64_" #B #N "_relax", \
18278 "__aarch64_" #B #N "_acq", \
18279 "__aarch64_" #B #N "_rel", \
18280 "__aarch64_" #B #N "_acq_rel" }
18282 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18283 { NULL, NULL, NULL, NULL }
18284 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18286 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18287 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18288 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18289 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18290 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18291 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18293 #undef DEF0
18294 #undef DEF4
18295 #undef DEF5
18297 /* Expand a compare and swap pattern. */
18299 void
18300 aarch64_expand_compare_and_swap (rtx operands[])
18302 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18303 machine_mode mode, r_mode;
18305 bval = operands[0];
18306 rval = operands[1];
18307 mem = operands[2];
18308 oldval = operands[3];
18309 newval = operands[4];
18310 is_weak = operands[5];
18311 mod_s = operands[6];
18312 mod_f = operands[7];
18313 mode = GET_MODE (mem);
18315 /* Normally the succ memory model must be stronger than fail, but in the
18316 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18317 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18318 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18319 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18320 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18322 r_mode = mode;
18323 if (mode == QImode || mode == HImode)
18325 r_mode = SImode;
18326 rval = gen_reg_rtx (r_mode);
18329 if (TARGET_LSE)
18331 /* The CAS insn requires oldval and rval overlap, but we need to
18332 have a copy of oldval saved across the operation to tell if
18333 the operation is successful. */
18334 if (reg_overlap_mentioned_p (rval, oldval))
18335 rval = copy_to_mode_reg (r_mode, oldval);
18336 else
18337 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18339 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18340 newval, mod_s));
18341 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18343 else if (TARGET_OUTLINE_ATOMICS)
18345 /* Oldval must satisfy compare afterward. */
18346 if (!aarch64_plus_operand (oldval, mode))
18347 oldval = force_reg (mode, oldval);
18348 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18349 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18350 oldval, mode, newval, mode,
18351 XEXP (mem, 0), Pmode);
18352 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18354 else
18356 /* The oldval predicate varies by mode. Test it and force to reg. */
18357 insn_code code = code_for_aarch64_compare_and_swap (mode);
18358 if (!insn_data[code].operand[2].predicate (oldval, mode))
18359 oldval = force_reg (mode, oldval);
18361 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18362 is_weak, mod_s, mod_f));
18363 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18366 if (r_mode != mode)
18367 rval = gen_lowpart (mode, rval);
18368 emit_move_insn (operands[1], rval);
18370 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18371 emit_insn (gen_rtx_SET (bval, x));
18374 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
18375 sequence implementing an atomic operation. */
18377 static void
18378 aarch64_emit_post_barrier (enum memmodel model)
18380 const enum memmodel base_model = memmodel_base (model);
18382 if (is_mm_sync (model)
18383 && (base_model == MEMMODEL_ACQUIRE
18384 || base_model == MEMMODEL_ACQ_REL
18385 || base_model == MEMMODEL_SEQ_CST))
18387 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18391 /* Split a compare and swap pattern. */
18393 void
18394 aarch64_split_compare_and_swap (rtx operands[])
18396 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18397 gcc_assert (epilogue_completed);
18399 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18400 machine_mode mode;
18401 bool is_weak;
18402 rtx_code_label *label1, *label2;
18403 enum memmodel model;
18405 rval = operands[0];
18406 mem = operands[1];
18407 oldval = operands[2];
18408 newval = operands[3];
18409 is_weak = (operands[4] != const0_rtx);
18410 model_rtx = operands[5];
18411 scratch = operands[7];
18412 mode = GET_MODE (mem);
18413 model = memmodel_from_int (INTVAL (model_rtx));
18415 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18416 loop:
18417 .label1:
18418 LD[A]XR rval, [mem]
18419 CBNZ rval, .label2
18420 ST[L]XR scratch, newval, [mem]
18421 CBNZ scratch, .label1
18422 .label2:
18423 CMP rval, 0. */
18424 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
18425 oldval == const0_rtx && mode != TImode);
18427 label1 = NULL;
18428 if (!is_weak)
18430 label1 = gen_label_rtx ();
18431 emit_label (label1);
18433 label2 = gen_label_rtx ();
18435 /* The initial load can be relaxed for a __sync operation since a final
18436 barrier will be emitted to stop code hoisting. */
18437 if (is_mm_sync (model))
18438 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18439 else
18440 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18442 if (strong_zero_p)
18443 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18444 else
18446 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18447 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18449 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18450 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18451 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18453 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18455 if (!is_weak)
18457 if (aarch64_track_speculation)
18459 /* Emit an explicit compare instruction, so that we can correctly
18460 track the condition codes. */
18461 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18462 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18464 else
18465 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18467 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18468 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18469 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18471 else
18472 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18474 emit_label (label2);
18476 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18477 to set the condition flags. If this is not used it will be removed by
18478 later passes. */
18479 if (strong_zero_p)
18480 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18482 /* Emit any final barrier needed for a __sync operation. */
18483 if (is_mm_sync (model))
18484 aarch64_emit_post_barrier (model);
18487 /* Split an atomic operation. */
18489 void
18490 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18491 rtx value, rtx model_rtx, rtx cond)
18493 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18494 gcc_assert (epilogue_completed);
18496 machine_mode mode = GET_MODE (mem);
18497 machine_mode wmode = (mode == DImode ? DImode : SImode);
18498 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18499 const bool is_sync = is_mm_sync (model);
18500 rtx_code_label *label;
18501 rtx x;
18503 /* Split the atomic operation into a sequence. */
18504 label = gen_label_rtx ();
18505 emit_label (label);
18507 if (new_out)
18508 new_out = gen_lowpart (wmode, new_out);
18509 if (old_out)
18510 old_out = gen_lowpart (wmode, old_out);
18511 else
18512 old_out = new_out;
18513 value = simplify_gen_subreg (wmode, value, mode, 0);
18515 /* The initial load can be relaxed for a __sync operation since a final
18516 barrier will be emitted to stop code hoisting. */
18517 if (is_sync)
18518 aarch64_emit_load_exclusive (mode, old_out, mem,
18519 GEN_INT (MEMMODEL_RELAXED));
18520 else
18521 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18523 switch (code)
18525 case SET:
18526 new_out = value;
18527 break;
18529 case NOT:
18530 x = gen_rtx_AND (wmode, old_out, value);
18531 emit_insn (gen_rtx_SET (new_out, x));
18532 x = gen_rtx_NOT (wmode, new_out);
18533 emit_insn (gen_rtx_SET (new_out, x));
18534 break;
18536 case MINUS:
18537 if (CONST_INT_P (value))
18539 value = GEN_INT (-INTVAL (value));
18540 code = PLUS;
18542 /* Fall through. */
18544 default:
18545 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18546 emit_insn (gen_rtx_SET (new_out, x));
18547 break;
18550 aarch64_emit_store_exclusive (mode, cond, mem,
18551 gen_lowpart (mode, new_out), model_rtx);
18553 if (aarch64_track_speculation)
18555 /* Emit an explicit compare instruction, so that we can correctly
18556 track the condition codes. */
18557 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18558 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18560 else
18561 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18563 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18564 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18565 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18567 /* Emit any final barrier needed for a __sync operation. */
18568 if (is_sync)
18569 aarch64_emit_post_barrier (model);
18572 static void
18573 aarch64_init_libfuncs (void)
18575 /* Half-precision float operations. The compiler handles all operations
18576 with NULL libfuncs by converting to SFmode. */
18578 /* Conversions. */
18579 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18580 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18582 /* Arithmetic. */
18583 set_optab_libfunc (add_optab, HFmode, NULL);
18584 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18585 set_optab_libfunc (smul_optab, HFmode, NULL);
18586 set_optab_libfunc (neg_optab, HFmode, NULL);
18587 set_optab_libfunc (sub_optab, HFmode, NULL);
18589 /* Comparisons. */
18590 set_optab_libfunc (eq_optab, HFmode, NULL);
18591 set_optab_libfunc (ne_optab, HFmode, NULL);
18592 set_optab_libfunc (lt_optab, HFmode, NULL);
18593 set_optab_libfunc (le_optab, HFmode, NULL);
18594 set_optab_libfunc (ge_optab, HFmode, NULL);
18595 set_optab_libfunc (gt_optab, HFmode, NULL);
18596 set_optab_libfunc (unord_optab, HFmode, NULL);
18599 /* Target hook for c_mode_for_suffix. */
18600 static machine_mode
18601 aarch64_c_mode_for_suffix (char suffix)
18603 if (suffix == 'q')
18604 return TFmode;
18606 return VOIDmode;
18609 /* We can only represent floating point constants which will fit in
18610 "quarter-precision" values. These values are characterised by
18611 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
18614 (-1)^s * (n/16) * 2^r
18616 Where:
18617 's' is the sign bit.
18618 'n' is an integer in the range 16 <= n <= 31.
18619 'r' is an integer in the range -3 <= r <= 4. */
18621 /* Return true iff X can be represented by a quarter-precision
18622 floating point immediate operand X. Note, we cannot represent 0.0. */
18623 bool
18624 aarch64_float_const_representable_p (rtx x)
18626 /* This represents our current view of how many bits
18627 make up the mantissa. */
18628 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18629 int exponent;
18630 unsigned HOST_WIDE_INT mantissa, mask;
18631 REAL_VALUE_TYPE r, m;
18632 bool fail;
18634 x = unwrap_const_vec_duplicate (x);
18635 if (!CONST_DOUBLE_P (x))
18636 return false;
18638 if (GET_MODE (x) == VOIDmode
18639 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18640 return false;
18642 r = *CONST_DOUBLE_REAL_VALUE (x);
18644 /* We cannot represent infinities, NaNs or +/-zero. We won't
18645 know if we have +zero until we analyse the mantissa, but we
18646 can reject the other invalid values. */
18647 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18648 || REAL_VALUE_MINUS_ZERO (r))
18649 return false;
18651 /* Extract exponent. */
18652 r = real_value_abs (&r);
18653 exponent = REAL_EXP (&r);
18655 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18656 highest (sign) bit, with a fixed binary point at bit point_pos.
18657 m1 holds the low part of the mantissa, m2 the high part.
18658 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18659 bits for the mantissa, this can fail (low bits will be lost). */
18660 real_ldexp (&m, &r, point_pos - exponent);
18661 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18663 /* If the low part of the mantissa has bits set we cannot represent
18664 the value. */
18665 if (w.ulow () != 0)
18666 return false;
18667 /* We have rejected the lower HOST_WIDE_INT, so update our
18668 understanding of how many bits lie in the mantissa and
18669 look only at the high HOST_WIDE_INT. */
18670 mantissa = w.elt (1);
18671 point_pos -= HOST_BITS_PER_WIDE_INT;
18673 /* We can only represent values with a mantissa of the form 1.xxxx. */
18674 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18675 if ((mantissa & mask) != 0)
18676 return false;
18678 /* Having filtered unrepresentable values, we may now remove all
18679 but the highest 5 bits. */
18680 mantissa >>= point_pos - 5;
18682 /* We cannot represent the value 0.0, so reject it. This is handled
18683 elsewhere. */
18684 if (mantissa == 0)
18685 return false;
18687 /* Then, as bit 4 is always set, we can mask it off, leaving
18688 the mantissa in the range [0, 15]. */
18689 mantissa &= ~(1 << 4);
18690 gcc_assert (mantissa <= 15);
18692 /* GCC internally does not use IEEE754-like encoding (where normalized
18693 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
18694 Our mantissa values are shifted 4 places to the left relative to
18695 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18696 by 5 places to correct for GCC's representation. */
18697 exponent = 5 - exponent;
18699 return (exponent >= 0 && exponent <= 7);
18702 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18703 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18704 output MOVI/MVNI, ORR or BIC immediate. */
18705 char*
18706 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18707 enum simd_immediate_check which)
18709 bool is_valid;
18710 static char templ[40];
18711 const char *mnemonic;
18712 const char *shift_op;
18713 unsigned int lane_count = 0;
18714 char element_char;
18716 struct simd_immediate_info info;
18718 /* This will return true to show const_vector is legal for use as either
18719 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18720 It will also update INFO to show how the immediate should be generated.
18721 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18722 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18723 gcc_assert (is_valid);
18725 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18726 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18728 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18730 gcc_assert (info.insn == simd_immediate_info::MOV
18731 && info.u.mov.shift == 0);
18732 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18733 move immediate path. */
18734 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18735 info.u.mov.value = GEN_INT (0);
18736 else
18738 const unsigned int buf_size = 20;
18739 char float_buf[buf_size] = {'\0'};
18740 real_to_decimal_for_mode (float_buf,
18741 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18742 buf_size, buf_size, 1, info.elt_mode);
18744 if (lane_count == 1)
18745 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18746 else
18747 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18748 lane_count, element_char, float_buf);
18749 return templ;
18753 gcc_assert (CONST_INT_P (info.u.mov.value));
18755 if (which == AARCH64_CHECK_MOV)
18757 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18758 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18759 ? "msl" : "lsl");
18760 if (lane_count == 1)
18761 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18762 mnemonic, UINTVAL (info.u.mov.value));
18763 else if (info.u.mov.shift)
18764 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18765 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18766 element_char, UINTVAL (info.u.mov.value), shift_op,
18767 info.u.mov.shift);
18768 else
18769 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18770 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18771 element_char, UINTVAL (info.u.mov.value));
18773 else
18775 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18776 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18777 if (info.u.mov.shift)
18778 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18779 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18780 element_char, UINTVAL (info.u.mov.value), "lsl",
18781 info.u.mov.shift);
18782 else
18783 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18784 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18785 element_char, UINTVAL (info.u.mov.value));
18787 return templ;
18790 char*
18791 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18794 /* If a floating point number was passed and we desire to use it in an
18795 integer mode do the conversion to integer. */
18796 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18798 unsigned HOST_WIDE_INT ival;
18799 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18800 gcc_unreachable ();
18801 immediate = gen_int_mode (ival, mode);
18804 machine_mode vmode;
18805 /* use a 64 bit mode for everything except for DI/DF mode, where we use
18806 a 128 bit vector mode. */
18807 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18809 vmode = aarch64_simd_container_mode (mode, width);
18810 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18811 return aarch64_output_simd_mov_immediate (v_op, width);
18814 /* Return the output string to use for moving immediate CONST_VECTOR
18815 into an SVE register. */
18817 char *
18818 aarch64_output_sve_mov_immediate (rtx const_vector)
18820 static char templ[40];
18821 struct simd_immediate_info info;
18822 char element_char;
18824 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18825 gcc_assert (is_valid);
18827 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18829 machine_mode vec_mode = GET_MODE (const_vector);
18830 if (aarch64_sve_pred_mode_p (vec_mode))
18832 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18833 if (info.insn == simd_immediate_info::MOV)
18835 gcc_assert (info.u.mov.value == const0_rtx);
18836 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18838 else
18840 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18841 unsigned int total_bytes;
18842 if (info.u.pattern == AARCH64_SV_ALL
18843 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18844 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
18845 total_bytes / GET_MODE_SIZE (info.elt_mode));
18846 else
18847 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
18848 svpattern_token (info.u.pattern));
18850 return buf;
18853 if (info.insn == simd_immediate_info::INDEX)
18855 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
18856 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
18857 element_char, INTVAL (info.u.index.base),
18858 INTVAL (info.u.index.step));
18859 return templ;
18862 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18864 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18865 info.u.mov.value = GEN_INT (0);
18866 else
18868 const int buf_size = 20;
18869 char float_buf[buf_size] = {};
18870 real_to_decimal_for_mode (float_buf,
18871 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18872 buf_size, buf_size, 1, info.elt_mode);
18874 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
18875 element_char, float_buf);
18876 return templ;
18880 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
18881 element_char, INTVAL (info.u.mov.value));
18882 return templ;
18885 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
18886 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
18887 pattern. */
18889 char *
18890 aarch64_output_sve_ptrues (rtx const_unspec)
18892 static char templ[40];
18894 struct simd_immediate_info info;
18895 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
18896 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
18898 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18899 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
18900 svpattern_token (info.u.pattern));
18901 return templ;
18904 /* Split operands into moves from op[1] + op[2] into op[0]. */
18906 void
18907 aarch64_split_combinev16qi (rtx operands[3])
18909 unsigned int dest = REGNO (operands[0]);
18910 unsigned int src1 = REGNO (operands[1]);
18911 unsigned int src2 = REGNO (operands[2]);
18912 machine_mode halfmode = GET_MODE (operands[1]);
18913 unsigned int halfregs = REG_NREGS (operands[1]);
18914 rtx destlo, desthi;
18916 gcc_assert (halfmode == V16QImode);
18918 if (src1 == dest && src2 == dest + halfregs)
18920 /* No-op move. Can't split to nothing; emit something. */
18921 emit_note (NOTE_INSN_DELETED);
18922 return;
18925 /* Preserve register attributes for variable tracking. */
18926 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
18927 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
18928 GET_MODE_SIZE (halfmode));
18930 /* Special case of reversed high/low parts. */
18931 if (reg_overlap_mentioned_p (operands[2], destlo)
18932 && reg_overlap_mentioned_p (operands[1], desthi))
18934 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18935 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
18936 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18938 else if (!reg_overlap_mentioned_p (operands[2], destlo))
18940 /* Try to avoid unnecessary moves if part of the result
18941 is in the right place already. */
18942 if (src1 != dest)
18943 emit_move_insn (destlo, operands[1]);
18944 if (src2 != dest + halfregs)
18945 emit_move_insn (desthi, operands[2]);
18947 else
18949 if (src2 != dest + halfregs)
18950 emit_move_insn (desthi, operands[2]);
18951 if (src1 != dest)
18952 emit_move_insn (destlo, operands[1]);
18956 /* vec_perm support. */
18958 struct expand_vec_perm_d
18960 rtx target, op0, op1;
18961 vec_perm_indices perm;
18962 machine_mode vmode;
18963 unsigned int vec_flags;
18964 bool one_vector_p;
18965 bool testing_p;
18968 /* Generate a variable permutation. */
18970 static void
18971 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
18973 machine_mode vmode = GET_MODE (target);
18974 bool one_vector_p = rtx_equal_p (op0, op1);
18976 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
18977 gcc_checking_assert (GET_MODE (op0) == vmode);
18978 gcc_checking_assert (GET_MODE (op1) == vmode);
18979 gcc_checking_assert (GET_MODE (sel) == vmode);
18980 gcc_checking_assert (TARGET_SIMD);
18982 if (one_vector_p)
18984 if (vmode == V8QImode)
18986 /* Expand the argument to a V16QI mode by duplicating it. */
18987 rtx pair = gen_reg_rtx (V16QImode);
18988 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
18989 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18991 else
18993 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
18996 else
18998 rtx pair;
19000 if (vmode == V8QImode)
19002 pair = gen_reg_rtx (V16QImode);
19003 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
19004 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19006 else
19008 pair = gen_reg_rtx (OImode);
19009 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
19010 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
19015 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19016 NELT is the number of elements in the vector. */
19018 void
19019 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
19020 unsigned int nelt)
19022 machine_mode vmode = GET_MODE (target);
19023 bool one_vector_p = rtx_equal_p (op0, op1);
19024 rtx mask;
19026 /* The TBL instruction does not use a modulo index, so we must take care
19027 of that ourselves. */
19028 mask = aarch64_simd_gen_const_vector_dup (vmode,
19029 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19030 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19032 /* For big-endian, we also need to reverse the index within the vector
19033 (but not which vector). */
19034 if (BYTES_BIG_ENDIAN)
19036 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19037 if (!one_vector_p)
19038 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19039 sel = expand_simple_binop (vmode, XOR, sel, mask,
19040 NULL, 0, OPTAB_LIB_WIDEN);
19042 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19045 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19047 static void
19048 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19050 emit_insn (gen_rtx_SET (target,
19051 gen_rtx_UNSPEC (GET_MODE (target),
19052 gen_rtvec (2, op0, op1), code)));
19055 /* Expand an SVE vec_perm with the given operands. */
19057 void
19058 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19060 machine_mode data_mode = GET_MODE (target);
19061 machine_mode sel_mode = GET_MODE (sel);
19062 /* Enforced by the pattern condition. */
19063 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19065 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19066 size of the two value vectors, i.e. the upper bits of the indices
19067 are effectively ignored. SVE TBL instead produces 0 for any
19068 out-of-range indices, so we need to modulo all the vec_perm indices
19069 to ensure they are all in range. */
19070 rtx sel_reg = force_reg (sel_mode, sel);
19072 /* Check if the sel only references the first values vector. */
19073 if (GET_CODE (sel) == CONST_VECTOR
19074 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19076 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19077 return;
19080 /* Check if the two values vectors are the same. */
19081 if (rtx_equal_p (op0, op1))
19083 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19084 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19085 NULL, 0, OPTAB_DIRECT);
19086 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19087 return;
19090 /* Run TBL on for each value vector and combine the results. */
19092 rtx res0 = gen_reg_rtx (data_mode);
19093 rtx res1 = gen_reg_rtx (data_mode);
19094 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19095 if (GET_CODE (sel) != CONST_VECTOR
19096 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19098 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19099 2 * nunits - 1);
19100 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19101 NULL, 0, OPTAB_DIRECT);
19103 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19104 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19105 NULL, 0, OPTAB_DIRECT);
19106 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19107 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19108 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19109 else
19110 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
19113 /* Recognize patterns suitable for the TRN instructions. */
19114 static bool
19115 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19117 HOST_WIDE_INT odd;
19118 poly_uint64 nelt = d->perm.length ();
19119 rtx out, in0, in1, x;
19120 machine_mode vmode = d->vmode;
19122 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19123 return false;
19125 /* Note that these are little-endian tests.
19126 We correct for big-endian later. */
19127 if (!d->perm[0].is_constant (&odd)
19128 || (odd != 0 && odd != 1)
19129 || !d->perm.series_p (0, 2, odd, 2)
19130 || !d->perm.series_p (1, 2, nelt + odd, 2))
19131 return false;
19133 /* Success! */
19134 if (d->testing_p)
19135 return true;
19137 in0 = d->op0;
19138 in1 = d->op1;
19139 /* We don't need a big-endian lane correction for SVE; see the comment
19140 at the head of aarch64-sve.md for details. */
19141 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19143 x = in0, in0 = in1, in1 = x;
19144 odd = !odd;
19146 out = d->target;
19148 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19149 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19150 return true;
19153 /* Recognize patterns suitable for the UZP instructions. */
19154 static bool
19155 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19157 HOST_WIDE_INT odd;
19158 rtx out, in0, in1, x;
19159 machine_mode vmode = d->vmode;
19161 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19162 return false;
19164 /* Note that these are little-endian tests.
19165 We correct for big-endian later. */
19166 if (!d->perm[0].is_constant (&odd)
19167 || (odd != 0 && odd != 1)
19168 || !d->perm.series_p (0, 1, odd, 2))
19169 return false;
19171 /* Success! */
19172 if (d->testing_p)
19173 return true;
19175 in0 = d->op0;
19176 in1 = d->op1;
19177 /* We don't need a big-endian lane correction for SVE; see the comment
19178 at the head of aarch64-sve.md for details. */
19179 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19181 x = in0, in0 = in1, in1 = x;
19182 odd = !odd;
19184 out = d->target;
19186 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19187 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19188 return true;
19191 /* Recognize patterns suitable for the ZIP instructions. */
19192 static bool
19193 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19195 unsigned int high;
19196 poly_uint64 nelt = d->perm.length ();
19197 rtx out, in0, in1, x;
19198 machine_mode vmode = d->vmode;
19200 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19201 return false;
19203 /* Note that these are little-endian tests.
19204 We correct for big-endian later. */
19205 poly_uint64 first = d->perm[0];
19206 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19207 || !d->perm.series_p (0, 2, first, 1)
19208 || !d->perm.series_p (1, 2, first + nelt, 1))
19209 return false;
19210 high = maybe_ne (first, 0U);
19212 /* Success! */
19213 if (d->testing_p)
19214 return true;
19216 in0 = d->op0;
19217 in1 = d->op1;
19218 /* We don't need a big-endian lane correction for SVE; see the comment
19219 at the head of aarch64-sve.md for details. */
19220 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19222 x = in0, in0 = in1, in1 = x;
19223 high = !high;
19225 out = d->target;
19227 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19228 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19229 return true;
19232 /* Recognize patterns for the EXT insn. */
19234 static bool
19235 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19237 HOST_WIDE_INT location;
19238 rtx offset;
19240 /* The first element always refers to the first vector.
19241 Check if the extracted indices are increasing by one. */
19242 if (d->vec_flags == VEC_SVE_PRED
19243 || !d->perm[0].is_constant (&location)
19244 || !d->perm.series_p (0, 1, location, 1))
19245 return false;
19247 /* Success! */
19248 if (d->testing_p)
19249 return true;
19251 /* The case where (location == 0) is a no-op for both big- and little-endian,
19252 and is removed by the mid-end at optimization levels -O1 and higher.
19254 We don't need a big-endian lane correction for SVE; see the comment
19255 at the head of aarch64-sve.md for details. */
19256 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19258 /* After setup, we want the high elements of the first vector (stored
19259 at the LSB end of the register), and the low elements of the second
19260 vector (stored at the MSB end of the register). So swap. */
19261 std::swap (d->op0, d->op1);
19262 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19263 to_constant () is safe since this is restricted to Advanced SIMD
19264 vectors. */
19265 location = d->perm.length ().to_constant () - location;
19268 offset = GEN_INT (location);
19269 emit_set_insn (d->target,
19270 gen_rtx_UNSPEC (d->vmode,
19271 gen_rtvec (3, d->op0, d->op1, offset),
19272 UNSPEC_EXT));
19273 return true;
19276 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19277 within each 64-bit, 32-bit or 16-bit granule. */
19279 static bool
19280 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19282 HOST_WIDE_INT diff;
19283 unsigned int i, size, unspec;
19284 machine_mode pred_mode;
19286 if (d->vec_flags == VEC_SVE_PRED
19287 || !d->one_vector_p
19288 || !d->perm[0].is_constant (&diff))
19289 return false;
19291 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19292 if (size == 8)
19294 unspec = UNSPEC_REV64;
19295 pred_mode = VNx2BImode;
19297 else if (size == 4)
19299 unspec = UNSPEC_REV32;
19300 pred_mode = VNx4BImode;
19302 else if (size == 2)
19304 unspec = UNSPEC_REV16;
19305 pred_mode = VNx8BImode;
19307 else
19308 return false;
19310 unsigned int step = diff + 1;
19311 for (i = 0; i < step; ++i)
19312 if (!d->perm.series_p (i, step, diff - i, step))
19313 return false;
19315 /* Success! */
19316 if (d->testing_p)
19317 return true;
19319 if (d->vec_flags == VEC_SVE_DATA)
19321 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19322 rtx target = gen_reg_rtx (int_mode);
19323 if (BYTES_BIG_ENDIAN)
19324 /* The act of taking a subreg between INT_MODE and d->vmode
19325 is itself a reversing operation on big-endian targets;
19326 see the comment at the head of aarch64-sve.md for details.
19327 First reinterpret OP0 as INT_MODE without using a subreg
19328 and without changing the contents. */
19329 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19330 else
19332 /* For SVE we use REV[BHW] unspecs derived from the element size
19333 of v->mode and vector modes whose elements have SIZE bytes.
19334 This ensures that the vector modes match the predicate modes. */
19335 int unspec = aarch64_sve_rev_unspec (d->vmode);
19336 rtx pred = aarch64_ptrue_reg (pred_mode);
19337 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19338 gen_lowpart (int_mode, d->op0)));
19340 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19341 return true;
19343 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19344 emit_set_insn (d->target, src);
19345 return true;
19348 /* Recognize patterns for the REV insn, which reverses elements within
19349 a full vector. */
19351 static bool
19352 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19354 poly_uint64 nelt = d->perm.length ();
19356 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19357 return false;
19359 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19360 return false;
19362 /* Success! */
19363 if (d->testing_p)
19364 return true;
19366 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19367 emit_set_insn (d->target, src);
19368 return true;
19371 static bool
19372 aarch64_evpc_dup (struct expand_vec_perm_d *d)
19374 rtx out = d->target;
19375 rtx in0;
19376 HOST_WIDE_INT elt;
19377 machine_mode vmode = d->vmode;
19378 rtx lane;
19380 if (d->vec_flags == VEC_SVE_PRED
19381 || d->perm.encoding ().encoded_nelts () != 1
19382 || !d->perm[0].is_constant (&elt))
19383 return false;
19385 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19386 return false;
19388 /* Success! */
19389 if (d->testing_p)
19390 return true;
19392 /* The generic preparation in aarch64_expand_vec_perm_const_1
19393 swaps the operand order and the permute indices if it finds
19394 d->perm[0] to be in the second operand. Thus, we can always
19395 use d->op0 and need not do any extra arithmetic to get the
19396 correct lane number. */
19397 in0 = d->op0;
19398 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19400 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19401 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19402 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19403 return true;
19406 static bool
19407 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19409 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19410 machine_mode vmode = d->vmode;
19412 /* Make sure that the indices are constant. */
19413 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19414 for (unsigned int i = 0; i < encoded_nelts; ++i)
19415 if (!d->perm[i].is_constant ())
19416 return false;
19418 if (d->testing_p)
19419 return true;
19421 /* Generic code will try constant permutation twice. Once with the
19422 original mode and again with the elements lowered to QImode.
19423 So wait and don't do the selector expansion ourselves. */
19424 if (vmode != V8QImode && vmode != V16QImode)
19425 return false;
19427 /* to_constant is safe since this routine is specific to Advanced SIMD
19428 vectors. */
19429 unsigned int nelt = d->perm.length ().to_constant ();
19430 for (unsigned int i = 0; i < nelt; ++i)
19431 /* If big-endian and two vectors we end up with a weird mixed-endian
19432 mode on NEON. Reverse the index within each word but not the word
19433 itself. to_constant is safe because we checked is_constant above. */
19434 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19435 ? d->perm[i].to_constant () ^ (nelt - 1)
19436 : d->perm[i].to_constant ());
19438 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19439 sel = force_reg (vmode, sel);
19441 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19442 return true;
19445 /* Try to implement D using an SVE TBL instruction. */
19447 static bool
19448 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19450 unsigned HOST_WIDE_INT nelt;
19452 /* Permuting two variable-length vectors could overflow the
19453 index range. */
19454 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19455 return false;
19457 if (d->testing_p)
19458 return true;
19460 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
19461 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19462 if (d->one_vector_p)
19463 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19464 else
19465 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19466 return true;
19469 /* Try to implement D using SVE SEL instruction. */
19471 static bool
19472 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19474 machine_mode vmode = d->vmode;
19475 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19477 if (d->vec_flags != VEC_SVE_DATA
19478 || unit_size > 8)
19479 return false;
19481 int n_patterns = d->perm.encoding ().npatterns ();
19482 poly_int64 vec_len = d->perm.length ();
19484 for (int i = 0; i < n_patterns; ++i)
19485 if (!known_eq (d->perm[i], i)
19486 && !known_eq (d->perm[i], vec_len + i))
19487 return false;
19489 for (int i = n_patterns; i < n_patterns * 2; i++)
19490 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19491 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19492 return false;
19494 if (d->testing_p)
19495 return true;
19497 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
19499 /* Build a predicate that is true when op0 elements should be used. */
19500 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19501 for (int i = 0; i < n_patterns * 2; i++)
19503 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19504 : CONST0_RTX (BImode);
19505 builder.quick_push (elem);
19508 rtx const_vec = builder.build ();
19509 rtx pred = force_reg (pred_mode, const_vec);
19510 /* TARGET = PRED ? OP0 : OP1. */
19511 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
19512 return true;
19515 static bool
19516 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19518 /* The pattern matching functions above are written to look for a small
19519 number to begin the sequence (0, 1, N/2). If we begin with an index
19520 from the second operand, we can swap the operands. */
19521 poly_int64 nelt = d->perm.length ();
19522 if (known_ge (d->perm[0], nelt))
19524 d->perm.rotate_inputs (1);
19525 std::swap (d->op0, d->op1);
19528 if ((d->vec_flags == VEC_ADVSIMD
19529 || d->vec_flags == VEC_SVE_DATA
19530 || d->vec_flags == VEC_SVE_PRED)
19531 && known_gt (nelt, 1))
19533 if (aarch64_evpc_rev_local (d))
19534 return true;
19535 else if (aarch64_evpc_rev_global (d))
19536 return true;
19537 else if (aarch64_evpc_ext (d))
19538 return true;
19539 else if (aarch64_evpc_dup (d))
19540 return true;
19541 else if (aarch64_evpc_zip (d))
19542 return true;
19543 else if (aarch64_evpc_uzp (d))
19544 return true;
19545 else if (aarch64_evpc_trn (d))
19546 return true;
19547 else if (aarch64_evpc_sel (d))
19548 return true;
19549 if (d->vec_flags == VEC_SVE_DATA)
19550 return aarch64_evpc_sve_tbl (d);
19551 else if (d->vec_flags == VEC_ADVSIMD)
19552 return aarch64_evpc_tbl (d);
19554 return false;
19557 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19559 static bool
19560 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19561 rtx op1, const vec_perm_indices &sel)
19563 struct expand_vec_perm_d d;
19565 /* Check whether the mask can be applied to a single vector. */
19566 if (sel.ninputs () == 1
19567 || (op0 && rtx_equal_p (op0, op1)))
19568 d.one_vector_p = true;
19569 else if (sel.all_from_input_p (0))
19571 d.one_vector_p = true;
19572 op1 = op0;
19574 else if (sel.all_from_input_p (1))
19576 d.one_vector_p = true;
19577 op0 = op1;
19579 else
19580 d.one_vector_p = false;
19582 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19583 sel.nelts_per_input ());
19584 d.vmode = vmode;
19585 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19586 d.target = target;
19587 d.op0 = op0;
19588 d.op1 = op1;
19589 d.testing_p = !target;
19591 if (!d.testing_p)
19592 return aarch64_expand_vec_perm_const_1 (&d);
19594 rtx_insn *last = get_last_insn ();
19595 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19596 gcc_assert (last == get_last_insn ());
19598 return ret;
19601 /* Generate a byte permute mask for a register of mode MODE,
19602 which has NUNITS units. */
19605 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19607 /* We have to reverse each vector because we dont have
19608 a permuted load that can reverse-load according to ABI rules. */
19609 rtx mask;
19610 rtvec v = rtvec_alloc (16);
19611 unsigned int i, j;
19612 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19614 gcc_assert (BYTES_BIG_ENDIAN);
19615 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19617 for (i = 0; i < nunits; i++)
19618 for (j = 0; j < usize; j++)
19619 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19620 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19621 return force_reg (V16QImode, mask);
19624 /* Expand an SVE integer comparison using the SVE equivalent of:
19626 (set TARGET (CODE OP0 OP1)). */
19628 void
19629 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19631 machine_mode pred_mode = GET_MODE (target);
19632 machine_mode data_mode = GET_MODE (op0);
19633 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19634 op0, op1);
19635 if (!rtx_equal_p (target, res))
19636 emit_move_insn (target, res);
19639 /* Return the UNSPEC_COND_* code for comparison CODE. */
19641 static unsigned int
19642 aarch64_unspec_cond_code (rtx_code code)
19644 switch (code)
19646 case NE:
19647 return UNSPEC_COND_FCMNE;
19648 case EQ:
19649 return UNSPEC_COND_FCMEQ;
19650 case LT:
19651 return UNSPEC_COND_FCMLT;
19652 case GT:
19653 return UNSPEC_COND_FCMGT;
19654 case LE:
19655 return UNSPEC_COND_FCMLE;
19656 case GE:
19657 return UNSPEC_COND_FCMGE;
19658 case UNORDERED:
19659 return UNSPEC_COND_FCMUO;
19660 default:
19661 gcc_unreachable ();
19665 /* Emit:
19667 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19669 where <X> is the operation associated with comparison CODE.
19670 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19672 static void
19673 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19674 bool known_ptrue_p, rtx op0, rtx op1)
19676 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19677 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19678 gen_rtvec (4, pred, flag, op0, op1),
19679 aarch64_unspec_cond_code (code));
19680 emit_set_insn (target, unspec);
19683 /* Emit the SVE equivalent of:
19685 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19686 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19687 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19689 where <Xi> is the operation associated with comparison CODEi.
19690 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19692 static void
19693 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19694 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19696 machine_mode pred_mode = GET_MODE (pred);
19697 rtx tmp1 = gen_reg_rtx (pred_mode);
19698 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19699 rtx tmp2 = gen_reg_rtx (pred_mode);
19700 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19701 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19704 /* Emit the SVE equivalent of:
19706 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19707 (set TARGET (not TMP))
19709 where <X> is the operation associated with comparison CODE.
19710 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19712 static void
19713 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19714 bool known_ptrue_p, rtx op0, rtx op1)
19716 machine_mode pred_mode = GET_MODE (pred);
19717 rtx tmp = gen_reg_rtx (pred_mode);
19718 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19719 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19722 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19724 (set TARGET (CODE OP0 OP1))
19726 If CAN_INVERT_P is true, the caller can also handle inverted results;
19727 return true if the result is in fact inverted. */
19729 bool
19730 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19731 rtx op0, rtx op1, bool can_invert_p)
19733 machine_mode pred_mode = GET_MODE (target);
19734 machine_mode data_mode = GET_MODE (op0);
19736 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19737 switch (code)
19739 case UNORDERED:
19740 /* UNORDERED has no immediate form. */
19741 op1 = force_reg (data_mode, op1);
19742 /* fall through */
19743 case LT:
19744 case LE:
19745 case GT:
19746 case GE:
19747 case EQ:
19748 case NE:
19750 /* There is native support for the comparison. */
19751 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19752 return false;
19755 case LTGT:
19756 /* This is a trapping operation (LT or GT). */
19757 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19758 return false;
19760 case UNEQ:
19761 if (!flag_trapping_math)
19763 /* This would trap for signaling NaNs. */
19764 op1 = force_reg (data_mode, op1);
19765 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19766 ptrue, true, op0, op1);
19767 return false;
19769 /* fall through */
19770 case UNLT:
19771 case UNLE:
19772 case UNGT:
19773 case UNGE:
19774 if (flag_trapping_math)
19776 /* Work out which elements are ordered. */
19777 rtx ordered = gen_reg_rtx (pred_mode);
19778 op1 = force_reg (data_mode, op1);
19779 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19780 ptrue, true, op0, op1);
19782 /* Test the opposite condition for the ordered elements,
19783 then invert the result. */
19784 if (code == UNEQ)
19785 code = NE;
19786 else
19787 code = reverse_condition_maybe_unordered (code);
19788 if (can_invert_p)
19790 aarch64_emit_sve_fp_cond (target, code,
19791 ordered, false, op0, op1);
19792 return true;
19794 aarch64_emit_sve_invert_fp_cond (target, code,
19795 ordered, false, op0, op1);
19796 return false;
19798 break;
19800 case ORDERED:
19801 /* ORDERED has no immediate form. */
19802 op1 = force_reg (data_mode, op1);
19803 break;
19805 default:
19806 gcc_unreachable ();
19809 /* There is native support for the inverse comparison. */
19810 code = reverse_condition_maybe_unordered (code);
19811 if (can_invert_p)
19813 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19814 return true;
19816 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19817 return false;
19820 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19821 of the data being selected and CMP_MODE is the mode of the values being
19822 compared. */
19824 void
19825 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19826 rtx *ops)
19828 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
19829 rtx pred = gen_reg_rtx (pred_mode);
19830 if (FLOAT_MODE_P (cmp_mode))
19832 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19833 ops[4], ops[5], true))
19834 std::swap (ops[1], ops[2]);
19836 else
19837 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19839 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19840 ops[1] = force_reg (data_mode, ops[1]);
19841 /* The "false" value can only be zero if the "true" value is a constant. */
19842 if (register_operand (ops[1], data_mode)
19843 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19844 ops[2] = force_reg (data_mode, ops[2]);
19846 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
19847 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
19850 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
19851 true. However due to issues with register allocation it is preferable
19852 to avoid tieing integer scalar and FP scalar modes. Executing integer
19853 operations in general registers is better than treating them as scalar
19854 vector operations. This reduces latency and avoids redundant int<->FP
19855 moves. So tie modes if they are either the same class, or vector modes
19856 with other vector modes, vector structs or any scalar mode. */
19858 static bool
19859 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
19861 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
19862 return true;
19864 /* We specifically want to allow elements of "structure" modes to
19865 be tieable to the structure. This more general condition allows
19866 other rarer situations too. The reason we don't extend this to
19867 predicate modes is that there are no predicate structure modes
19868 nor any specific instructions for extracting part of a predicate
19869 register. */
19870 if (aarch64_vector_data_mode_p (mode1)
19871 && aarch64_vector_data_mode_p (mode2))
19872 return true;
19874 /* Also allow any scalar modes with vectors. */
19875 if (aarch64_vector_mode_supported_p (mode1)
19876 || aarch64_vector_mode_supported_p (mode2))
19877 return true;
19879 return false;
19882 /* Return a new RTX holding the result of moving POINTER forward by
19883 AMOUNT bytes. */
19885 static rtx
19886 aarch64_move_pointer (rtx pointer, poly_int64 amount)
19888 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
19890 return adjust_automodify_address (pointer, GET_MODE (pointer),
19891 next, amount);
19894 /* Return a new RTX holding the result of moving POINTER forward by the
19895 size of the mode it points to. */
19897 static rtx
19898 aarch64_progress_pointer (rtx pointer)
19900 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
19903 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
19904 MODE bytes. */
19906 static void
19907 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
19908 machine_mode mode)
19910 rtx reg = gen_reg_rtx (mode);
19912 /* "Cast" the pointers to the correct mode. */
19913 *src = adjust_address (*src, mode, 0);
19914 *dst = adjust_address (*dst, mode, 0);
19915 /* Emit the memcpy. */
19916 emit_move_insn (reg, *src);
19917 emit_move_insn (*dst, reg);
19918 /* Move the pointers forward. */
19919 *src = aarch64_progress_pointer (*src);
19920 *dst = aarch64_progress_pointer (*dst);
19923 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
19924 we succeed, otherwise return false. */
19926 bool
19927 aarch64_expand_cpymem (rtx *operands)
19929 int n, mode_bits;
19930 rtx dst = operands[0];
19931 rtx src = operands[1];
19932 rtx base;
19933 machine_mode cur_mode = BLKmode, next_mode;
19934 bool speed_p = !optimize_function_for_size_p (cfun);
19936 /* When optimizing for size, give a better estimate of the length of a
19937 memcpy call, but use the default otherwise. Moves larger than 8 bytes
19938 will always require an even number of instructions to do now. And each
19939 operation requires both a load+store, so devide the max number by 2. */
19940 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
19942 /* We can't do anything smart if the amount to copy is not constant. */
19943 if (!CONST_INT_P (operands[2]))
19944 return false;
19946 n = INTVAL (operands[2]);
19948 /* Try to keep the number of instructions low. For all cases we will do at
19949 most two moves for the residual amount, since we'll always overlap the
19950 remainder. */
19951 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
19952 return false;
19954 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
19955 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
19957 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
19958 src = adjust_automodify_address (src, VOIDmode, base, 0);
19960 /* Convert n to bits to make the rest of the code simpler. */
19961 n = n * BITS_PER_UNIT;
19963 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
19964 larger than TImode, but we should not use them for loads/stores here. */
19965 const int copy_limit = GET_MODE_BITSIZE (TImode);
19967 while (n > 0)
19969 /* Find the largest mode in which to do the copy in without over reading
19970 or writing. */
19971 opt_scalar_int_mode mode_iter;
19972 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
19973 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
19974 cur_mode = mode_iter.require ();
19976 gcc_assert (cur_mode != BLKmode);
19978 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
19979 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
19981 n -= mode_bits;
19983 /* Do certain trailing copies as overlapping if it's going to be
19984 cheaper. i.e. less instructions to do so. For instance doing a 15
19985 byte copy it's more efficient to do two overlapping 8 byte copies than
19986 8 + 6 + 1. */
19987 if (n > 0 && n <= 8 * BITS_PER_UNIT)
19989 next_mode = smallest_mode_for_size (n, MODE_INT);
19990 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
19991 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
19992 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
19993 n = n_bits;
19997 return true;
20000 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20001 SImode stores. Handle the case when the constant has identical
20002 bottom and top halves. This is beneficial when the two stores can be
20003 merged into an STP and we avoid synthesising potentially expensive
20004 immediates twice. Return true if such a split is possible. */
20006 bool
20007 aarch64_split_dimode_const_store (rtx dst, rtx src)
20009 rtx lo = gen_lowpart (SImode, src);
20010 rtx hi = gen_highpart_mode (SImode, DImode, src);
20012 bool size_p = optimize_function_for_size_p (cfun);
20014 if (!rtx_equal_p (lo, hi))
20015 return false;
20017 unsigned int orig_cost
20018 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
20019 unsigned int lo_cost
20020 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
20022 /* We want to transform:
20023 MOV x1, 49370
20024 MOVK x1, 0x140, lsl 16
20025 MOVK x1, 0xc0da, lsl 32
20026 MOVK x1, 0x140, lsl 48
20027 STR x1, [x0]
20028 into:
20029 MOV w1, 49370
20030 MOVK w1, 0x140, lsl 16
20031 STP w1, w1, [x0]
20032 So we want to perform this only when we save two instructions
20033 or more. When optimizing for size, however, accept any code size
20034 savings we can. */
20035 if (size_p && orig_cost <= lo_cost)
20036 return false;
20038 if (!size_p
20039 && (orig_cost <= lo_cost + 1))
20040 return false;
20042 rtx mem_lo = adjust_address (dst, SImode, 0);
20043 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20044 return false;
20046 rtx tmp_reg = gen_reg_rtx (SImode);
20047 aarch64_expand_mov_immediate (tmp_reg, lo);
20048 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20049 /* Don't emit an explicit store pair as this may not be always profitable.
20050 Let the sched-fusion logic decide whether to merge them. */
20051 emit_move_insn (mem_lo, tmp_reg);
20052 emit_move_insn (mem_hi, tmp_reg);
20054 return true;
20057 /* Generate RTL for a conditional branch with rtx comparison CODE in
20058 mode CC_MODE. The destination of the unlikely conditional branch
20059 is LABEL_REF. */
20061 void
20062 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20063 rtx label_ref)
20065 rtx x;
20066 x = gen_rtx_fmt_ee (code, VOIDmode,
20067 gen_rtx_REG (cc_mode, CC_REGNUM),
20068 const0_rtx);
20070 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20071 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20072 pc_rtx);
20073 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20076 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20078 OP1 represents the TImode destination operand 1
20079 OP2 represents the TImode destination operand 2
20080 LOW_DEST represents the low half (DImode) of TImode operand 0
20081 LOW_IN1 represents the low half (DImode) of TImode operand 1
20082 LOW_IN2 represents the low half (DImode) of TImode operand 2
20083 HIGH_DEST represents the high half (DImode) of TImode operand 0
20084 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20085 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20087 void
20088 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20089 rtx *low_in1, rtx *low_in2,
20090 rtx *high_dest, rtx *high_in1,
20091 rtx *high_in2)
20093 *low_dest = gen_reg_rtx (DImode);
20094 *low_in1 = gen_lowpart (DImode, op1);
20095 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20096 subreg_lowpart_offset (DImode, TImode));
20097 *high_dest = gen_reg_rtx (DImode);
20098 *high_in1 = gen_highpart (DImode, op1);
20099 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20100 subreg_highpart_offset (DImode, TImode));
20103 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20105 This function differs from 'arch64_addti_scratch_regs' in that
20106 OP1 can be an immediate constant (zero). We must call
20107 subreg_highpart_offset with DImode and TImode arguments, otherwise
20108 VOIDmode will be used for the const_int which generates an internal
20109 error from subreg_size_highpart_offset which does not expect a size of zero.
20111 OP1 represents the TImode destination operand 1
20112 OP2 represents the TImode destination operand 2
20113 LOW_DEST represents the low half (DImode) of TImode operand 0
20114 LOW_IN1 represents the low half (DImode) of TImode operand 1
20115 LOW_IN2 represents the low half (DImode) of TImode operand 2
20116 HIGH_DEST represents the high half (DImode) of TImode operand 0
20117 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20118 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20121 void
20122 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20123 rtx *low_in1, rtx *low_in2,
20124 rtx *high_dest, rtx *high_in1,
20125 rtx *high_in2)
20127 *low_dest = gen_reg_rtx (DImode);
20128 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20129 subreg_lowpart_offset (DImode, TImode));
20131 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20132 subreg_lowpart_offset (DImode, TImode));
20133 *high_dest = gen_reg_rtx (DImode);
20135 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20136 subreg_highpart_offset (DImode, TImode));
20137 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20138 subreg_highpart_offset (DImode, TImode));
20141 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20143 OP0 represents the TImode destination operand 0
20144 LOW_DEST represents the low half (DImode) of TImode operand 0
20145 LOW_IN1 represents the low half (DImode) of TImode operand 1
20146 LOW_IN2 represents the low half (DImode) of TImode operand 2
20147 HIGH_DEST represents the high half (DImode) of TImode operand 0
20148 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20149 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20150 UNSIGNED_P is true if the operation is being performed on unsigned
20151 values. */
20152 void
20153 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20154 rtx low_in2, rtx high_dest, rtx high_in1,
20155 rtx high_in2, bool unsigned_p)
20157 if (low_in2 == const0_rtx)
20159 low_dest = low_in1;
20160 high_in2 = force_reg (DImode, high_in2);
20161 if (unsigned_p)
20162 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20163 else
20164 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20166 else
20168 if (CONST_INT_P (low_in2))
20170 high_in2 = force_reg (DImode, high_in2);
20171 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20172 GEN_INT (-INTVAL (low_in2))));
20174 else
20175 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20177 if (unsigned_p)
20178 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20179 else
20180 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20183 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20184 emit_move_insn (gen_highpart (DImode, op0), high_dest);
20188 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20190 static unsigned HOST_WIDE_INT
20191 aarch64_asan_shadow_offset (void)
20193 if (TARGET_ILP32)
20194 return (HOST_WIDE_INT_1 << 29);
20195 else
20196 return (HOST_WIDE_INT_1 << 36);
20199 static rtx
20200 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20201 int code, tree treeop0, tree treeop1)
20203 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20204 rtx op0, op1;
20205 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20206 insn_code icode;
20207 struct expand_operand ops[4];
20209 start_sequence ();
20210 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20212 op_mode = GET_MODE (op0);
20213 if (op_mode == VOIDmode)
20214 op_mode = GET_MODE (op1);
20216 switch (op_mode)
20218 case E_QImode:
20219 case E_HImode:
20220 case E_SImode:
20221 cmp_mode = SImode;
20222 icode = CODE_FOR_cmpsi;
20223 break;
20225 case E_DImode:
20226 cmp_mode = DImode;
20227 icode = CODE_FOR_cmpdi;
20228 break;
20230 case E_SFmode:
20231 cmp_mode = SFmode;
20232 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20233 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20234 break;
20236 case E_DFmode:
20237 cmp_mode = DFmode;
20238 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20239 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20240 break;
20242 default:
20243 end_sequence ();
20244 return NULL_RTX;
20247 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20248 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20249 if (!op0 || !op1)
20251 end_sequence ();
20252 return NULL_RTX;
20254 *prep_seq = get_insns ();
20255 end_sequence ();
20257 create_fixed_operand (&ops[0], op0);
20258 create_fixed_operand (&ops[1], op1);
20260 start_sequence ();
20261 if (!maybe_expand_insn (icode, 2, ops))
20263 end_sequence ();
20264 return NULL_RTX;
20266 *gen_seq = get_insns ();
20267 end_sequence ();
20269 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20270 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20273 static rtx
20274 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20275 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20277 rtx op0, op1, target;
20278 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20279 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20280 insn_code icode;
20281 struct expand_operand ops[6];
20282 int aarch64_cond;
20284 push_to_sequence (*prep_seq);
20285 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20287 op_mode = GET_MODE (op0);
20288 if (op_mode == VOIDmode)
20289 op_mode = GET_MODE (op1);
20291 switch (op_mode)
20293 case E_QImode:
20294 case E_HImode:
20295 case E_SImode:
20296 cmp_mode = SImode;
20297 break;
20299 case E_DImode:
20300 cmp_mode = DImode;
20301 break;
20303 case E_SFmode:
20304 cmp_mode = SFmode;
20305 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20306 break;
20308 case E_DFmode:
20309 cmp_mode = DFmode;
20310 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20311 break;
20313 default:
20314 end_sequence ();
20315 return NULL_RTX;
20318 icode = code_for_ccmp (cc_mode, cmp_mode);
20320 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20321 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20322 if (!op0 || !op1)
20324 end_sequence ();
20325 return NULL_RTX;
20327 *prep_seq = get_insns ();
20328 end_sequence ();
20330 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20331 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20333 if (bit_code != AND)
20335 /* Treat the ccmp patterns as canonical and use them where possible,
20336 but fall back to ccmp_rev patterns if there's no other option. */
20337 rtx_code prev_code = GET_CODE (prev);
20338 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
20339 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
20340 && !(prev_code == EQ
20341 || prev_code == NE
20342 || prev_code == ORDERED
20343 || prev_code == UNORDERED))
20344 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
20345 else
20347 rtx_code code = reverse_condition (prev_code);
20348 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
20350 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20353 create_fixed_operand (&ops[0], XEXP (prev, 0));
20354 create_fixed_operand (&ops[1], target);
20355 create_fixed_operand (&ops[2], op0);
20356 create_fixed_operand (&ops[3], op1);
20357 create_fixed_operand (&ops[4], prev);
20358 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20360 push_to_sequence (*gen_seq);
20361 if (!maybe_expand_insn (icode, 6, ops))
20363 end_sequence ();
20364 return NULL_RTX;
20367 *gen_seq = get_insns ();
20368 end_sequence ();
20370 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
20373 #undef TARGET_GEN_CCMP_FIRST
20374 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20376 #undef TARGET_GEN_CCMP_NEXT
20377 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
20379 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
20380 instruction fusion of some sort. */
20382 static bool
20383 aarch64_macro_fusion_p (void)
20385 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20389 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20390 should be kept together during scheduling. */
20392 static bool
20393 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20395 rtx set_dest;
20396 rtx prev_set = single_set (prev);
20397 rtx curr_set = single_set (curr);
20398 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
20399 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20401 if (!aarch64_macro_fusion_p ())
20402 return false;
20404 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20406 /* We are trying to match:
20407 prev (mov) == (set (reg r0) (const_int imm16))
20408 curr (movk) == (set (zero_extract (reg r0)
20409 (const_int 16)
20410 (const_int 16))
20411 (const_int imm16_1)) */
20413 set_dest = SET_DEST (curr_set);
20415 if (GET_CODE (set_dest) == ZERO_EXTRACT
20416 && CONST_INT_P (SET_SRC (curr_set))
20417 && CONST_INT_P (SET_SRC (prev_set))
20418 && CONST_INT_P (XEXP (set_dest, 2))
20419 && INTVAL (XEXP (set_dest, 2)) == 16
20420 && REG_P (XEXP (set_dest, 0))
20421 && REG_P (SET_DEST (prev_set))
20422 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20424 return true;
20428 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20431 /* We're trying to match:
20432 prev (adrp) == (set (reg r1)
20433 (high (symbol_ref ("SYM"))))
20434 curr (add) == (set (reg r0)
20435 (lo_sum (reg r1)
20436 (symbol_ref ("SYM"))))
20437 Note that r0 need not necessarily be the same as r1, especially
20438 during pre-regalloc scheduling. */
20440 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20441 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20443 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20444 && REG_P (XEXP (SET_SRC (curr_set), 0))
20445 && REGNO (XEXP (SET_SRC (curr_set), 0))
20446 == REGNO (SET_DEST (prev_set))
20447 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20448 XEXP (SET_SRC (curr_set), 1)))
20449 return true;
20453 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20456 /* We're trying to match:
20457 prev (movk) == (set (zero_extract (reg r0)
20458 (const_int 16)
20459 (const_int 32))
20460 (const_int imm16_1))
20461 curr (movk) == (set (zero_extract (reg r0)
20462 (const_int 16)
20463 (const_int 48))
20464 (const_int imm16_2)) */
20466 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20467 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20468 && REG_P (XEXP (SET_DEST (prev_set), 0))
20469 && REG_P (XEXP (SET_DEST (curr_set), 0))
20470 && REGNO (XEXP (SET_DEST (prev_set), 0))
20471 == REGNO (XEXP (SET_DEST (curr_set), 0))
20472 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20473 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20474 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20475 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20476 && CONST_INT_P (SET_SRC (prev_set))
20477 && CONST_INT_P (SET_SRC (curr_set)))
20478 return true;
20481 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20483 /* We're trying to match:
20484 prev (adrp) == (set (reg r0)
20485 (high (symbol_ref ("SYM"))))
20486 curr (ldr) == (set (reg r1)
20487 (mem (lo_sum (reg r0)
20488 (symbol_ref ("SYM")))))
20490 curr (ldr) == (set (reg r1)
20491 (zero_extend (mem
20492 (lo_sum (reg r0)
20493 (symbol_ref ("SYM")))))) */
20494 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20495 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20497 rtx curr_src = SET_SRC (curr_set);
20499 if (GET_CODE (curr_src) == ZERO_EXTEND)
20500 curr_src = XEXP (curr_src, 0);
20502 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20503 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20504 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20505 == REGNO (SET_DEST (prev_set))
20506 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20507 XEXP (SET_SRC (prev_set), 0)))
20508 return true;
20512 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20513 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20514 && prev_set && curr_set && any_condjump_p (curr)
20515 && GET_CODE (SET_SRC (prev_set)) == COMPARE
20516 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
20517 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
20518 return true;
20520 /* Fuse flag-setting ALU instructions and conditional branch. */
20521 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20522 && any_condjump_p (curr))
20524 unsigned int condreg1, condreg2;
20525 rtx cc_reg_1;
20526 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20527 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20529 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20530 && prev
20531 && modified_in_p (cc_reg_1, prev))
20533 enum attr_type prev_type = get_attr_type (prev);
20535 /* FIXME: this misses some which is considered simple arthematic
20536 instructions for ThunderX. Simple shifts are missed here. */
20537 if (prev_type == TYPE_ALUS_SREG
20538 || prev_type == TYPE_ALUS_IMM
20539 || prev_type == TYPE_LOGICS_REG
20540 || prev_type == TYPE_LOGICS_IMM)
20541 return true;
20545 /* Fuse ALU instructions and CBZ/CBNZ. */
20546 if (prev_set
20547 && curr_set
20548 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
20549 && any_condjump_p (curr))
20551 /* We're trying to match:
20552 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20553 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20554 (const_int 0))
20555 (label_ref ("SYM"))
20556 (pc)) */
20557 if (SET_DEST (curr_set) == (pc_rtx)
20558 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20559 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20560 && REG_P (SET_DEST (prev_set))
20561 && REGNO (SET_DEST (prev_set))
20562 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20564 /* Fuse ALU operations followed by conditional branch instruction. */
20565 switch (get_attr_type (prev))
20567 case TYPE_ALU_IMM:
20568 case TYPE_ALU_SREG:
20569 case TYPE_ADC_REG:
20570 case TYPE_ADC_IMM:
20571 case TYPE_ADCS_REG:
20572 case TYPE_ADCS_IMM:
20573 case TYPE_LOGIC_REG:
20574 case TYPE_LOGIC_IMM:
20575 case TYPE_CSEL:
20576 case TYPE_ADR:
20577 case TYPE_MOV_IMM:
20578 case TYPE_SHIFT_REG:
20579 case TYPE_SHIFT_IMM:
20580 case TYPE_BFM:
20581 case TYPE_RBIT:
20582 case TYPE_REV:
20583 case TYPE_EXTEND:
20584 return true;
20586 default:;
20591 return false;
20594 /* Return true iff the instruction fusion described by OP is enabled. */
20596 bool
20597 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20599 return (aarch64_tune_params.fusible_ops & op) != 0;
20602 /* If MEM is in the form of [base+offset], extract the two parts
20603 of address and set to BASE and OFFSET, otherwise return false
20604 after clearing BASE and OFFSET. */
20606 bool
20607 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20609 rtx addr;
20611 gcc_assert (MEM_P (mem));
20613 addr = XEXP (mem, 0);
20615 if (REG_P (addr))
20617 *base = addr;
20618 *offset = const0_rtx;
20619 return true;
20622 if (GET_CODE (addr) == PLUS
20623 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20625 *base = XEXP (addr, 0);
20626 *offset = XEXP (addr, 1);
20627 return true;
20630 *base = NULL_RTX;
20631 *offset = NULL_RTX;
20633 return false;
20636 /* Types for scheduling fusion. */
20637 enum sched_fusion_type
20639 SCHED_FUSION_NONE = 0,
20640 SCHED_FUSION_LD_SIGN_EXTEND,
20641 SCHED_FUSION_LD_ZERO_EXTEND,
20642 SCHED_FUSION_LD,
20643 SCHED_FUSION_ST,
20644 SCHED_FUSION_NUM
20647 /* If INSN is a load or store of address in the form of [base+offset],
20648 extract the two parts and set to BASE and OFFSET. Return scheduling
20649 fusion type this INSN is. */
20651 static enum sched_fusion_type
20652 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20654 rtx x, dest, src;
20655 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20657 gcc_assert (INSN_P (insn));
20658 x = PATTERN (insn);
20659 if (GET_CODE (x) != SET)
20660 return SCHED_FUSION_NONE;
20662 src = SET_SRC (x);
20663 dest = SET_DEST (x);
20665 machine_mode dest_mode = GET_MODE (dest);
20667 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20668 return SCHED_FUSION_NONE;
20670 if (GET_CODE (src) == SIGN_EXTEND)
20672 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20673 src = XEXP (src, 0);
20674 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20675 return SCHED_FUSION_NONE;
20677 else if (GET_CODE (src) == ZERO_EXTEND)
20679 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20680 src = XEXP (src, 0);
20681 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20682 return SCHED_FUSION_NONE;
20685 if (GET_CODE (src) == MEM && REG_P (dest))
20686 extract_base_offset_in_addr (src, base, offset);
20687 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20689 fusion = SCHED_FUSION_ST;
20690 extract_base_offset_in_addr (dest, base, offset);
20692 else
20693 return SCHED_FUSION_NONE;
20695 if (*base == NULL_RTX || *offset == NULL_RTX)
20696 fusion = SCHED_FUSION_NONE;
20698 return fusion;
20701 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20703 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
20704 and PRI are only calculated for these instructions. For other instruction,
20705 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
20706 type instruction fusion can be added by returning different priorities.
20708 It's important that irrelevant instructions get the largest FUSION_PRI. */
20710 static void
20711 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20712 int *fusion_pri, int *pri)
20714 int tmp, off_val;
20715 rtx base, offset;
20716 enum sched_fusion_type fusion;
20718 gcc_assert (INSN_P (insn));
20720 tmp = max_pri - 1;
20721 fusion = fusion_load_store (insn, &base, &offset);
20722 if (fusion == SCHED_FUSION_NONE)
20724 *pri = tmp;
20725 *fusion_pri = tmp;
20726 return;
20729 /* Set FUSION_PRI according to fusion type and base register. */
20730 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20732 /* Calculate PRI. */
20733 tmp /= 2;
20735 /* INSN with smaller offset goes first. */
20736 off_val = (int)(INTVAL (offset));
20737 if (off_val >= 0)
20738 tmp -= (off_val & 0xfffff);
20739 else
20740 tmp += ((- off_val) & 0xfffff);
20742 *pri = tmp;
20743 return;
20746 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20747 Adjust priority of sha1h instructions so they are scheduled before
20748 other SHA1 instructions. */
20750 static int
20751 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20753 rtx x = PATTERN (insn);
20755 if (GET_CODE (x) == SET)
20757 x = SET_SRC (x);
20759 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20760 return priority + 10;
20763 return priority;
20766 /* Given OPERANDS of consecutive load/store, check if we can merge
20767 them into ldp/stp. LOAD is true if they are load instructions.
20768 MODE is the mode of memory operands. */
20770 bool
20771 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20772 machine_mode mode)
20774 HOST_WIDE_INT offval_1, offval_2, msize;
20775 enum reg_class rclass_1, rclass_2;
20776 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20778 if (load)
20780 mem_1 = operands[1];
20781 mem_2 = operands[3];
20782 reg_1 = operands[0];
20783 reg_2 = operands[2];
20784 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20785 if (REGNO (reg_1) == REGNO (reg_2))
20786 return false;
20788 else
20790 mem_1 = operands[0];
20791 mem_2 = operands[2];
20792 reg_1 = operands[1];
20793 reg_2 = operands[3];
20796 /* The mems cannot be volatile. */
20797 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20798 return false;
20800 /* If we have SImode and slow unaligned ldp,
20801 check the alignment to be at least 8 byte. */
20802 if (mode == SImode
20803 && (aarch64_tune_params.extra_tuning_flags
20804 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20805 && !optimize_size
20806 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20807 return false;
20809 /* Check if the addresses are in the form of [base+offset]. */
20810 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20811 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20812 return false;
20813 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20814 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20815 return false;
20817 /* Check if the bases are same. */
20818 if (!rtx_equal_p (base_1, base_2))
20819 return false;
20821 /* The operands must be of the same size. */
20822 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20823 GET_MODE_SIZE (GET_MODE (mem_2))));
20825 offval_1 = INTVAL (offset_1);
20826 offval_2 = INTVAL (offset_2);
20827 /* We should only be trying this for fixed-sized modes. There is no
20828 SVE LDP/STP instruction. */
20829 msize = GET_MODE_SIZE (mode).to_constant ();
20830 /* Check if the offsets are consecutive. */
20831 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20832 return false;
20834 /* Check if the addresses are clobbered by load. */
20835 if (load)
20837 if (reg_mentioned_p (reg_1, mem_1))
20838 return false;
20840 /* In increasing order, the last load can clobber the address. */
20841 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20842 return false;
20845 /* One of the memory accesses must be a mempair operand.
20846 If it is not the first one, they need to be swapped by the
20847 peephole. */
20848 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
20849 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
20850 return false;
20852 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
20853 rclass_1 = FP_REGS;
20854 else
20855 rclass_1 = GENERAL_REGS;
20857 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
20858 rclass_2 = FP_REGS;
20859 else
20860 rclass_2 = GENERAL_REGS;
20862 /* Check if the registers are of same class. */
20863 if (rclass_1 != rclass_2)
20864 return false;
20866 return true;
20869 /* Given OPERANDS of consecutive load/store that can be merged,
20870 swap them if they are not in ascending order. */
20871 void
20872 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
20874 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
20875 HOST_WIDE_INT offval_1, offval_2;
20877 if (load)
20879 mem_1 = operands[1];
20880 mem_2 = operands[3];
20882 else
20884 mem_1 = operands[0];
20885 mem_2 = operands[2];
20888 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20889 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20891 offval_1 = INTVAL (offset_1);
20892 offval_2 = INTVAL (offset_2);
20894 if (offval_1 > offval_2)
20896 /* Irrespective of whether this is a load or a store,
20897 we do the same swap. */
20898 std::swap (operands[0], operands[2]);
20899 std::swap (operands[1], operands[3]);
20903 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
20904 comparison between the two. */
20906 aarch64_host_wide_int_compare (const void *x, const void *y)
20908 return wi::cmps (* ((const HOST_WIDE_INT *) x),
20909 * ((const HOST_WIDE_INT *) y));
20912 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
20913 other pointing to a REG rtx containing an offset, compare the offsets
20914 of the two pairs.
20916 Return:
20918 1 iff offset (X) > offset (Y)
20919 0 iff offset (X) == offset (Y)
20920 -1 iff offset (X) < offset (Y) */
20922 aarch64_ldrstr_offset_compare (const void *x, const void *y)
20924 const rtx * operands_1 = (const rtx *) x;
20925 const rtx * operands_2 = (const rtx *) y;
20926 rtx mem_1, mem_2, base, offset_1, offset_2;
20928 if (MEM_P (operands_1[0]))
20929 mem_1 = operands_1[0];
20930 else
20931 mem_1 = operands_1[1];
20933 if (MEM_P (operands_2[0]))
20934 mem_2 = operands_2[0];
20935 else
20936 mem_2 = operands_2[1];
20938 /* Extract the offsets. */
20939 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20940 extract_base_offset_in_addr (mem_2, &base, &offset_2);
20942 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
20944 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
20947 /* Given OPERANDS of consecutive load/store, check if we can merge
20948 them into ldp/stp by adjusting the offset. LOAD is true if they
20949 are load instructions. MODE is the mode of memory operands.
20951 Given below consecutive stores:
20953 str w1, [xb, 0x100]
20954 str w1, [xb, 0x104]
20955 str w1, [xb, 0x108]
20956 str w1, [xb, 0x10c]
20958 Though the offsets are out of the range supported by stp, we can
20959 still pair them after adjusting the offset, like:
20961 add scratch, xb, 0x100
20962 stp w1, w1, [scratch]
20963 stp w1, w1, [scratch, 0x8]
20965 The peephole patterns detecting this opportunity should guarantee
20966 the scratch register is avaliable. */
20968 bool
20969 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
20970 scalar_mode mode)
20972 const int num_insns = 4;
20973 enum reg_class rclass;
20974 HOST_WIDE_INT offvals[num_insns], msize;
20975 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
20977 if (load)
20979 for (int i = 0; i < num_insns; i++)
20981 reg[i] = operands[2 * i];
20982 mem[i] = operands[2 * i + 1];
20984 gcc_assert (REG_P (reg[i]));
20987 /* Do not attempt to merge the loads if the loads clobber each other. */
20988 for (int i = 0; i < 8; i += 2)
20989 for (int j = i + 2; j < 8; j += 2)
20990 if (reg_overlap_mentioned_p (operands[i], operands[j]))
20991 return false;
20993 else
20994 for (int i = 0; i < num_insns; i++)
20996 mem[i] = operands[2 * i];
20997 reg[i] = operands[2 * i + 1];
21000 /* Skip if memory operand is by itself valid for ldp/stp. */
21001 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
21002 return false;
21004 for (int i = 0; i < num_insns; i++)
21006 /* The mems cannot be volatile. */
21007 if (MEM_VOLATILE_P (mem[i]))
21008 return false;
21010 /* Check if the addresses are in the form of [base+offset]. */
21011 extract_base_offset_in_addr (mem[i], base + i, offset + i);
21012 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
21013 return false;
21016 /* Check if the registers are of same class. */
21017 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
21018 ? FP_REGS : GENERAL_REGS;
21020 for (int i = 1; i < num_insns; i++)
21021 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
21023 if (rclass != FP_REGS)
21024 return false;
21026 else
21028 if (rclass != GENERAL_REGS)
21029 return false;
21032 /* Only the last register in the order in which they occur
21033 may be clobbered by the load. */
21034 if (rclass == GENERAL_REGS && load)
21035 for (int i = 0; i < num_insns - 1; i++)
21036 if (reg_mentioned_p (reg[i], mem[i]))
21037 return false;
21039 /* Check if the bases are same. */
21040 for (int i = 0; i < num_insns - 1; i++)
21041 if (!rtx_equal_p (base[i], base[i + 1]))
21042 return false;
21044 for (int i = 0; i < num_insns; i++)
21045 offvals[i] = INTVAL (offset[i]);
21047 msize = GET_MODE_SIZE (mode);
21049 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21050 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21051 aarch64_host_wide_int_compare);
21053 if (!(offvals[1] == offvals[0] + msize
21054 && offvals[3] == offvals[2] + msize))
21055 return false;
21057 /* Check that offsets are within range of each other. The ldp/stp
21058 instructions have 7 bit immediate offsets, so use 0x80. */
21059 if (offvals[2] - offvals[0] >= msize * 0x80)
21060 return false;
21062 /* The offsets must be aligned with respect to each other. */
21063 if (offvals[0] % msize != offvals[2] % msize)
21064 return false;
21066 /* If we have SImode and slow unaligned ldp,
21067 check the alignment to be at least 8 byte. */
21068 if (mode == SImode
21069 && (aarch64_tune_params.extra_tuning_flags
21070 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21071 && !optimize_size
21072 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21073 return false;
21075 return true;
21078 /* Given OPERANDS of consecutive load/store, this function pairs them
21079 into LDP/STP after adjusting the offset. It depends on the fact
21080 that the operands can be sorted so the offsets are correct for STP.
21081 MODE is the mode of memory operands. CODE is the rtl operator
21082 which should be applied to all memory operands, it's SIGN_EXTEND,
21083 ZERO_EXTEND or UNKNOWN. */
21085 bool
21086 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21087 scalar_mode mode, RTX_CODE code)
21089 rtx base, offset_1, offset_3, t1, t2;
21090 rtx mem_1, mem_2, mem_3, mem_4;
21091 rtx temp_operands[8];
21092 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21093 stp_off_upper_limit, stp_off_lower_limit, msize;
21095 /* We make changes on a copy as we may still bail out. */
21096 for (int i = 0; i < 8; i ++)
21097 temp_operands[i] = operands[i];
21099 /* Sort the operands. */
21100 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21102 /* Copy the memory operands so that if we have to bail for some
21103 reason the original addresses are unchanged. */
21104 if (load)
21106 mem_1 = copy_rtx (temp_operands[1]);
21107 mem_2 = copy_rtx (temp_operands[3]);
21108 mem_3 = copy_rtx (temp_operands[5]);
21109 mem_4 = copy_rtx (temp_operands[7]);
21111 else
21113 mem_1 = copy_rtx (temp_operands[0]);
21114 mem_2 = copy_rtx (temp_operands[2]);
21115 mem_3 = copy_rtx (temp_operands[4]);
21116 mem_4 = copy_rtx (temp_operands[6]);
21117 gcc_assert (code == UNKNOWN);
21120 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21121 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21122 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21123 && offset_3 != NULL_RTX);
21125 /* Adjust offset so it can fit in LDP/STP instruction. */
21126 msize = GET_MODE_SIZE (mode);
21127 stp_off_upper_limit = msize * (0x40 - 1);
21128 stp_off_lower_limit = - msize * 0x40;
21130 off_val_1 = INTVAL (offset_1);
21131 off_val_3 = INTVAL (offset_3);
21133 /* The base offset is optimally half way between the two STP/LDP offsets. */
21134 if (msize <= 4)
21135 base_off = (off_val_1 + off_val_3) / 2;
21136 else
21137 /* However, due to issues with negative LDP/STP offset generation for
21138 larger modes, for DF, DI and vector modes. we must not use negative
21139 addresses smaller than 9 signed unadjusted bits can store. This
21140 provides the most range in this case. */
21141 base_off = off_val_1;
21143 /* Adjust the base so that it is aligned with the addresses but still
21144 optimal. */
21145 if (base_off % msize != off_val_1 % msize)
21146 /* Fix the offset, bearing in mind we want to make it bigger not
21147 smaller. */
21148 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21149 else if (msize <= 4)
21150 /* The negative range of LDP/STP is one larger than the positive range. */
21151 base_off += msize;
21153 /* Check if base offset is too big or too small. We can attempt to resolve
21154 this issue by setting it to the maximum value and seeing if the offsets
21155 still fit. */
21156 if (base_off >= 0x1000)
21158 base_off = 0x1000 - 1;
21159 /* We must still make sure that the base offset is aligned with respect
21160 to the address. But it may may not be made any bigger. */
21161 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21164 /* Likewise for the case where the base is too small. */
21165 if (base_off <= -0x1000)
21167 base_off = -0x1000 + 1;
21168 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21171 /* Offset of the first STP/LDP. */
21172 new_off_1 = off_val_1 - base_off;
21174 /* Offset of the second STP/LDP. */
21175 new_off_3 = off_val_3 - base_off;
21177 /* The offsets must be within the range of the LDP/STP instructions. */
21178 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21179 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21180 return false;
21182 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21183 new_off_1), true);
21184 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21185 new_off_1 + msize), true);
21186 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21187 new_off_3), true);
21188 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21189 new_off_3 + msize), true);
21191 if (!aarch64_mem_pair_operand (mem_1, mode)
21192 || !aarch64_mem_pair_operand (mem_3, mode))
21193 return false;
21195 if (code == ZERO_EXTEND)
21197 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21198 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21199 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21200 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21202 else if (code == SIGN_EXTEND)
21204 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21205 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21206 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21207 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21210 if (load)
21212 operands[0] = temp_operands[0];
21213 operands[1] = mem_1;
21214 operands[2] = temp_operands[2];
21215 operands[3] = mem_2;
21216 operands[4] = temp_operands[4];
21217 operands[5] = mem_3;
21218 operands[6] = temp_operands[6];
21219 operands[7] = mem_4;
21221 else
21223 operands[0] = mem_1;
21224 operands[1] = temp_operands[1];
21225 operands[2] = mem_2;
21226 operands[3] = temp_operands[3];
21227 operands[4] = mem_3;
21228 operands[5] = temp_operands[5];
21229 operands[6] = mem_4;
21230 operands[7] = temp_operands[7];
21233 /* Emit adjusting instruction. */
21234 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21235 /* Emit ldp/stp instructions. */
21236 t1 = gen_rtx_SET (operands[0], operands[1]);
21237 t2 = gen_rtx_SET (operands[2], operands[3]);
21238 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21239 t1 = gen_rtx_SET (operands[4], operands[5]);
21240 t2 = gen_rtx_SET (operands[6], operands[7]);
21241 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21242 return true;
21245 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21246 it isn't worth branching around empty masked ops (including masked
21247 stores). */
21249 static bool
21250 aarch64_empty_mask_is_expensive (unsigned)
21252 return false;
21255 /* Return 1 if pseudo register should be created and used to hold
21256 GOT address for PIC code. */
21258 bool
21259 aarch64_use_pseudo_pic_reg (void)
21261 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21264 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21266 static int
21267 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21269 switch (XINT (x, 1))
21271 case UNSPEC_GOTSMALLPIC:
21272 case UNSPEC_GOTSMALLPIC28K:
21273 case UNSPEC_GOTTINYPIC:
21274 return 0;
21275 default:
21276 break;
21279 return default_unspec_may_trap_p (x, flags);
21283 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
21284 return the log2 of that value. Otherwise return -1. */
21287 aarch64_fpconst_pow_of_2 (rtx x)
21289 const REAL_VALUE_TYPE *r;
21291 if (!CONST_DOUBLE_P (x))
21292 return -1;
21294 r = CONST_DOUBLE_REAL_VALUE (x);
21296 if (REAL_VALUE_NEGATIVE (*r)
21297 || REAL_VALUE_ISNAN (*r)
21298 || REAL_VALUE_ISINF (*r)
21299 || !real_isinteger (r, DFmode))
21300 return -1;
21302 return exact_log2 (real_to_integer (r));
21305 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21306 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
21307 return n. Otherwise return -1. */
21310 aarch64_fpconst_pow2_recip (rtx x)
21312 REAL_VALUE_TYPE r0;
21314 if (!CONST_DOUBLE_P (x))
21315 return -1;
21317 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21318 if (exact_real_inverse (DFmode, &r0)
21319 && !REAL_VALUE_NEGATIVE (r0))
21321 int ret = exact_log2 (real_to_integer (&r0));
21322 if (ret >= 1 && ret <= 32)
21323 return ret;
21325 return -1;
21328 /* If X is a vector of equal CONST_DOUBLE values and that value is
21329 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21332 aarch64_vec_fpconst_pow_of_2 (rtx x)
21334 int nelts;
21335 if (GET_CODE (x) != CONST_VECTOR
21336 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21337 return -1;
21339 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21340 return -1;
21342 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21343 if (firstval <= 0)
21344 return -1;
21346 for (int i = 1; i < nelts; i++)
21347 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21348 return -1;
21350 return firstval;
21353 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21354 to float.
21356 __fp16 always promotes through this hook.
21357 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21358 through the generic excess precision logic rather than here. */
21360 static tree
21361 aarch64_promoted_type (const_tree t)
21363 if (SCALAR_FLOAT_TYPE_P (t)
21364 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21365 return float_type_node;
21367 return NULL_TREE;
21370 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21372 static bool
21373 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
21374 optimization_type opt_type)
21376 switch (op)
21378 case rsqrt_optab:
21379 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
21381 default:
21382 return true;
21386 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21388 static unsigned int
21389 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
21390 int *offset)
21392 /* Polynomial invariant 1 == (VG / 2) - 1. */
21393 gcc_assert (i == 1);
21394 *factor = 2;
21395 *offset = 1;
21396 return AARCH64_DWARF_VG;
21399 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
21400 if MODE is HFmode, and punt to the generic implementation otherwise. */
21402 static bool
21403 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21405 return (mode == HFmode
21406 ? true
21407 : default_libgcc_floating_mode_supported_p (mode));
21410 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21411 if MODE is HFmode, and punt to the generic implementation otherwise. */
21413 static bool
21414 aarch64_scalar_mode_supported_p (scalar_mode mode)
21416 return (mode == HFmode
21417 ? true
21418 : default_scalar_mode_supported_p (mode));
21421 /* Set the value of FLT_EVAL_METHOD.
21422 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21424 0: evaluate all operations and constants, whose semantic type has at
21425 most the range and precision of type float, to the range and
21426 precision of float; evaluate all other operations and constants to
21427 the range and precision of the semantic type;
21429 N, where _FloatN is a supported interchange floating type
21430 evaluate all operations and constants, whose semantic type has at
21431 most the range and precision of _FloatN type, to the range and
21432 precision of the _FloatN type; evaluate all other operations and
21433 constants to the range and precision of the semantic type;
21435 If we have the ARMv8.2-A extensions then we support _Float16 in native
21436 precision, so we should set this to 16. Otherwise, we support the type,
21437 but want to evaluate expressions in float precision, so set this to
21438 0. */
21440 static enum flt_eval_method
21441 aarch64_excess_precision (enum excess_precision_type type)
21443 switch (type)
21445 case EXCESS_PRECISION_TYPE_FAST:
21446 case EXCESS_PRECISION_TYPE_STANDARD:
21447 /* We can calculate either in 16-bit range and precision or
21448 32-bit range and precision. Make that decision based on whether
21449 we have native support for the ARMv8.2-A 16-bit floating-point
21450 instructions or not. */
21451 return (TARGET_FP_F16INST
21452 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21453 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21454 case EXCESS_PRECISION_TYPE_IMPLICIT:
21455 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21456 default:
21457 gcc_unreachable ();
21459 return FLT_EVAL_METHOD_UNPREDICTABLE;
21462 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21463 scheduled for speculative execution. Reject the long-running division
21464 and square-root instructions. */
21466 static bool
21467 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21469 switch (get_attr_type (insn))
21471 case TYPE_SDIV:
21472 case TYPE_UDIV:
21473 case TYPE_FDIVS:
21474 case TYPE_FDIVD:
21475 case TYPE_FSQRTS:
21476 case TYPE_FSQRTD:
21477 case TYPE_NEON_FP_SQRT_S:
21478 case TYPE_NEON_FP_SQRT_D:
21479 case TYPE_NEON_FP_SQRT_S_Q:
21480 case TYPE_NEON_FP_SQRT_D_Q:
21481 case TYPE_NEON_FP_DIV_S:
21482 case TYPE_NEON_FP_DIV_D:
21483 case TYPE_NEON_FP_DIV_S_Q:
21484 case TYPE_NEON_FP_DIV_D_Q:
21485 return false;
21486 default:
21487 return true;
21491 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21493 static int
21494 aarch64_compute_pressure_classes (reg_class *classes)
21496 int i = 0;
21497 classes[i++] = GENERAL_REGS;
21498 classes[i++] = FP_REGS;
21499 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21500 registers need to go in PR_LO_REGS at some point during their
21501 lifetime. Splitting it into two halves has the effect of making
21502 all predicates count against PR_LO_REGS, so that we try whenever
21503 possible to restrict the number of live predicates to 8. This
21504 greatly reduces the amount of spilling in certain loops. */
21505 classes[i++] = PR_LO_REGS;
21506 classes[i++] = PR_HI_REGS;
21507 return i;
21510 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21512 static bool
21513 aarch64_can_change_mode_class (machine_mode from,
21514 machine_mode to, reg_class_t)
21516 unsigned int from_flags = aarch64_classify_vector_mode (from);
21517 unsigned int to_flags = aarch64_classify_vector_mode (to);
21519 bool from_sve_p = (from_flags & VEC_ANY_SVE);
21520 bool to_sve_p = (to_flags & VEC_ANY_SVE);
21522 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
21523 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
21525 /* Don't allow changes between partial SVE modes and other modes.
21526 The contents of partial SVE modes are distributed evenly across
21527 the register, whereas GCC expects them to be clustered together. */
21528 if (from_partial_sve_p != to_partial_sve_p)
21529 return false;
21531 /* Similarly reject changes between partial SVE modes that have
21532 different patterns of significant and insignificant bits. */
21533 if (from_partial_sve_p
21534 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
21535 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
21536 return false;
21538 if (BYTES_BIG_ENDIAN)
21540 /* Don't allow changes between SVE data modes and non-SVE modes.
21541 See the comment at the head of aarch64-sve.md for details. */
21542 if (from_sve_p != to_sve_p)
21543 return false;
21545 /* Don't allow changes in element size: lane 0 of the new vector
21546 would not then be lane 0 of the old vector. See the comment
21547 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21548 description.
21550 In the worst case, this forces a register to be spilled in
21551 one mode and reloaded in the other, which handles the
21552 endianness correctly. */
21553 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21554 return false;
21556 return true;
21559 /* Implement TARGET_EARLY_REMAT_MODES. */
21561 static void
21562 aarch64_select_early_remat_modes (sbitmap modes)
21564 /* SVE values are not normally live across a call, so it should be
21565 worth doing early rematerialization even in VL-specific mode. */
21566 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21567 if (aarch64_sve_mode_p ((machine_mode) i))
21568 bitmap_set_bit (modes, i);
21571 /* Override the default target speculation_safe_value. */
21572 static rtx
21573 aarch64_speculation_safe_value (machine_mode mode,
21574 rtx result, rtx val, rtx failval)
21576 /* Maybe we should warn if falling back to hard barriers. They are
21577 likely to be noticably more expensive than the alternative below. */
21578 if (!aarch64_track_speculation)
21579 return default_speculation_safe_value (mode, result, val, failval);
21581 if (!REG_P (val))
21582 val = copy_to_mode_reg (mode, val);
21584 if (!aarch64_reg_or_zero (failval, mode))
21585 failval = copy_to_mode_reg (mode, failval);
21587 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21588 return result;
21591 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21592 Look into the tuning structure for an estimate.
21593 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21594 Advanced SIMD 128 bits. */
21596 static HOST_WIDE_INT
21597 aarch64_estimated_poly_value (poly_int64 val)
21599 enum aarch64_sve_vector_bits_enum width_source
21600 = aarch64_tune_params.sve_width;
21602 /* If we still don't have an estimate, use the default. */
21603 if (width_source == SVE_SCALABLE)
21604 return default_estimated_poly_value (val);
21606 HOST_WIDE_INT over_128 = width_source - 128;
21607 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
21611 /* Return true for types that could be supported as SIMD return or
21612 argument types. */
21614 static bool
21615 supported_simd_type (tree t)
21617 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21619 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21620 return s == 1 || s == 2 || s == 4 || s == 8;
21622 return false;
21625 /* Return true for types that currently are supported as SIMD return
21626 or argument types. */
21628 static bool
21629 currently_supported_simd_type (tree t, tree b)
21631 if (COMPLEX_FLOAT_TYPE_P (t))
21632 return false;
21634 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21635 return false;
21637 return supported_simd_type (t);
21640 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21642 static int
21643 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21644 struct cgraph_simd_clone *clonei,
21645 tree base_type, int num)
21647 tree t, ret_type, arg_type;
21648 unsigned int elt_bits, vec_bits, count;
21650 if (!TARGET_SIMD)
21651 return 0;
21653 if (clonei->simdlen
21654 && (clonei->simdlen < 2
21655 || clonei->simdlen > 1024
21656 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21658 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21659 "unsupported simdlen %d", clonei->simdlen);
21660 return 0;
21663 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21664 if (TREE_CODE (ret_type) != VOID_TYPE
21665 && !currently_supported_simd_type (ret_type, base_type))
21667 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21668 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21669 "GCC does not currently support mixed size types "
21670 "for %<simd%> functions");
21671 else if (supported_simd_type (ret_type))
21672 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21673 "GCC does not currently support return type %qT "
21674 "for %<simd%> functions", ret_type);
21675 else
21676 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21677 "unsupported return type %qT for %<simd%> functions",
21678 ret_type);
21679 return 0;
21682 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21684 arg_type = TREE_TYPE (t);
21686 if (!currently_supported_simd_type (arg_type, base_type))
21688 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21689 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21690 "GCC does not currently support mixed size types "
21691 "for %<simd%> functions");
21692 else
21693 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21694 "GCC does not currently support argument type %qT "
21695 "for %<simd%> functions", arg_type);
21696 return 0;
21700 clonei->vecsize_mangle = 'n';
21701 clonei->mask_mode = VOIDmode;
21702 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21703 if (clonei->simdlen == 0)
21705 count = 2;
21706 vec_bits = (num == 0 ? 64 : 128);
21707 clonei->simdlen = vec_bits / elt_bits;
21709 else
21711 count = 1;
21712 vec_bits = clonei->simdlen * elt_bits;
21713 if (vec_bits != 64 && vec_bits != 128)
21715 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21716 "GCC does not currently support simdlen %d for type %qT",
21717 clonei->simdlen, base_type);
21718 return 0;
21721 clonei->vecsize_int = vec_bits;
21722 clonei->vecsize_float = vec_bits;
21723 return count;
21726 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21728 static void
21729 aarch64_simd_clone_adjust (struct cgraph_node *node)
21731 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21732 use the correct ABI. */
21734 tree t = TREE_TYPE (node->decl);
21735 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21736 TYPE_ATTRIBUTES (t));
21739 /* Implement TARGET_SIMD_CLONE_USABLE. */
21741 static int
21742 aarch64_simd_clone_usable (struct cgraph_node *node)
21744 switch (node->simdclone->vecsize_mangle)
21746 case 'n':
21747 if (!TARGET_SIMD)
21748 return -1;
21749 return 0;
21750 default:
21751 gcc_unreachable ();
21755 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
21757 static int
21758 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21760 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21761 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21762 return 0;
21763 return 1;
21766 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
21768 static const char *
21769 aarch64_get_multilib_abi_name (void)
21771 if (TARGET_BIG_END)
21772 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21773 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21776 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
21777 global variable based guard use the default else
21778 return a null tree. */
21779 static tree
21780 aarch64_stack_protect_guard (void)
21782 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21783 return default_stack_protect_guard ();
21785 return NULL_TREE;
21788 /* Return the diagnostic message string if conversion from FROMTYPE to
21789 TOTYPE is not allowed, NULL otherwise. */
21791 static const char *
21792 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
21794 if (element_mode (fromtype) != element_mode (totype))
21796 /* Do no allow conversions to/from BFmode scalar types. */
21797 if (TYPE_MODE (fromtype) == BFmode)
21798 return N_("invalid conversion from type %<bfloat16_t%>");
21799 if (TYPE_MODE (totype) == BFmode)
21800 return N_("invalid conversion to type %<bfloat16_t%>");
21803 /* Conversion allowed. */
21804 return NULL;
21807 /* Return the diagnostic message string if the unary operation OP is
21808 not permitted on TYPE, NULL otherwise. */
21810 static const char *
21811 aarch64_invalid_unary_op (int op, const_tree type)
21813 /* Reject all single-operand operations on BFmode except for &. */
21814 if (element_mode (type) == BFmode && op != ADDR_EXPR)
21815 return N_("operation not permitted on type %<bfloat16_t%>");
21817 /* Operation allowed. */
21818 return NULL;
21821 /* Return the diagnostic message string if the binary operation OP is
21822 not permitted on TYPE1 and TYPE2, NULL otherwise. */
21824 static const char *
21825 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
21826 const_tree type2)
21828 /* Reject all 2-operand operations on BFmode. */
21829 if (element_mode (type1) == BFmode
21830 || element_mode (type2) == BFmode)
21831 return N_("operation not permitted on type %<bfloat16_t%>");
21833 /* Operation allowed. */
21834 return NULL;
21837 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21838 section at the end if needed. */
21839 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21840 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21841 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21842 void
21843 aarch64_file_end_indicate_exec_stack ()
21845 file_end_indicate_exec_stack ();
21847 unsigned feature_1_and = 0;
21848 if (aarch64_bti_enabled ())
21849 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
21851 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
21852 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
21854 if (feature_1_and)
21856 /* Generate .note.gnu.property section. */
21857 switch_to_section (get_section (".note.gnu.property",
21858 SECTION_NOTYPE, NULL));
21860 /* PT_NOTE header: namesz, descsz, type.
21861 namesz = 4 ("GNU\0")
21862 descsz = 16 (Size of the program property array)
21863 [(12 + padding) * Number of array elements]
21864 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
21865 assemble_align (POINTER_SIZE);
21866 assemble_integer (GEN_INT (4), 4, 32, 1);
21867 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
21868 assemble_integer (GEN_INT (5), 4, 32, 1);
21870 /* PT_NOTE name. */
21871 assemble_string ("GNU", 4);
21873 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
21874 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
21875 datasz = 4
21876 data = feature_1_and. */
21877 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
21878 assemble_integer (GEN_INT (4), 4, 32, 1);
21879 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
21881 /* Pad the size of the note to the required alignment. */
21882 assemble_align (POINTER_SIZE);
21885 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
21886 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
21887 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
21889 /* Target-specific selftests. */
21891 #if CHECKING_P
21893 namespace selftest {
21895 /* Selftest for the RTL loader.
21896 Verify that the RTL loader copes with a dump from
21897 print_rtx_function. This is essentially just a test that class
21898 function_reader can handle a real dump, but it also verifies
21899 that lookup_reg_by_dump_name correctly handles hard regs.
21900 The presence of hard reg names in the dump means that the test is
21901 target-specific, hence it is in this file. */
21903 static void
21904 aarch64_test_loading_full_dump ()
21906 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
21908 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
21910 rtx_insn *insn_1 = get_insn_by_uid (1);
21911 ASSERT_EQ (NOTE, GET_CODE (insn_1));
21913 rtx_insn *insn_15 = get_insn_by_uid (15);
21914 ASSERT_EQ (INSN, GET_CODE (insn_15));
21915 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
21917 /* Verify crtl->return_rtx. */
21918 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
21919 ASSERT_EQ (0, REGNO (crtl->return_rtx));
21920 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
21923 /* Run all target-specific selftests. */
21925 static void
21926 aarch64_run_selftests (void)
21928 aarch64_test_loading_full_dump ();
21931 } // namespace selftest
21933 #endif /* #if CHECKING_P */
21935 #undef TARGET_STACK_PROTECT_GUARD
21936 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21938 #undef TARGET_ADDRESS_COST
21939 #define TARGET_ADDRESS_COST aarch64_address_cost
21941 /* This hook will determines whether unnamed bitfields affect the alignment
21942 of the containing structure. The hook returns true if the structure
21943 should inherit the alignment requirements of an unnamed bitfield's
21944 type. */
21945 #undef TARGET_ALIGN_ANON_BITFIELD
21946 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21948 #undef TARGET_ASM_ALIGNED_DI_OP
21949 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21951 #undef TARGET_ASM_ALIGNED_HI_OP
21952 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21954 #undef TARGET_ASM_ALIGNED_SI_OP
21955 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21957 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21958 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21959 hook_bool_const_tree_hwi_hwi_const_tree_true
21961 #undef TARGET_ASM_FILE_START
21962 #define TARGET_ASM_FILE_START aarch64_start_file
21964 #undef TARGET_ASM_OUTPUT_MI_THUNK
21965 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21967 #undef TARGET_ASM_SELECT_RTX_SECTION
21968 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21970 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21971 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21973 #undef TARGET_BUILD_BUILTIN_VA_LIST
21974 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21976 #undef TARGET_CALLEE_COPIES
21977 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21979 #undef TARGET_CAN_ELIMINATE
21980 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21982 #undef TARGET_CAN_INLINE_P
21983 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21985 #undef TARGET_CANNOT_FORCE_CONST_MEM
21986 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21988 #undef TARGET_CASE_VALUES_THRESHOLD
21989 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21991 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21992 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21994 /* Only the least significant bit is used for initialization guard
21995 variables. */
21996 #undef TARGET_CXX_GUARD_MASK_BIT
21997 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21999 #undef TARGET_C_MODE_FOR_SUFFIX
22000 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22002 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22003 #undef TARGET_DEFAULT_TARGET_FLAGS
22004 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22005 #endif
22007 #undef TARGET_CLASS_MAX_NREGS
22008 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22010 #undef TARGET_BUILTIN_DECL
22011 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22013 #undef TARGET_BUILTIN_RECIPROCAL
22014 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22016 #undef TARGET_C_EXCESS_PRECISION
22017 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22019 #undef TARGET_EXPAND_BUILTIN
22020 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22022 #undef TARGET_EXPAND_BUILTIN_VA_START
22023 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22025 #undef TARGET_FOLD_BUILTIN
22026 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22028 #undef TARGET_FUNCTION_ARG
22029 #define TARGET_FUNCTION_ARG aarch64_function_arg
22031 #undef TARGET_FUNCTION_ARG_ADVANCE
22032 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22034 #undef TARGET_FUNCTION_ARG_BOUNDARY
22035 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22037 #undef TARGET_FUNCTION_ARG_PADDING
22038 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22040 #undef TARGET_GET_RAW_RESULT_MODE
22041 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22042 #undef TARGET_GET_RAW_ARG_MODE
22043 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22045 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22046 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22048 #undef TARGET_FUNCTION_VALUE
22049 #define TARGET_FUNCTION_VALUE aarch64_function_value
22051 #undef TARGET_FUNCTION_VALUE_REGNO_P
22052 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22054 #undef TARGET_GIMPLE_FOLD_BUILTIN
22055 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22057 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22058 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22060 #undef TARGET_INIT_BUILTINS
22061 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22063 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22064 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22065 aarch64_ira_change_pseudo_allocno_class
22067 #undef TARGET_LEGITIMATE_ADDRESS_P
22068 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22070 #undef TARGET_LEGITIMATE_CONSTANT_P
22071 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22073 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22074 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22075 aarch64_legitimize_address_displacement
22077 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22078 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22080 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22081 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22082 aarch64_libgcc_floating_mode_supported_p
22084 #undef TARGET_MANGLE_TYPE
22085 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22087 #undef TARGET_INVALID_CONVERSION
22088 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22090 #undef TARGET_INVALID_UNARY_OP
22091 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22093 #undef TARGET_INVALID_BINARY_OP
22094 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22096 #undef TARGET_VERIFY_TYPE_CONTEXT
22097 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22099 #undef TARGET_MEMORY_MOVE_COST
22100 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22102 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22103 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22105 #undef TARGET_MUST_PASS_IN_STACK
22106 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22108 /* This target hook should return true if accesses to volatile bitfields
22109 should use the narrowest mode possible. It should return false if these
22110 accesses should use the bitfield container type. */
22111 #undef TARGET_NARROW_VOLATILE_BITFIELD
22112 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22114 #undef TARGET_OPTION_OVERRIDE
22115 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22117 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22118 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22119 aarch64_override_options_after_change
22121 #undef TARGET_OPTION_SAVE
22122 #define TARGET_OPTION_SAVE aarch64_option_save
22124 #undef TARGET_OPTION_RESTORE
22125 #define TARGET_OPTION_RESTORE aarch64_option_restore
22127 #undef TARGET_OPTION_PRINT
22128 #define TARGET_OPTION_PRINT aarch64_option_print
22130 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22131 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22133 #undef TARGET_SET_CURRENT_FUNCTION
22134 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22136 #undef TARGET_PASS_BY_REFERENCE
22137 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22139 #undef TARGET_PREFERRED_RELOAD_CLASS
22140 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22142 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22143 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22145 #undef TARGET_PROMOTED_TYPE
22146 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22148 #undef TARGET_SECONDARY_RELOAD
22149 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22151 #undef TARGET_SHIFT_TRUNCATION_MASK
22152 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22154 #undef TARGET_SETUP_INCOMING_VARARGS
22155 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22157 #undef TARGET_STRUCT_VALUE_RTX
22158 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22160 #undef TARGET_REGISTER_MOVE_COST
22161 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22163 #undef TARGET_RETURN_IN_MEMORY
22164 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22166 #undef TARGET_RETURN_IN_MSB
22167 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22169 #undef TARGET_RTX_COSTS
22170 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22172 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22173 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22175 #undef TARGET_SCHED_ISSUE_RATE
22176 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22178 #undef TARGET_SCHED_VARIABLE_ISSUE
22179 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22181 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22182 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22183 aarch64_sched_first_cycle_multipass_dfa_lookahead
22185 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22186 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22187 aarch64_first_cycle_multipass_dfa_lookahead_guard
22189 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22190 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22191 aarch64_get_separate_components
22193 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22194 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22195 aarch64_components_for_bb
22197 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22198 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22199 aarch64_disqualify_components
22201 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22202 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22203 aarch64_emit_prologue_components
22205 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22206 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22207 aarch64_emit_epilogue_components
22209 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22210 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22211 aarch64_set_handled_components
22213 #undef TARGET_TRAMPOLINE_INIT
22214 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22216 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22217 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22219 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22220 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22222 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22223 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22225 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22226 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22227 aarch64_builtin_support_vector_misalignment
22229 #undef TARGET_ARRAY_MODE
22230 #define TARGET_ARRAY_MODE aarch64_array_mode
22232 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22233 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22235 #undef TARGET_VECTORIZE_ADD_STMT_COST
22236 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22238 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22239 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22240 aarch64_builtin_vectorization_cost
22242 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22243 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22245 #undef TARGET_VECTORIZE_BUILTINS
22246 #define TARGET_VECTORIZE_BUILTINS
22248 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22249 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22250 aarch64_builtin_vectorized_function
22252 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22253 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22254 aarch64_autovectorize_vector_modes
22256 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22257 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22258 aarch64_atomic_assign_expand_fenv
22260 /* Section anchor support. */
22262 #undef TARGET_MIN_ANCHOR_OFFSET
22263 #define TARGET_MIN_ANCHOR_OFFSET -256
22265 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22266 byte offset; we can do much more for larger data types, but have no way
22267 to determine the size of the access. We assume accesses are aligned. */
22268 #undef TARGET_MAX_ANCHOR_OFFSET
22269 #define TARGET_MAX_ANCHOR_OFFSET 4095
22271 #undef TARGET_VECTOR_ALIGNMENT
22272 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22274 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22275 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22276 aarch64_vectorize_preferred_vector_alignment
22277 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22278 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22279 aarch64_simd_vector_alignment_reachable
22281 /* vec_perm support. */
22283 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22284 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22285 aarch64_vectorize_vec_perm_const
22287 #undef TARGET_VECTORIZE_RELATED_MODE
22288 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22289 #undef TARGET_VECTORIZE_GET_MASK_MODE
22290 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22291 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22292 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22293 aarch64_empty_mask_is_expensive
22294 #undef TARGET_PREFERRED_ELSE_VALUE
22295 #define TARGET_PREFERRED_ELSE_VALUE \
22296 aarch64_preferred_else_value
22298 #undef TARGET_INIT_LIBFUNCS
22299 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22301 #undef TARGET_FIXED_CONDITION_CODE_REGS
22302 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22304 #undef TARGET_FLAGS_REGNUM
22305 #define TARGET_FLAGS_REGNUM CC_REGNUM
22307 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22308 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22310 #undef TARGET_ASAN_SHADOW_OFFSET
22311 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22313 #undef TARGET_LEGITIMIZE_ADDRESS
22314 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22316 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22317 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22319 #undef TARGET_CAN_USE_DOLOOP_P
22320 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22322 #undef TARGET_SCHED_ADJUST_PRIORITY
22323 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22325 #undef TARGET_SCHED_MACRO_FUSION_P
22326 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22328 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22329 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22331 #undef TARGET_SCHED_FUSION_PRIORITY
22332 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
22334 #undef TARGET_UNSPEC_MAY_TRAP_P
22335 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
22337 #undef TARGET_USE_PSEUDO_PIC_REG
22338 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
22340 #undef TARGET_PRINT_OPERAND
22341 #define TARGET_PRINT_OPERAND aarch64_print_operand
22343 #undef TARGET_PRINT_OPERAND_ADDRESS
22344 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
22346 #undef TARGET_OPTAB_SUPPORTED_P
22347 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
22349 #undef TARGET_OMIT_STRUCT_RETURN_REG
22350 #define TARGET_OMIT_STRUCT_RETURN_REG true
22352 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
22353 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
22354 aarch64_dwarf_poly_indeterminate_value
22356 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
22357 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
22358 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
22360 #undef TARGET_HARD_REGNO_NREGS
22361 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
22362 #undef TARGET_HARD_REGNO_MODE_OK
22363 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
22365 #undef TARGET_MODES_TIEABLE_P
22366 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
22368 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
22369 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
22370 aarch64_hard_regno_call_part_clobbered
22372 #undef TARGET_INSN_CALLEE_ABI
22373 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
22375 #undef TARGET_CONSTANT_ALIGNMENT
22376 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
22378 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
22379 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
22380 aarch64_stack_clash_protection_alloca_probe_range
22382 #undef TARGET_COMPUTE_PRESSURE_CLASSES
22383 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
22385 #undef TARGET_CAN_CHANGE_MODE_CLASS
22386 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
22388 #undef TARGET_SELECT_EARLY_REMAT_MODES
22389 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
22391 #undef TARGET_SPECULATION_SAFE_VALUE
22392 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
22394 #undef TARGET_ESTIMATED_POLY_VALUE
22395 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
22397 #undef TARGET_ATTRIBUTE_TABLE
22398 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
22400 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
22401 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
22402 aarch64_simd_clone_compute_vecsize_and_simdlen
22404 #undef TARGET_SIMD_CLONE_ADJUST
22405 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
22407 #undef TARGET_SIMD_CLONE_USABLE
22408 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
22410 #undef TARGET_COMP_TYPE_ATTRIBUTES
22411 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
22413 #undef TARGET_GET_MULTILIB_ABI_NAME
22414 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
22416 #undef TARGET_FNTYPE_ABI
22417 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
22419 #if CHECKING_P
22420 #undef TARGET_RUN_TARGET_SELFTESTS
22421 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
22422 #endif /* #if CHECKING_P */
22424 #undef TARGET_ASM_POST_CFI_STARTPROC
22425 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
22427 #undef TARGET_STRICT_ARGUMENT_NAMING
22428 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
22430 #undef TARGET_MD_ASM_ADJUST
22431 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
22433 struct gcc_target targetm = TARGET_INITIALIZER;
22435 #include "gt-aarch64.h"